In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt


In [104]:
#import the sets of titanic data
titanic_test = pd.read_csv('test.csv')
titanic_train = pd.read_csv('train.csv')

In [105]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [106]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [107]:
titanic_train.isnull().sum().sort_values(ascending=False).head()

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
dtype: int64

In [108]:
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [109]:
# Take a look at the outcome variable: 'Age'
print(titanic_train['Age'].value_counts())

24.00    30
22.00    27
18.00    26
28.00    25
19.00    25
         ..
55.50     1
74.00     1
0.92      1
70.50     1
12.00     1
Name: Age, Length: 88, dtype: int64


In [110]:
# Assign X as a DataFrame of features and y as a Series of the outcome variable
X = titanic_train.drop('Age', 1)
X = titanic_train.drop('Name', 1)
y = titanic_train.Age

In [111]:
print(X.head())

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch  \
0            1         0       3    male  22.0      1      0   
1            2         1       1  female  38.0      1      0   
2            3         1       3  female  26.0      0      0   
3            4         1       1  female  35.0      1      0   
4            5         0       3    male  35.0      0      0   

             Ticket     Fare Cabin Embarked  
0         A/5 21171   7.2500   NaN        S  
1          PC 17599  71.2833   C85        C  
2  STON/O2. 3101282   7.9250   NaN        S  
3            113803  53.1000  C123        S  
4            373450   8.0500   NaN        S  


In [112]:
print(y.head())

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64


In [114]:
# Decide which categorical variables to use in model
for col_name in X.columns:
    if X[col_name].dtypes == 'object':
        unique_cat = len(X[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))

Feature 'Sex' has 2 unique categories
Feature 'Ticket' has 681 unique categories
Feature 'Cabin' has 148 unique categories
Feature 'Embarked' has 4 unique categories


In [117]:
# Create a list of features to dummy
#todummy_list = ['PassengerId', 'Survived', 'Pclass', 'Ticket', 'SibSp', 'Fare', 'Sex',]
todummy_list = ['Sex', 'Cabin', 'Embarked']

In [118]:
# Function to dummy all the categorical variables used for modeling
def dummy_df(df, todummy_list):
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        df = df.drop(x, 1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [100]:
X = dummy_df(X, todummy_list)
print(X.head(5))

    Age  Parch  PassengerId_1  PassengerId_2  PassengerId_3  PassengerId_4  \
0  22.0      0              1              0              0              0   
1  38.0      0              0              1              0              0   
2  26.0      0              0              0              1              0   
3  35.0      0              0              0              0              1   
4  35.0      0              0              0              0              0   

   PassengerId_5  PassengerId_6  PassengerId_7  PassengerId_8  ...  \
0              0              0              0              0  ...   
1              0              0              0              0  ...   
2              0              0              0              0  ...   
3              0              0              0              0  ...   
4              1              0              0              0  ...   

   Cabin_F G73  Cabin_F2  Cabin_F33  Cabin_F38  Cabin_F4  Cabin_G6  Cabin_T  \
0            0         0       

In [134]:
X = titanic_train.drop(['Age', 'Cabin', 'Embarked', 'Name'], axis = 1)
X.isnull().sum().sort_values(ascending=False).head()

PassengerId    0
Survived       0
Pclass         0
Sex            0
SibSp          0
dtype: int64

In [135]:
# Import the train_test_split function 
from sklearn.model_selection import train_test_split
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

In [136]:
#creating a target object, and call it y
y = titanic_train.Age
#assign the input variable (features) to a X
feature_columns = ['PassengerId','Survived','Pclass', 'SibSp', 'Fare']
X = titanic_train[feature_columns]

In [137]:
# Specify Model
titanic_model = DecisionTreeRegressor()
# Fit Model
titanic_model.fit(X, y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').