In [38]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Figures inline and set visualization style
%matplotlib inline
# sns.set()


In [39]:
# Import data
df_train = pd.read_csv('/home/gajendra/Github/MachineLearning_intro/Titanic_data/train.csv')
df_test = pd.read_csv('/home/gajendra/Github/MachineLearning_intro/Titanic_data/test.csv')

In [40]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [41]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [42]:
# Store target variable of training data in a safe place
survived_train = df_train.Survived

# Concatenate training and test sets
data = pd.concat([df_train.drop(['Survived'], axis=1), df_test])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


In [43]:
# Impute missing numerical variables
data['Age'] = data.Age.fillna(data.Age.median())
data['Fare'] = data.Fare.fillna(data.Fare.median())

# Check out info of data
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


In [44]:
data=data.drop("Cabin", axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 112.5+ KB


In [45]:
object_columns_df = data.select_dtypes(include=['object'])
print(object_columns_df.iloc[0])


Name        Braund, Mr. Owen Harris
Sex                            male
Ticket                    A/5 21171
Embarked                          S
Name: 0, dtype: object


In [46]:
object_columns_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 4 columns):
Name        1309 non-null object
Sex         1309 non-null object
Ticket      1309 non-null object
Embarked    1307 non-null object
dtypes: object(4)
memory usage: 51.1+ KB


In [47]:
cols = ['Name', 'Sex','Ticket', 'Embarked']
for name in cols:
    print(name,':')
    print(object_columns_df[name].value_counts(),'\n')


Name :
Kelly, Mr. James                                           2
Connolly, Miss. Kate                                       2
Johnson, Mr. Alfred                                        1
Moutal, Mr. Rahamin Haim                                   1
Hays, Miss. Margaret Bechstein                             1
Kalvik, Mr. Johannes Halvorsen                             1
Silverthorne, Mr. Spencer Victor                           1
Hassab, Mr. Hammad                                         1
Lennon, Mr. Denis                                          1
Fischer, Mr. Eberhard Thelander                            1
McCarthy, Miss. Catherine Katie""                          1
Baclini, Miss. Eugenie                                     1
Smith, Mr. James Clinch                                    1
Becker, Miss. Ruth Elizabeth                               1
Rheims, Mr. George Alexander Lucien                        1
Markun, Mr. Johann                                         1
Coutts, Master. E

In [48]:
drop_cols=['Name', 'Ticket']
data=data.drop(drop_cols, axis=1)

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Fare           1309 non-null float64
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 92.0+ KB


In [50]:
nominal_columns = ["Embarked", "Sex"]
dummy_df = pd.get_dummies(data[nominal_columns])
data = pd.concat([data, dummy_df], axis=1)
data = data.drop(nominal_columns, axis=1)

In [51]:
data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,3,22.0,1,0,7.25,0,0,1,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0
2,3,3,26.0,0,0,7.925,0,0,1,1,0
3,4,1,35.0,1,0,53.1,0,0,1,1,0
4,5,3,35.0,0,0,8.05,0,0,1,0,1


In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Fare           1309 non-null float64
Embarked_C     1309 non-null uint8
Embarked_Q     1309 non-null uint8
Embarked_S     1309 non-null uint8
Sex_female     1309 non-null uint8
Sex_male       1309 non-null uint8
dtypes: float64(2), int64(4), uint8(5)
memory usage: 78.0 KB


In [53]:
data=data.drop(['Sex_female'], axis=1)

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Fare           1309 non-null float64
Embarked_C     1309 non-null uint8
Embarked_Q     1309 non-null uint8
Embarked_S     1309 non-null uint8
Sex_male       1309 non-null uint8
dtypes: float64(2), int64(4), uint8(4)
memory usage: 76.7 KB


In [55]:
for name in data:
    print(name,':')
    print(data[name].value_counts(),'\n')

PassengerId :
1309    1
449     1
431     1
432     1
433     1
434     1
435     1
436     1
437     1
438     1
439     1
440     1
441     1
442     1
443     1
444     1
445     1
446     1
447     1
430     1
429     1
428     1
418     1
411     1
412     1
413     1
414     1
415     1
416     1
417     1
       ..
890     1
880     1
891     1
892     1
893     1
894     1
895     1
896     1
897     1
881     1
879     1
861     1
869     1
862     1
863     1
864     1
865     1
866     1
867     1
868     1
870     1
878     1
871     1
872     1
873     1
874     1
875     1
876     1
877     1
1       1
Name: PassengerId, Length: 1309, dtype: int64 

Pclass :
3    709
1    323
2    277
Name: Pclass, dtype: int64 

Age :
28.00    295
24.00     47
22.00     43
21.00     41
30.00     40
18.00     39
25.00     34
36.00     31
26.00     30
29.00     30
27.00     30
19.00     29
23.00     26
32.00     24
20.00     23
35.00     23
31.00     23
33.00     21
45.00     21
17.00     

In [56]:
data=data.drop(['PassengerId'], axis=1)

In [57]:
data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,3,22.0,1,0,7.25,0,0,1,1
1,1,38.0,1,0,71.2833,1,0,0,0
2,3,26.0,0,0,7.925,0,0,1,0
3,1,35.0,1,0,53.1,0,0,1,0
4,3,35.0,0,0,8.05,0,0,1,1


In [58]:
data_train = data.iloc[:891]
data_test = data.iloc[891:]

In [59]:
X = data_train.values
test = data_test.values
y = survived_train.values


In [60]:
# Instantiate model and fit to data
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [61]:
# Make predictions and store in 'Survived' column of df_test
Y_pred = clf.predict(test)
df_test['Survived'] = Y_pred

In [62]:
df_test[['PassengerId', 'Survived']].to_csv('/home/gajendra/Github/MachineLearning_intro/Titanic_data/predictions/2nd_solution.csv', index=False)