In [30]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [31]:

df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [32]:
def age_append(cols):
    age = cols[0]
    pclass= cols[1]
    
    
    if pd.isnull(age):
        if pclass ==1:
            return 38.23
        if pclass == 2:
            return 29.87
        if pclass == 3:
            return 25.14
    else:
        return age

In [33]:
def fam_add(cols):
    SibSp = cols[0] 
    Parch = cols[1]
    return (SibSp + Parch)

In [34]:
def transformdf(df):
    df['age'] = df[['Age','Pclass']].apply(age_append, axis = 1)
    df.drop(['Age'], inplace = True, axis = 1)
    dummyem = pd.get_dummies(df['Embarked'], drop_first= True)
    df = pd.concat([df, dummyem], axis = 1)
   

    df['family'] = df[['SibSp', 'Parch']].apply(fam_add, axis = 1)
    df.drop('Cabin', axis =1, inplace = True)
    mf = pd.get_dummies(df['Sex'], drop_first= True )
    df = pd.concat([df, mf], axis = 1)
    df.drop(['Embarked','Name','Ticket','PassengerId','Sex'], inplace = True, axis = 1)
    
    return df

In [35]:
df = transformdf(df)

In [36]:
df

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,age,Q,S,family,male
0,0,3,1,0,7.2500,22.00,0,1,1,1
1,1,1,1,0,71.2833,38.00,0,0,1,0
2,1,3,0,0,7.9250,26.00,0,1,0,0
3,1,1,1,0,53.1000,35.00,0,1,1,0
4,0,3,0,0,8.0500,35.00,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,27.00,0,1,0,1
887,1,1,0,0,30.0000,19.00,0,1,0,0
888,0,3,1,2,23.4500,25.14,0,1,3,0
889,1,1,0,0,30.0000,26.00,0,0,0,1


In [107]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10)

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
X = df.drop(['Survived'], axis = 1)
y = df['Survived']

In [110]:
 X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [111]:
X_train

Unnamed: 0,Pclass,SibSp,Parch,Fare,age,Q,S,family,male
6,1,0,0,51.8625,54.00,0,1,0,1
718,3,0,0,15.5000,25.14,1,0,0,1
685,2,1,2,41.5792,25.00,0,0,3,1
73,3,1,0,14.4542,26.00,0,0,1,1
882,3,0,0,10.5167,22.00,0,1,0,0
...,...,...,...,...,...,...,...,...,...
106,3,0,0,7.6500,21.00,0,1,0,0
270,1,0,0,31.0000,38.23,0,1,0,1
860,3,2,0,14.1083,41.00,0,1,2,1
435,1,1,2,120.0000,14.00,0,1,3,0


In [112]:
rfc.fit(X_train,y_train)

RandomForestClassifier(n_estimators=10)

In [113]:
pred = rfc.predict(X_test)

In [114]:
from sklearn.metrics import classification_report, confusion_matrix

In [115]:
print(confusion_matrix(y_test,pred))

[[154  21]
 [ 34  86]]


In [116]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       175
           1       0.80      0.72      0.76       120

    accuracy                           0.81       295
   macro avg       0.81      0.80      0.80       295
weighted avg       0.81      0.81      0.81       295



In [50]:
from sklearn.model_selection import GridSearchCV

In [52]:
params = {'n_estimators':(600,1000,200)}

In [53]:
clf = GridSearchCV(rfc, params)

In [54]:
clf.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(n_estimators=600),
             param_grid={'n_estimators': (600, 1000, 200)})

In [59]:
clf.cv_results_

{'mean_fit_time': array([0.68268728, 1.12827921, 0.23120022]),
 'std_fit_time': array([0.05125543, 0.0476313 , 0.00960044]),
 'mean_score_time': array([0.04504175, 0.06959143, 0.01602006]),
 'std_score_time': array([0.00444515, 0.00196737, 0.00253033]),
 'param_n_estimators': masked_array(data=[600, 1000, 200],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 600},
  {'n_estimators': 1000},
  {'n_estimators': 200}],
 'split0_test_score': array([0.80833333, 0.81666667, 0.81666667]),
 'split1_test_score': array([0.74789916, 0.73109244, 0.73109244]),
 'split2_test_score': array([0.78151261, 0.77310924, 0.75630252]),
 'split3_test_score': array([0.81512605, 0.79831933, 0.82352941]),
 'split4_test_score': array([0.83193277, 0.83193277, 0.82352941]),
 'mean_test_score': array([0.79696078, 0.79022409, 0.79022409]),
 'std_test_score': array([0.02942122, 0.03547908, 0.03889711]),
 'rank_test_score': array([1, 2, 3])}

In [60]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 600,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [117]:
test = pd.read_csv('test.csv')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [119]:
test = transformdf(test)

In [120]:
test

Unnamed: 0,Pclass,SibSp,Parch,Fare,age,Q,S,family,male
0,3,0,0,7.8292,34.50,1,0,0,1
1,3,1,0,7.0000,47.00,0,1,1,0
2,2,0,0,9.6875,62.00,1,0,0,1
3,3,0,0,8.6625,27.00,0,1,0,1
4,3,1,1,12.2875,22.00,0,1,2,0
...,...,...,...,...,...,...,...,...,...
413,3,0,0,8.0500,25.14,0,1,0,1
414,1,0,0,108.9000,39.00,0,0,0,0
415,3,0,0,7.2500,38.50,0,1,0,1
416,3,0,0,8.0500,25.14,0,1,0,1


In [138]:
predictions = rfc.predict(test.fillna(0))

In [139]:
test2 = pd.read_csv('test.csv')

In [140]:
pred2= pd.DataFrame(predictions, columns = ['Survived'])

In [141]:
answer = pd.concat([test2['PassengerId'],pred2], axis = 1)

In [146]:
answer.set_index('PassengerId')

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [144]:
answer

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [148]:
answer.to_csv('answertitanic.csv', index=False)