In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Load the labelled training set and the unlabelled test set from the
# working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
# Preview the first five raw training rows.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Sanity check: confirm that read_csv produced a DataFrame.
print(type(df_train))

<class 'pandas.core.frame.DataFrame'>


# Count missing values per column, listing the most affected columns first.
df_train.isnull().sum().sort_values(ascending=False)

In [5]:
def impute_age(cols):
    """Return a passenger's age, substituting a class-based default when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two-element container holding the age first and the passenger class
        second, e.g. one row of ``df[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    number
        The original age when it is present; otherwise 37 for class 1,
        29 for class 2 and 24 for any other class value.
    """
    # Read both fields by position. Integer keys on a label-indexed Series
    # (index 'Age', 'Pclass') are deprecated in pandas 2.x, so go through
    # .iloc when it exists; plain lists/tuples are indexed directly.
    positional = getattr(cols, 'iloc', cols)
    Age = positional[0]
    Pclass = positional[1]

    # A known age is passed through untouched.
    if not pd.isnull(Age):
        return Age

    # Missing age: fall back to the fixed per-class default.
    if Pclass == 1:
        return 37
    if Pclass == 2:
        return 29
    return 24

In [6]:
# Fill missing training ages row by row; each row is passed to impute_age
# with Age first and Pclass second.
df_train['Age'] = df_train[['Age','Pclass']].apply(impute_age,axis=1)

In [7]:
# Replace missing embarkation ports with 'S'.
# NOTE(review): presumably 'S' is the most frequent port — confirm against the data.
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [8]:
# Discard the Cabin column from the training frame.
df_train = df_train.drop(columns='Cabin')

In [9]:
# One-hot encode Sex and Embarked. drop_first=True omits the first category of
# each column, leaving the indicator columns 'male' and 'Q'/'S' seen in the
# preview of the encoded frame.
sex = pd.get_dummies(df_train['Sex'],drop_first=True)
embark = pd.get_dummies(df_train['Embarked'],drop_first=True)

In [10]:
# Remove the text columns: Sex and Embarked are replaced by their dummy
# encodings, Name and Ticket are not used as features.
df_train = df_train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'])

In [11]:
# Append the dummy columns side by side (axis=1) to the numeric training columns.
df_train = pd.concat([df_train,sex,embark],axis=1)

In [12]:
# Preview the encoded training frame (dummy columns male, Q, S appended).
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [13]:
# Hold out 10% of the labelled rows for validation; random_state=101 makes the
# split repeatable. Note that X_test/y_test here are a validation slice of
# train.csv, not the rows of df_test.
# NOTE(review): only 'Survived' is dropped, so PassengerId stays in the
# feature matrix — confirm that using an identifier as a feature is intended.
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['Survived'],axis=1), 
                                                    df_train['Survived'], test_size=0.10, 
                                                    random_state=101)

In [14]:
# Fit a 600-tree random forest on the training split.
# NOTE(review): no random_state is passed, so the fitted forest and the
# validation metrics can differ from run to run.
rf = RandomForestClassifier(n_estimators=600)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
# Predict survival for the held-out validation rows.
rf_pre = rf.predict(X_test)

In [16]:
# Confusion matrix on the validation rows: rows are true labels, columns are
# predicted labels (row totals match the per-class support in the report).
print(confusion_matrix(y_test, rf_pre))

[[48  3]
 [14 25]]


In [17]:
# Per-class precision, recall and F1 on the validation rows.
print(classification_report(y_test,rf_pre))

              precision    recall  f1-score   support

           0       0.77      0.94      0.85        51
           1       0.89      0.64      0.75        39

    accuracy                           0.81        90
   macro avg       0.83      0.79      0.80        90
weighted avg       0.83      0.81      0.80        90



In [18]:
# Discard the Cabin column from the test frame, as was done for training.
df_test = df_test.drop(columns='Cabin')

In [19]:
# Apply the same preprocessing to the test set that was applied to the
# training set, so both frames end up with the same feature columns.

# Fill missing ages with the per-class defaults.
df_test['Age'] = df_test[['Age','Pclass']].apply(impute_age,axis=1)

# Fill missing ports with 'S', mirroring the training-set step. Without this,
# a missing Embarked would be encoded as all-zero dummies, i.e. silently
# treated as the dropped first category.
df_test['Embarked'] = df_test['Embarked'].fillna('S')

# One-hot encode the categoricals, dropping the first category of each.
sex_test = pd.get_dummies(df_test['Sex'],drop_first=True)
embark_test= pd.get_dummies(df_test['Embarked'],drop_first=True)

# Swap the text columns for their encoded counterparts.
df_test.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
df_test = pd.concat([df_test,sex_test,embark_test],axis=1)

In [20]:
# Preview the encoded test frame; its columns should match the training
# features (everything except Survived).
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


In [21]:
# Re-create the classifier with 4000 trees for the final model. This rebinds
# `rf`, replacing the 600-tree model that was evaluated on the validation split.
# NOTE(review): no random_state is passed, so the predictions are not
# guaranteed to be identical between runs.
rf = RandomForestClassifier(n_estimators=4000)

In [22]:
# Train the final model on every labelled row (no hold-out this time).
rf.fit(df_train.drop(['Survived'],axis=1),df_train['Survived'] )

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
# Average fare over the training rows that have one, rounded to a whole number.
mean_fare = int(round(df_train['Fare'].dropna().mean()))
mean_fare

32

In [24]:
# Replace missing test fares with the rounded training mean.
df_test['Fare'] = df_test['Fare'].fillna(mean_fare)

In [25]:
# Count remaining missing values per column.
# NOTE(review): this inspects df_train, but the frame that was just modified
# and is about to be fed to predict() is df_test — checking df_test here was
# probably the intent.
df_train.isnull().sum().sort_values(ascending=False)

S              0
Q              0
male           0
Fare           0
Parch          0
SibSp          0
Age            0
Pclass         0
Survived       0
PassengerId    0
dtype: int64

In [26]:
# Predict survival for every row of the preprocessed test set.
test_prediction = rf.predict(df_test)

In [27]:
# Wrap the predictions in a one-column frame, line them up with the test rows
# by index, and keep only the identifier and the predicted label.
test_pred = pd.DataFrame({'Survived': test_prediction})
new_test = df_test.join(test_pred, how='inner')
df = new_test.loc[:, ['PassengerId', 'Survived']]
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


# Write the PassengerId/Survived pairs to predictions.csv, omitting the index.
df.to_csv('predictions.csv' , index=False)