In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv(r"G:\Data science\Titanic\Data\train.csv")
test_df = pd.read_csv(r"G:\Data science\Titanic\Data\test.csv")

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train_df.Cabin

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [6]:
train_df.Age = train_df.Age.fillna(np.mean(train_df.Age))

In [7]:
train_df.Embarked

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [8]:
train_df = train_df.drop(['Cabin'] , axis=1)

In [9]:
train_df = train_df.dropna()

In [10]:
train_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [12]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [13]:
train_df = train_df.drop(['Name']  , axis=1)

In [14]:
train_df = train_df.drop(['Ticket']  , axis=1)

In [15]:
train_df.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

### 1 Encoding

In [16]:
encoder = LabelEncoder()

In [17]:
train_df.Sex = encoder.fit_transform(train_df.Sex)

In [18]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,S
1,2,1,1,0,38.0,1,0,71.2833,C
2,3,1,3,0,26.0,0,0,7.925,S
3,4,1,1,0,35.0,1,0,53.1,S
4,5,0,3,1,35.0,0,0,8.05,S


In [19]:
train_df.Embarked = encoder.fit_transform(train_df.Embarked)

### 2 Features and Target

In [20]:
features = train_df.drop(['Survived'], axis=1)

In [21]:
features.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,7.25,2
1,2,1,0,38.0,1,0,71.2833,0
2,3,3,0,26.0,0,0,7.925,2
3,4,1,0,35.0,1,0,53.1,2
4,5,3,1,35.0,0,0,8.05,2


In [22]:
target = train_df.Survived

In [23]:
X_train, X_val, Y_train, Y_val = train_test_split(features, target)

In [24]:
X_val.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
281,282,3,1,28.0,0,0,7.8542,2
62,63,1,1,45.0,1,0,83.475,2
246,247,3,0,25.0,0,0,7.775,2
594,595,2,1,37.0,1,0,26.0,2
340,341,2,1,2.0,1,1,26.0,2


### 3 ML Model Training and Prediction

In [25]:
dt_model = DecisionTreeClassifier()

In [26]:
dt_model.fit(X_train, Y_train)

DecisionTreeClassifier()

In [27]:
result = dt_model.predict(X_val)

In [28]:
type(result)

numpy.ndarray

In [29]:
type(Y_val)

pandas.core.series.Series

In [30]:
Y_val

281    0
62     0
246    0
594    0
340    1
      ..
513    1
495    0
87     0
644    1
235    0
Name: Survived, Length: 223, dtype: int64

In [31]:
metrics.accuracy_score(np.array(Y_val), result)

0.757847533632287

In [32]:
metrics.f1_score(np.array(Y_val), result)

0.6966292134831461

In [33]:
metrics.precision_score(np.array(Y_val), result)

0.6458333333333334

In [34]:
metrics.recall_score(np.array(Y_val), result)

0.7560975609756098

### 4 Random Forest

In [35]:
rd_model = RandomForestClassifier(random_state= 123)

In [36]:
rd_model_hyp = RandomForestClassifier(n_estimators= 50, max_depth= 8,criterion="gini")

In [37]:
param = {"n_estimators": [25, 20, 30, 35, 40, 45, 50],
        "max_depth" : [5,6,7,8,10], "criterion": ['gini', 'entropy']}

In [38]:
param

{'n_estimators': [25, 20, 30, 35, 40, 45, 50],
 'max_depth': [5, 6, 7, 8, 10],
 'criterion': ['gini', 'entropy']}

In [39]:
gd_model = GridSearchCV(estimator= rd_model, param_grid= param, n_jobs= -1, cv=3)

In [40]:
gd_model.fit(X_train, Y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=123),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 6, 7, 8, 10],
                         'n_estimators': [25, 20, 30, 35, 40, 45, 50]})

In [41]:
gd_model.best_params_

{'criterion': 'gini', 'max_depth': 6, 'n_estimators': 40}

In [42]:
rd_model.fit(X_train, Y_train)

RandomForestClassifier(random_state=123)

In [43]:
rd_model_hyp.fit(X_train, Y_train)

RandomForestClassifier(max_depth=8, n_estimators=50)

In [44]:
result_fyp = rd_model_hyp.predict(X_val)

In [45]:
result2 = rd_model.predict(X_val)

In [46]:
metrics.accuracy_score(np.array(Y_val), result2)

0.8071748878923767

In [47]:
metrics.accuracy_score(np.array(Y_val), result_fyp)

0.8295964125560538

In [48]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [49]:
test_df = test_df.drop(['Name' , 'Ticket' , 'Cabin' ] , axis=1)

In [50]:
#train_df.head()

In [51]:
test_df.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [52]:
test_df.Age = test_df.Age.fillna(np.mean(test_df.Age))

In [53]:
test_df.Fare = test_df.Fare.fillna(5)

In [54]:
test_df.Sex = encoder.fit_transform(test_df.Sex)

In [55]:
test_df.Embarked = encoder.fit_transform(test_df.Embarked)

In [56]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,2
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,2
4,896,3,0,22.0,1,1,12.2875,2


In [57]:
rd_model = RandomForestClassifier()

In [58]:
rd_model.fit(features , target)

RandomForestClassifier()

In [63]:
Predictions= rd_model.predict(test_df)

In [65]:
submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':Predictions})

In [66]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'titanic_submission.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: titanic_submission.csv
