In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('../../../datasets/titanic/train.csv')
df_test = pd.read_csv('../../../datasets/titanic/test.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
selected_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] 

In [6]:
df_train[selected_columns].isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [7]:
df_test[selected_columns].isna().sum()

Pclass     0
Age       86
SibSp      0
Parch      0
Fare       1
dtype: int64

In [11]:
df_train['Age'].mean()

29.69911764705882

In [12]:
df_train['Age'].median()

28.0

# Fill missing data

In [13]:
df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
df_test['Age'].fillna(df_train['Age'].median(), inplace=True)

df_train['Fare'].fillna(df_train['Fare'].median(), inplace=True)
df_test['Fare'].fillna(df_train['Fare'].median(), inplace=True)

df_train['Embarked'].fillna(df_train['Embarked'].mode().iloc[0], inplace=True)
df_test['Embarked'].fillna(df_train['Embarked'].mode().iloc[0], inplace=True)

df_train['Sex'].fillna(df_train['Sex'].mode().iloc[0], inplace=True)
df_test['Sex'].fillna(df_train['Sex'].mode().iloc[0], inplace=True)


# Feature Encoding

In [14]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [15]:
df_train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [16]:
df_train['Sex_enc'] = df_train['Sex'].map({'male':0, 'female':1})
df_test['Sex_enc'] = df_test['Sex'].map({'male':0, 'female':1})

df_train['Embarked_enc'] = df_train['Embarked'].map({'S':0, 'C':1, 'Q':2})
df_test['Embarked_enc'] = df_test['Embarked'].map({'S':0, 'C':1, 'Q':2})

In [18]:
selected_columns.append('Sex_enc')
selected_columns.append('Embarked_enc')

In [19]:
selected_columns

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_enc', 'Embarked_enc']

# Lets try for is_cabin

In [35]:
df_train['is_cabin'] = df_train['Cabin'].isna().apply(lambda X:int(not X))
df_test['is_cabin'] = df_test['Cabin'].isna().apply(lambda X:int(not X))

In [56]:
selected_columns.append('is_cabin')

# Lets try for name

In [45]:
df_train['Title'] = df_train['Name'].apply(lambda name:name.split('.')[0].split(' ')[1])
df_test['Title'] = df_test['Name'].apply(lambda name:name.split('.')[0].split(' ')[1])

In [51]:
titles = list(df_train['Title'].value_counts().iloc[:4].index)
titles

['Mr', 'Miss', 'Mrs', 'Master']

In [52]:
df_train['Title'] = df_train['Title'].apply(lambda X:X if X in titles else 'other')
df_test['Title'] = df_test['Title'].apply(lambda X:X if X in titles else 'other')

In [53]:
df_train['Title'].value_counts()

Mr        502
Miss      179
Mrs       121
other      49
Master     40
Name: Title, dtype: int64

In [54]:
df_train['Title_enc'] = df_train['Title'].map({'Mr':0, 'Miss':1, 'Mrs':2, 'other':3, 'Master':4})
df_test['Title_enc'] = df_test['Title'].map({'Mr':0, 'Miss':1, 'Mrs':2, 'other':3, 'Master':4})

In [57]:
selected_columns.append('Title_enc')

# Model Bulding

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [59]:
params = {'n_estimators':[25,50,75,100,125,150,175,200],
          'max_depth':[1,2,3,4,5],
          'min_samples_leaf':[2,4,6,8]}
random_cv = RandomizedSearchCV(RandomForestClassifier(), params, cv=5)
random_cv.fit(df_train[selected_columns], df_train['Survived'])

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5],
                                        'min_samples_leaf': [2, 4, 6, 8],
                                        'n_estimators': [25, 50, 75, 100, 125,
                                                         150, 175, 200]})

In [60]:
random_cv.best_estimator_

RandomForestClassifier(max_depth=4, min_samples_leaf=4, n_estimators=50)

In [61]:
model = random_cv.best_estimator_

In [62]:
yp = model.predict(df_test[selected_columns])

In [63]:
df_test['Survived'] = yp

In [64]:
df_test[['PassengerId','Survived']].to_csv('sub4.csv', index=False)