In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("train.csv")
# df_test = pd.read_csv("test.csv")
#drop some columns in both training and testing dataset and call the funciton again
df = df.drop(['Cabin','Ticket'], axis = 1)
# # df_test = df_test.drop(['Cabin','Ticket'], axis = 1)
#replace null values in age with the median values
df["Age"].fillna(df["Age"].median(), inplace = True)
# # df_test["Age"].fillna(df_test["Age"].median(), inplace = True)
df["Embarked"].fillna("S",inplace = True)
# # df_test["Fare"].fillna(df_test["Fare"].median(),inplace = True)

In [3]:
total_survived_females = df[df.Sex == "female"]['Survived'].sum()
total_survived_males = df[df.Sex == "male"]["Survived"].sum()
print("Total no. of people survived : ", total_survived_females + total_survived_males)
print("Proportion of Females survived out of total survived people: ", (total_survived_females / (total_survived_females + total_survived_males)))
print("Proportion of Males survived out of total survived people: ", (total_survived_males / (total_survived_females + total_survived_males)))

Total no. of people survived :  342
Proportion of Females survived out of total survived people:  0.6812865497076024
Proportion of Males survived out of total survived people:  0.31871345029239767


In [4]:
#converting "Sex" and "Embarked" columns into Numerical values 
df.Sex[df.Sex == 'male'] = 0
df.Sex[df.Sex == 'female'] = 1
df['Sex'] = df['Sex'].astype('int64')

df.loc[df["Embarked"] == "S", "Embarked"] = 0
df.loc[df["Embarked"] == "C", "Embarked"] = 1
df.loc[df["Embarked"] == "Q", "Embarked"] = 2
df['Embarked'] = df['Embarked'].astype('int64')

# df_test.loc[df_test["Sex"] == "male", "Sex"] = 0
# # df_test.loc[df_test["Sex"] == "female", "Sex"] = 1
# # df_test['Sex'] = df_test['Sex'].astype('int64')

# # df_test.loc[df_test["Embarked"] == "S", "Embarked"] = 0
# # df_test.loc[df_test["Embarked"] == "C", "Embarked"] = 1
# # df_test.loc[df_test["Embarked"] == "Q", "Embarked"] = 2
# # df_test['Embarked'] = df_test['Embarked'].astype('int64')

In [5]:
#doing some more feature engineering 
df["FamSize"] = df["SibSp"] + df["Parch"] + 1
# df_test["FamSize"] = df_test["SibSp"] + df_test["Parch"] + 1
df["IsAlone"] = df.FamSize.apply(lambda x: 1 if x == 1 else 0)
# # df_test["IsAlone"] = df_test.FamSize.apply(lambda x: 1 if x == 1 else 0)
#Encoding values of Name , 
for name in df["Name"]:
    df["Title"] = df["Name"].str.extract("([A-Za-z]+)\.",expand=True)
    
# for name in df_test["Name"]:
    # # df_test["Title"] = df_test["Name"].str.extract("([A-Za-z]+)\.",expand=True)
    
title_replacements = {"Mlle": "Other", "Major": "Other", "Col": "Other", "Sir": "Other", "Don": "Other", "Mme": "Other",
          "Jonkheer": "Other", "Lady": "Other", "Capt": "Other", "Countess": "Other", "Ms": "Other", "Dona": "Other", "Rev": "Other", "Dr": "Other"}

df.replace({"Title": title_replacements}, inplace=True)
# df_test.replace({"Title": title_replacements}, inplace=True)

df.loc[df["Title"] == "Miss", "Title"] = 0
df.loc[df["Title"] == "Mr", "Title"] = 1
df.loc[df["Title"] == "Mrs", "Title"] = 2
df.loc[df["Title"] == "Master", "Title"] = 3
df.loc[df["Title"] == "Other", "Title"] = 4
df['Title'] = df['Title'].astype('int64')


# # df_test.loc[df_test["Title"] == "Miss", "Title"] = 0
# # df_test.loc[df_test["Title"] == "Mr", "Title"] = 1
# # df_test.loc[df_test["Title"] == "Mrs", "Title"] = 2
# # df_test.loc[df_test["Title"] == "Master", "Title"] = 3
# # df_test.loc[df_test["Title"] == "Other", "Title"] = 4
# # df_test['Title'] = df_test['Title'].astype('int64')

print(pd.isnull(df).sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
FamSize        0
IsAlone        0
Title          0
dtype: int64


In [16]:
df['IsAlone'].value_counts()

1    537
0    354
Name: IsAlone, dtype: int64

In [6]:
df.head(3)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamSize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,2,0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,1,1,0


In [13]:
df['Age'].dtype

dtype('float64')

In [7]:
from sklearn.model_selection import train_test_split 
features = ["Pclass", "Sex", "Age" , "Fare", "Embarked", "Title","FamSize", "IsAlone"]
x = df[features]
y = df['Survived']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state=42)

In [8]:
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
# model = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
model = RandomForestClassifier()
model.fit(xtrain, ytrain)
pred_xtest = model.predict(xtest)
print("Accuracy Score: ",accuracy_score(ytest, pred_xtest))

Accuracy Score:  0.8547486033519553


#### Improving RandomForestClassifier Accuracy 

In [10]:
from sklearn.model_selection import GridSearchCV
rfc = RandomForestClassifier()
parameters = {"n_estimators": [4,8,15], 
              "criterion": ["gini", "entropy"],
              "max_features": ["sqrt", "log2"], 
              "max_depth": [3,10], 
              "min_samples_split": [5, 10],
              "min_samples_leaf": [5,10]
             }

grid_cv = GridSearchCV(rfc, parameters, scoring = make_scorer(accuracy_score))
grid_cv = grid_cv.fit(xtrain, ytrain)
print("Best parameters for RandomForestClassifier is:")
print(grid_cv.best_estimator_)

Best parameters for RandomForestClassifier is:
RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=10,
                       min_samples_split=5, n_estimators=4)


In [11]:
rfc = grid_cv.best_estimator_
rfc.fit(xtrain, ytrain)
good_test_pred = rfc.predict(xtest)
print("This is accuracy score of improved rfc: ", accuracy_score(ytest, good_test_pred))

This is accuracy score of improved rfc:  0.7877094972067039


#### Ignoring improved rfc, using old rfc (coz of better accuracy)

In [12]:
import pickle 
pickle.dump(model,open('model.pkl','wb'))