### Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("/Users/rohithsiddi/Desktop/Titanic Project/train.csv")

### Analysing data and removing unnecessary features

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.pop("PassengerId")
df.pop("Ticket")
df.pop("Name")
df.pop("Cabin")

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [7]:
survived_col=df.pop("Survived")

In [8]:
df.insert(7,"Survived",survived_col)

In [9]:
df["Parch"].unique()

array([0, 1, 2, 5, 3, 4, 6])

### Dealing with Missing Data

In [10]:
df.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [11]:
df[df["Embarked"].isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
61,1,female,38.0,0,0,80.0,,1
829,1,female,62.0,0,0,80.0,,1


In [12]:
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [13]:
df["Embarked"].mode()

0    S
Name: Embarked, dtype: object

In [14]:
df["Embarked"].fillna("S",inplace=True)

In [15]:
df[df["Age"].isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
5,3,male,,0,0,8.4583,Q,0
17,2,male,,0,0,13.0000,S,1
19,3,female,,0,0,7.2250,C,1
26,3,male,,0,0,7.2250,C,0
28,3,female,,0,0,7.8792,Q,1
...,...,...,...,...,...,...,...,...
859,3,male,,0,0,7.2292,C,0
863,3,female,,8,2,69.5500,S,0
868,3,male,,0,0,9.5000,S,0
878,3,male,,0,0,7.8958,S,0


In [16]:
df["Age"].mean()

29.69911764705882

In [17]:
df["Age"].fillna(30,inplace=True)

In [18]:
df.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [19]:
df.corr()["Survived"].sort_values()

Pclass     -0.338481
Age        -0.070657
SibSp      -0.035322
Parch       0.081629
Fare        0.257307
Survived    1.000000
Name: Survived, dtype: float64

### Splitting Data

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X=df.drop("Survived",axis=1)

In [22]:
y=df["Survived"]

In [23]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 442 to 182
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Sex       623 non-null    object 
 2   Age       623 non-null    float64
 3   SibSp     623 non-null    int64  
 4   Parch     623 non-null    int64  
 5   Fare      623 non-null    float64
 6   Embarked  623 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 38.9+ KB


### Encoding Cat Features

In [25]:
X_train["Sex"]=pd.get_dummies(X_train["Sex"],drop_first=True)
X_test["Sex"]=pd.get_dummies(X_test["Sex"],drop_first=True)

In [26]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 442 to 182
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Sex       623 non-null    uint8  
 2   Age       623 non-null    float64
 3   SibSp     623 non-null    int64  
 4   Parch     623 non-null    int64  
 5   Fare      623 non-null    float64
 6   Embarked  623 non-null    object 
dtypes: float64(2), int64(3), object(1), uint8(1)
memory usage: 34.7+ KB


In [27]:
X_train_Embarked=pd.get_dummies(X_train["Embarked"],drop_first=True)
X_test_Embarked=pd.get_dummies(X_test["Embarked"],drop_first=True)

In [28]:
final_X_train=pd.concat([X_train,X_train_Embarked],axis=1)

In [29]:
final_X_test=pd.concat([X_test,X_test_Embarked],axis=1)

In [30]:
final_X_train.pop("Embarked")
final_X_test.pop("Embarked")

35     S
674    S
841    S
220    S
160    S
      ..
662    S
484    C
161    S
434    S
212    S
Name: Embarked, Length: 268, dtype: object

### Feature Scaling

In [31]:
final_X_train["Pclass"]=final_X_train["Pclass"].astype(str)

In [32]:
final_X_test["Pclass"]=final_X_test["Pclass"].astype(str)

In [33]:
X_train_Pclass=pd.get_dummies(X_train["Pclass"],drop_first=True)
X_test_Pclass=pd.get_dummies(X_test["Pclass"],drop_first=True)

In [34]:
final_X_train=pd.concat([final_X_train,X_train_Pclass],axis=1)
final_X_test=pd.concat([final_X_test,X_test_Pclass],axis=1)

In [35]:
final_X_train.pop("Pclass")
final_X_test.pop("Pclass")

35     1
674    2
841    2
220    3
160    3
      ..
662    1
484    1
161    2
434    1
212    3
Name: Pclass, Length: 268, dtype: object

In [36]:
scale_cols=["Age","SibSp","Parch","Fare"]

In [37]:
scale_final_X_train=final_X_train[scale_cols]

In [38]:
scale_final_X_test=final_X_test[scale_cols]

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
ss=StandardScaler()

In [41]:
scale_final_X_train=pd.DataFrame(ss.fit_transform(scale_final_X_train))
scale_final_X_train.columns=scale_cols
scale_final_X_train.index=final_X_train.index

In [42]:
final_X_train.pop("Age")
final_X_train.pop("SibSp")
final_X_train.pop("Parch")
final_X_train.pop("Fare")

442      7.7750
621     52.5542
372      8.0500
620     14.4542
497     15.1000
         ...   
266     39.6875
417     13.0000
337    134.5000
790      7.7500
182     31.3875
Name: Fare, Length: 623, dtype: float64

In [43]:
final_X_train=pd.concat([final_X_train,scale_final_X_train],axis=1)

In [44]:
scale_final_X_test=pd.DataFrame(ss.transform(scale_final_X_test))
scale_final_X_test.columns=scale_cols
scale_final_X_test.index=final_X_test.index

In [45]:
final_X_test.pop("Age")
final_X_test.pop("SibSp")
final_X_test.pop("Parch")
final_X_test.pop("Fare")

35     52.0000
674     0.0000
841    10.5000
220     8.0500
160    16.1000
        ...   
662    25.5875
484    91.0792
161    15.7500
434    55.9000
212     7.2500
Name: Fare, Length: 268, dtype: float64

In [46]:
final_X_test=pd.concat([final_X_test,scale_final_X_test],axis=1)

## Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rfc=RandomForestClassifier()

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
final_X_train.shape

(623, 9)

In [51]:
n_estimators=[50,75,100,128,150,200,250]
max_features= [1,2,3,4,5,6,7]
bootstrap = [True]

In [52]:
param_grid = {'n_estimators':n_estimators,
             'max_features':max_features,
             'bootstrap':bootstrap}  

In [53]:
grid = GridSearchCV(rfc,param_grid)

In [54]:
grid.fit(final_X_train,y_train)

In [55]:
grid.best_params_

{'bootstrap': True, 'max_features': 4, 'n_estimators': 75}

In [56]:
predictions_grid=grid.predict(final_X_test)

In [57]:
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,accuracy_score

In [58]:
accuracy_score(predictions_grid,y_test)

0.8246268656716418

## Logistic Regression

In [59]:
from sklearn.linear_model import LogisticRegression

In [60]:
lr=LogisticRegression()

In [61]:
lr.fit(final_X_train,y_train)

In [62]:
predictions_lr=lr.predict(final_X_test)

In [63]:
accuracy_score(predictions_lr,y_test)

0.8208955223880597

## KNN Classifier

In [64]:
from sklearn.neighbors import KNeighborsClassifier

In [65]:
k_values=list(range(1,20))

In [66]:
knn=KNeighborsClassifier()

In [67]:
param_grid={'n_neighbors': k_values}

In [68]:
knn=GridSearchCV(knn,param_grid,cv=5,scoring="accuracy")

In [69]:
knn.fit(final_X_train,y_train)

In [70]:
predictions_knn=knn.predict(final_X_test)

In [71]:
accuracy_score(predictions_knn,y_test)

0.8208955223880597

## SVM

In [72]:
from sklearn.svm import SVC

In [73]:
svc=SVC()

In [74]:
    param_grid={'kernel':['linear', 'poly', 'rbf', 'sigmoid'],"C":[0.5,0.75,1,1.5,2,2.5,3,4,5]}

In [75]:
svc=GridSearchCV(svc,param_grid,cv=5)

In [76]:
svc.fit(final_X_train,y_train)

In [77]:
svc.best_params_

{'C': 5, 'kernel': 'rbf'}

In [78]:
predictions=svc.predict(final_X_test)

In [79]:
accuracy_score(predictions,y_test)

0.8395522388059702

## Test Data

In [80]:
test=pd.read_csv("/Users/rohithsiddi/Desktop/Titanic Project/test.csv")

In [81]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [82]:
test.pop("PassengerId")
test.pop("Name")
test.pop("Cabin")
test.pop("Ticket")

0                  330911
1                  363272
2                  240276
3                  315154
4                 3101298
              ...        
413             A.5. 3236
414              PC 17758
415    SOTON/O.Q. 3101262
416                359309
417                  2668
Name: Ticket, Length: 418, dtype: object

In [83]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S
414,1,female,39.0,0,0,108.9000,C
415,3,male,38.5,0,0,7.2500,S
416,3,male,,0,0,8.0500,S


In [84]:
test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [85]:
test["Age"].fillna(30,inplace=True)

In [86]:
test[test["Fare"].isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
152,3,male,60.5,0,0,,S


In [87]:
test["Fare"].groupby(test["Pclass"]).mean()

Pclass
1    94.280297
2    22.202104
3    12.459678
Name: Fare, dtype: float64

In [88]:
test["Fare"].fillna(12,inplace=True)

In [89]:
test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [90]:
test["Sex"]=pd.get_dummies(test["Sex"],drop_first=True)

In [91]:
test_Embarked=pd.get_dummies(test["Embarked"],drop_first=True)

In [92]:
final_test=pd.concat([test,test_Embarked],axis=1)

In [93]:
final_test.pop("Embarked")

0      Q
1      S
2      Q
3      S
4      S
      ..
413    S
414    C
415    S
416    S
417    C
Name: Embarked, Length: 418, dtype: object

In [94]:
final_test["Pclass"]=final_test["Pclass"].astype(str)

In [95]:
test_Pclass=pd.get_dummies(test["Pclass"],drop_first=True)

In [96]:
final_test=pd.concat([final_test,test_Pclass],axis=1)

In [97]:
final_test.pop("Pclass")

0      3
1      3
2      2
3      3
4      3
      ..
413    3
414    1
415    3
416    3
417    3
Name: Pclass, Length: 418, dtype: object

In [98]:
scale_cols=["Age","SibSp","Parch","Fare"]

In [99]:
scale_final_test=final_test[scale_cols]

In [100]:
scale_final_test=pd.DataFrame(ss.transform(scale_final_test))
scale_final_test.columns=scale_cols

In [101]:
scale_final_test.index=final_test.index

In [102]:
final_test.pop("Age")
final_test.pop("SibSp")
final_test.pop("Parch")
final_test.pop("Fare")

0        7.8292
1        7.0000
2        9.6875
3        8.6625
4       12.2875
         ...   
413      8.0500
414    108.9000
415      7.2500
416      8.0500
417     22.3583
Name: Fare, Length: 418, dtype: float64

In [103]:
final_test=pd.concat([final_test,scale_final_test],axis=1)

## Comparing accuracy scores of all algorithms

### Random Forest Classifier - 82.4%
### Logistic Regression - 82.0%
### KNN - 82.1%
### SVM - 84.0%

*** Will vary on every iteration***

## Predictions using test data

In [104]:
final_prediction=svc.predict(final_test)

In [105]:
temp_data=pd.read_csv("/Users/rohithsiddi/Desktop/Titanic Project/test.csv")

In [106]:
submission = pd.DataFrame({'PassengerId':temp_data['PassengerId'],'Survived':final_prediction})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [107]:
filename = 'Titanic Predictions.csv'

submission.to_csv(filename,index=False)

In [108]:
submission.shape

(418, 2)