In [35]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
if 'Survived' not in df_test.columns: 
    df_test['Survived'] = 0

In [20]:
def Preprocess(df_train, df_test): 
    df = pd.concat([df_train, df_test], axis = 0)
    df = df.drop(['Name','Ticket'], axis=1)

    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('X000')
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    df['Cabin_letter'] = df['Cabin'].str.extract(r'([a-zA-Z]+)', expand = False)  
    df['Cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand = False)

    df = df.drop(['Cabin'], axis = 1)

    df = pd.get_dummies(df, columns=['Cabin_letter'],prefix = 'Cabin')
    df = pd.get_dummies(df, columns = ['Embarked'], prefix = 'Embarked')
    df = pd.get_dummies(df, columns = ['Sex'], prefix = 'Sex')

    df = df.drop(['Cabin_X'], axis = 1)
    df = df.drop(['Embarked_X'], axis = 1)

    df['Cabin_number'] = df['Cabin_number'].fillna(0)
    df['Cabin_number'] = pd.to_numeric( df['Cabin_number'])

    df['Pclass_bin_Fare'] = df['Fare'] // df['Pclass']
    df['Pclass_bin_sex'] = df['Pclass'] - df['Sex_female']
    
    
    df_train = df[:len(df_train)]
    df_test = df[len(df_train):]

    df_test = df_test.drop("Survived", axis=1)
    return df_train, df_test

In [21]:
train_df, test_df = Preprocess(df_train, df_test)

In [22]:
train_df.corr()['Survived']

PassengerId       -0.005007
Survived           1.000000
Pclass            -0.338481
Age               -0.070323
SibSp             -0.035322
Parch              0.081629
Fare               0.257307
Cabin_number       0.229756
Cabin_A            0.022287
Cabin_B            0.175095
Cabin_C            0.114652
Cabin_D            0.150716
Cabin_E            0.145321
Cabin_F            0.057935
Cabin_G            0.016040
Cabin_T           -0.026456
Embarked_C         0.168240
Embarked_Q         0.003650
Embarked_S        -0.155660
Sex_female         0.543351
Sex_male          -0.543351
Pclass_bin_Fare    0.267823
Pclass_bin_sex    -0.533994
Name: Survived, dtype: float64

In [24]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Age              891 non-null    float64
 4   SibSp            891 non-null    int64  
 5   Parch            891 non-null    int64  
 6   Fare             891 non-null    float64
 7   Cabin_number     891 non-null    int64  
 8   Cabin_A          891 non-null    bool   
 9   Cabin_B          891 non-null    bool   
 10  Cabin_C          891 non-null    bool   
 11  Cabin_D          891 non-null    bool   
 12  Cabin_E          891 non-null    bool   
 13  Cabin_F          891 non-null    bool   
 14  Cabin_G          891 non-null    bool   
 15  Cabin_T          891 non-null    bool   
 16  Embarked_C       891 non-null    bool   
 17  Embarked_Q       891 

In [25]:
X = train_df.drop('Survived', axis = 1)
y = train_df['Survived']

In [26]:
X

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Cabin_number,Cabin_A,Cabin_B,Cabin_C,...,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Pclass_bin_Fare,Pclass_bin_sex
0,1,3,22.000000,1,0,7.2500,0,False,False,False,...,False,False,False,False,False,True,False,True,2.0,3
1,2,1,38.000000,1,0,71.2833,85,False,False,True,...,False,False,False,True,False,False,True,False,71.0,0
2,3,3,26.000000,0,0,7.9250,0,False,False,False,...,False,False,False,False,False,True,True,False,2.0,2
3,4,1,35.000000,1,0,53.1000,123,False,False,True,...,False,False,False,False,False,True,True,False,53.0,0
4,5,3,35.000000,0,0,8.0500,0,False,False,False,...,False,False,False,False,False,True,False,True,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,27.000000,0,0,13.0000,0,False,False,False,...,False,False,False,False,False,True,False,True,6.0,2
887,888,1,19.000000,0,0,30.0000,42,False,True,False,...,False,False,False,False,False,True,True,False,30.0,0
888,889,3,29.881138,1,2,23.4500,0,False,False,False,...,False,False,False,False,False,True,True,False,7.0,2
889,890,1,26.000000,0,0,30.0000,148,False,False,True,...,False,False,False,True,False,False,False,True,30.0,1


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2)
y_train = np.reshape(y_train, (-1,1))

In [28]:
X_train.shape, y_train.shape

((712, 22), (712, 1))

In [29]:
model_1 = LogisticRegression()
model_1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
y_pred = model_1.predict(X_test)

In [31]:
accuracy_score(y_test, y_pred)

0.7988826815642458

In [32]:
model_2 = XGBClassifier()
model_2.fit(X_train, y_train)

In [33]:
y_pred = model_2.predict(X_test)

In [34]:
accuracy_score(y_test, y_pred)

0.8379888268156425

In [36]:
model_3 = RandomForestClassifier() 
model_3.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [39]:
y_pred = model_3.predict(X_test)

In [40]:
accuracy_score(y_test, y_pred)

0.8435754189944135

In [41]:
pred = model_3.predict(test_df)

final = pd.DataFrame()

final['PassengerID'] = test_df['PassengerId']
final['Survived'] = pred

final.to_csv('output.csv', index = False)