In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
if 'Survived' not in df_test.columns:
    df_test['Survived'] = 0 

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def Preprocess(df_train, df_test):
    df = pd.concat([df_train, df_test], axis = 0)
    df = df.drop(['Name', 'Ticket'], axis = 1)
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('X000')
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    df['cabin_letter'] = df['Cabin'].str.extract(r'([a-zA-z]+)', expand = False)
    df['cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand = False)
    df = df.drop('Cabin', axis = 1)

    df = pd.get_dummies(df, columns = ['cabin_letter'], prefix = 'cabin')
    df = pd.get_dummies(df, columns = ['Embarked'], prefix = 'Embarked')
    df = pd.get_dummies(df, columns = ['Sex'], prefix = 'Sex')

    df = df.drop('cabin_X', axis = 1)
    df = df.drop('Embarked_X', axis = 1)

    df['cabin_number'] = df['cabin_number'].fillna(0)
    df['cabin_number'] = pd.to_numeric(df['cabin_number'])

    df['Pclass_bin_Fare'] = df['Fare'] // df['Pclass']
    df['Pclass_bin_sex'] = df['Pclass'] - df['Sex_female']

    df_train = df[:len(df_train)]
    df_test = df[len(df_train):]

    df_test = df_test.drop('Survived', axis = 1)

    return df_train, df_test

In [5]:
train_df, test_df = Preprocess(df_train, df_test)

In [6]:
train_df.corr()['Survived']

PassengerId       -0.005007
Survived           1.000000
Pclass            -0.338481
Age               -0.070323
SibSp             -0.035322
Parch              0.081629
Fare               0.257307
cabin_number       0.229756
cabin_A            0.022287
cabin_B            0.175095
cabin_C            0.114652
cabin_D            0.150716
cabin_E            0.145321
cabin_F            0.057935
cabin_G            0.016040
cabin_T           -0.026456
Embarked_C         0.168240
Embarked_Q         0.003650
Embarked_S        -0.155660
Sex_female         0.543351
Sex_male          -0.543351
Pclass_bin_Fare    0.267823
Pclass_bin_sex    -0.533994
Name: Survived, dtype: float64

In [7]:
X = train_df.drop('Survived', axis = 1)
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)
y_train = np.reshape(y_train, (-1, 1))

In [8]:
X_train.shape, y_train.shape

((712, 22), (712, 1))

In [9]:
model_1 = LogisticRegression()
model_1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
y_pred = model_1.predict(X_test)

In [11]:
accuracy_score(y_test, y_pred)

0.776536312849162

In [12]:
model_2 = XGBClassifier(enable_categorical = True)
model_2.fit(X_train, y_train)

In [13]:
y_pred = model_2.predict(X_test)

accuracy_score(y_test, y_pred)

0.776536312849162

In [14]:
model_3 = RandomForestClassifier()
model_3.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [15]:
y_pred = model_3.predict(X_test)

accuracy_score(y_test, y_pred)

0.7932960893854749

In [16]:
pred = model_3.predict(test_df)

final = pd.DataFrame()
final['PassengerId'] = test_df['PassengerId']
final['Survived'] = pred

final.to_csv('output.csv', index = False)