In [None]:
#import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Raise pandas' display limits so frames with up to 500 rows / 500 columns
# are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

First import the relevant libraries into Python.

In [None]:
# Read the train and test CSV files into two separate frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanicdataset-traincsv/train.csv',
        '../input/titanic-test-set/test.csv',
    )
)

Import the train and test data sets into different dataframes.

In [None]:
# Pairwise correlation between the numeric columns of the training data.
# Text columns are filtered out explicitly: recent pandas releases raise on
# them instead of silently skipping them, and select_dtypes works on old and
# new versions alike.
corrmatrix = train_df.select_dtypes(include=["number", "bool"]).corr()
corrmatrix

In [None]:
# Columns removed from both datasets before modelling.
DROPPED_COLUMNS = ["Name", "Cabin", "Fare", "PassengerId", "Ticket"]

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so this cell can be re-run without raising KeyError.
train_df = train_df.drop(columns=DROPPED_COLUMNS, errors="ignore")
test_df = test_df.drop(columns=DROPPED_COLUMNS, errors="ignore")

I decided to drop the above columns from the datasets because they have no influence on the final prediction. For example, knowing the name, cabin number, fare, passenger ID or ticket does not affect the prediction. I did decide to keep the passenger class (Pclass) column because it tells me which class the passenger belongs to. This is helpful because different classes are assigned to different decks: higher-class passengers are on the upper decks, whereas lower-class passengers are on the lower decks. So, in the event of an accident, it is more likely that people in the lower classes will be hurt first.

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Separate the label column from the feature columns of the training frame.
target = train_df['Survived']
source = train_df.drop(columns=['Survived'])

# Show both shapes side by side as a quick sanity check.
source.shape, target.shape

Create the source (features) and target (label) dataframes. The label column is dropped from `source` because `source` holds the inputs used to train the model, while `target` holds the values to predict.

In [None]:
# Preview the first five rows of the feature frame.
source.head(n=5)

In [None]:
# Preview the first five labels.
target.head(n=5)

In [None]:
# Keep only the columns that remain after removing Age, Parch, SibSp and
# Pclass; these are the ones treated as categorical further down.
df_cat = source.drop(columns=['Age', 'Parch', 'SibSp', 'Pclass'])

# Names of the categorical columns, as a plain list.
cat_attribs = df_cat.columns.tolist()
cat_attribs

Now I need to convert the categorical attributes into numerical values, so I created a new dataframe that holds only the categorical attributes.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Every column of `source` that is not listed as categorical is handled as
# numeric. Without a dedicated branch for these columns, ColumnTransformer's
# default remainder="drop" silently discards them and the model never sees them.
num_attribs = [column for column in source.columns if column not in cat_attribs]

# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own set of columns and concatenate the results.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the feature frame and produce the model-ready matrix.
source_final = full_pipeline.fit_transform(source)


Then I created a pipeline that imputes missing categorical values with the most frequent value and one-hot encodes them. The transformed result is stored in `source_final` (a numeric matrix, not a dataframe).

In [None]:
# Dimensions of the transformed feature matrix.
np.shape(source_final)

In [None]:
# Expose the labels as a NumPy array (no forced copy, which is the default).
target1 = target.to_numpy(copy=False)

In [None]:
# Shape of the label container at this point in the notebook.
np.shape(target)

In [None]:
# Turn the label array into a single row; -1 lets NumPy infer the row length.
target = np.reshape(target1, (1, -1))

In [None]:
# Swap the axes of the label array, then display the resulting shape.
target = target.T
np.shape(target)

In [None]:
# NOTE(review): this transform of the test set is commented out, and
# `test_final` is not used anywhere in the visible part of the notebook.
# Either enable it when test-set predictions are needed or delete this cell.
#test_final = full_pipeline.transform(test_df)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
source_train, source_test, target_train, target_test = train_test_split(
    source_final,
    target,
    test_size=0.25,
    random_state=0,
)

Now let's create a train/test split; the held-out test part is what I use later to calculate accuracy and the other scores.

In [None]:
# Shape of the held-out label array.
np.shape(target_test)

Linear Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score

In [None]:
# Create a logistic-regression classifier with scikit-learn's default
# hyperparameters; it is fitted in a later cell.
LR = LogisticRegression()

I applied logistic regression, but if you want you can also apply other algorithms.

In [None]:
# Fit the classifier on the training split. The labels were reshaped into a
# column vector earlier; scikit-learn expects a 1-D label array and emits a
# DataConversionWarning otherwise, so flatten them here.
LR.fit(source_train, np.ravel(target_train))

In [None]:
# Predicted class label for every row of the held-out feature matrix.
LRpred = LR.predict(X=source_test)

In [None]:
# Mean accuracy on the held-out split. The second argument must be the true
# labels: scoring against the model's own predictions always returns 1.0 and
# says nothing about model quality.
LR.score(source_test, target_test)

In [None]:
# Display the full array of predictions made on the held-out split.
# NOTE(review): this prints every prediction; a slice such as LRpred[:10]
# would keep the output short.
LRpred

As you can see the model worked.

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns are the predicted labels.
confusion_matrix(y_true=target_test, y_pred=LRpred)

As you can see, this model correctly classified 115 + 59 = 174 passengers (the diagonal of the matrix); the rest were false predictions. Note that in scikit-learn's layout the top-left cell counts the true negatives and the bottom-right cell counts the true positives. Below you will find the different accuracy scores. You can try to improve them by trying different algorithms such as XGBoost, KNN and SVC, and you can also apply cross-validation to find the best hyperparameters.

The confusion matrix tells us the true positives and true negatives that the model predicted.

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=target_test, y_pred=LRpred)

In [None]:
from sklearn.metrics import recall_score

# average=None returns one recall value per class instead of a single average.
recall_score(y_true=target_test, y_pred=LRpred, average=None)

In [None]:
from sklearn.metrics import precision_score
precision_score(target_test,LRpred, average=None