In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input mount.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Inspection

In [None]:
# Load the competition's training split and summarise its columns
# (dtype and non-null count per column).
train_csv_path = '../input/tabular-playground-series-apr-2021/train.csv'
train = pd.read_csv(train_csv_path)
train.info()

Since there is a lot of data, we will just remove any rows with null values. I will also remove the Cabin and Ticket columns, because only a third of the Cabin column was filled in and any relevant details from the Ticket should be embedded in the other variables.

In [None]:
# Cabin and Ticket are discarded outright; after that, every row that still
# has a missing value in any remaining column is removed.
train = train.drop(columns=['Cabin', 'Ticket'])
# `drop` is a boolean flag: True discards the old row labels instead of keeping
# them as a new column. The previous value 'index' only worked because a
# non-empty string is truthy.
train = train.dropna().reset_index(drop=True)
train.info()

In [None]:
# Preview the first five rows that survived the cleaning step.
train.head(n=5)

I will change the Sex and Embarked columns to numbers so they can be evaluated in the heatmap below.

In [None]:
# List the distinct embarkation codes that remain in the training data.
train['Embarked'].unique()

In [None]:
# Sex becomes a 0/1 flag: 1 for 'male', 0 for anything else.
train['Sex'] = train['Sex'].map(lambda sex: int(sex == 'male'))

# Embarked becomes -1 for 'S', 0 for 'C' and 1 for every other value.
embarked_codes = {'S': -1, 'C': 0}
train['Embarked'] = train['Embarked'].map(lambda port: embarked_codes.get(port, 1))
train.head()

In [None]:
# Correlate only the numeric columns. `train` may still contain text columns at
# this point (only Cabin and Ticket were dropped, and only Sex and Embarked were
# encoded), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# whereas older versions silently skipped such columns.
numeric_corr = train.select_dtypes(include='number').corr()

# Draw on an explicit Axes so the figure can carry its own title.
fig, ax = plt.subplots()
sns.heatmap(numeric_corr, annot=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation between numeric training columns')
plt.show()

The variables that are most strongly correlated with survivability are Pclass (-0.29), Sex (-0.51), and Embarked (0.32).

In [None]:
# Keep the target plus the three predictors singled out from the heatmap.
model_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']
train = train.loc[:, model_columns]
train.head()

# Making Predictions

## Multilinear Regression

In [None]:
# SKlearn imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 30% of the rows for scoring; the rest is used for fitting.
labels = train['Survived']
data = train.drop(columns=['Survived'])
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.3, random_state=42
)

# Fit ordinary least squares on the training portion.
model = LinearRegression()
model.fit(x_train, y_train)

# Root-mean-squared error of the predictions on the held-out rows.
held_out_pred = model.predict(x_test)
mse = mean_squared_error(y_test, held_out_pred)
rmse = np.sqrt(mse)

# Report the fit. For a regressor, score() is the R^2 on the given data,
# not a fraction of correctly classified rows.
report = {
    'Score:': model.score(x_test, y_test),
    'Model Intercept:': model.intercept_,
    'Model Coef:': model.coef_,
    'RMSE:': rmse,
}
for label, value in report.items():
    print(label, value)

### Evaluation
The multilinear model isn't very good. Its score (the R² of the regression) is only about 0.30, meaning it explains roughly 30% of the variance in the results rather than classifying 30% of passengers correctly. This isn't too surprising: multilinear regression tends to be good for making predictions with continuous variables. Our data is very discrete, hence we shall try other models.

## K-Nearest Neighbours

In [None]:
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
# Hold out 40% of the rows for scoring; the rest is used for fitting.
data = train.drop(columns=['Survived'])
labels = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4, random_state=42
)

# Each prediction is a vote among the 55 nearest training rows.
clf = KNeighborsClassifier(n_neighbors=55)
clf.fit(x_train, y_train)

# Score the hold-out set once and reuse the value: every score() call repeats
# the full neighbour search, so calling it a second time just to print it
# doubled the cell's run time. knn_weight is also read later by eval() as this
# model's vote weight.
knn_weight = clf.score(x_test, y_test)
print('Score:', knn_weight)

### Evaluation
K-Nearest Neighbours was significantly better than multilinear regression, correctly classifying 76% of the data.

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Hold out 40% of the rows for scoring; the rest is used for fitting.
data = train.drop(columns=['Survived'])
labels = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4, random_state=42
)

# Gaussian Naive Bayes with default settings.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Score the hold-out set once and print the stored value instead of scoring a
# second time. nb_weight is also read later by eval() as this model's vote
# weight.
nb_weight = gnb.score(x_test, y_test)
print('Score:', nb_weight)

### Evaluating
Naive Bayes is as good as KNN. Its score is only 0.6% lower, meaning both models would work just as well with predicting the test data.

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Hold out 40% of the rows for scoring; the rest is used for fitting.
data = train.drop(columns=['Survived'])
labels = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4, random_state=42
)

# Fixing random_state makes the fitted tree repeatable between runs:
# scikit-learn shuffles the candidate features at every split, so an unseeded
# tree may break ties differently each time the cell is executed.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)

# Score the hold-out set once and print the stored value instead of scoring a
# second time. dt_weight is also read later by eval() as this model's vote
# weight.
dt_weight = dtc.score(x_test, y_test)
print('Score:', dt_weight)

### Evaluation
The Decision tree scores 76% which is about the same as the other two classification algorithms.

## Neural Network
Neural networks are only really worth trying if you have a large amount of data. Fortunately we have 96,000 data points to work with, which should be more than enough. Here's an example from the Keras documentation I will be using as a guide: https://keras.io/examples/structured_data/structured_data_classification_from_scratch/#introduction.

In [None]:
from tensorflow import keras
from keras import layers
from keras import Input
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Fully connected binary classifier over the three encoded features
# (Pclass, Sex, Embarked): three hidden ReLU layers and one output unit.
model = Sequential()
model.add(Input(shape=(3,)))
for _ in range(3):
    model.add(Dense(128, activation='relu'))
# A single sigmoid unit yields a probability in [0, 1] for Survived == 1, which
# is what the 'binary_crossentropy' loss used at compile time expects for a 0/1
# label. The previous Dense(2, activation='relu') head produced two unbounded
# values per row, matching neither that loss nor the shape of the labels.
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Adam optimiser with binary cross-entropy loss; accuracy is reported while fitting.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ["accuracy"],
}
model.compile(**compile_options)

# One pass over x_train / y_train, i.e. whatever the most recently executed
# train_test_split cell left in those names.
model.fit(x_train, y_train, epochs=1)

### Evaluation
Neural networks didn't perform as well as I thought they would. The model only trained on 2% of the data but I doubt training on more data would improve the model by much. 

## Conclusion
The best three models were KNN, Decision trees and Naive Bayes. I'll make my model a combination of these three.

In [None]:

def eval(x_t, knn=clf, nb=gnb, dt=dtc):
    """Blend the three classifiers into one 0/1 prediction per row.

    Each model's prediction is multiplied by its hold-out score (the
    notebook-level knn_weight, nb_weight and dt_weight), the products are
    summed and divided by the total weight, and a row is labelled 1 only
    when that weighted average is strictly greater than 0.5.

    Note: this function shadows Python's built-in eval() for the rest of
    the notebook.

    Args:
        x_t: feature rows accepted by the predict() method of all three models.
        knn, nb, dt: fitted classifiers; the defaults are bound to clf, gnb
            and dtc as they exist when this cell is executed.

    Returns:
        A list of ints (0 or 1), one per row of x_t.
    """
    weighted_votes = (
        knn.predict(x_t) * knn_weight
        + nb.predict(x_t) * nb_weight
        + dt.predict(x_t) * dt_weight
    )
    averaged = weighted_votes / (knn_weight + nb_weight + dt_weight)
    return np.where(averaged > 0.5, 1, 0).tolist()
    

# Test Data

In [None]:
# Load the competition's test split and summarise its columns
# (dtype and non-null count per column).
test_csv_path = '../input/tabular-playground-series-apr-2021/test.csv'
test = pd.read_csv(test_csv_path)
test.info()

In [None]:
# Keep only the three model features. The explicit .copy() makes `test` an
# independent frame, so the column assignments that follow do not trigger
# pandas' SettingWithCopyWarning.
test = test[['Pclass', 'Sex', 'Embarked']].copy()
# Same 0/1 encoding as the training data: 1 for 'male', 0 otherwise.
test['Sex'] = test['Sex'].apply(lambda x: 1 if x=='male' else 0)
test.head()

There are 277 null values in the Embarked column. To fill these in, I will make a new classifier that predicts where they embarked based on the other two variables.

In [None]:
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Fit the Embarked imputers only on test rows where every column is known.
# dropna() already returns a new frame, so no separate copy() is needed, and
# reset_index's `drop` flag is a real boolean rather than the truthy string
# 'index' used before.
tcopy = test.dropna().reset_index(drop=True)

# Inputs are every column except Embarked; the target is the Embarked value.
data = tcopy.drop(columns=['Embarked'])
labels = tcopy['Embarked']
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    data, labels, test_size=0.4, random_state=42
)

# Same three model families as for the survival task.
knn_ev = KNeighborsClassifier(n_neighbors=55)
gnb_ev = GaussianNB()
dt_ev = DecisionTreeClassifier()
for imputer in (knn_ev, gnb_ev, dt_ev):
    imputer.fit(x_train1, y_train1)

# Hold-out score of each candidate, printed in the order KNN, Naive Bayes, tree.
for imputer in (knn_ev, gnb_ev, dt_ev):
    print('Score:', imputer.score(x_test1, y_test1))

All the models are as good as each other.

In [None]:
# Rows of `test` whose Embarked value is missing, renumbered from 0.
# isna() already returns a boolean mask, so no `== True` comparison is needed,
# and reset_index's `drop` flag is a real boolean rather than a truthy string.
test_na = test[test['Embarked'].isna()].reset_index(drop=True)
test_na.info()

In [None]:
# Predict an Embarked value for each incomplete row with the KNN imputer,
# feeding it every column except Embarked.
x_testna = test_na.drop(columns=['Embarked'])
emb_na = knn_ev.predict(x_testna)

In [None]:
# Fill the gaps in Embarked with the imputed values. emb_na holds one
# prediction per incomplete row, in the same top-to-bottom order as those rows
# appear in `test`, so a boolean mask replaces the manual row-by-row loop and
# no longer depends on `test` having a default 0..n-1 index.
filled = test['Embarked'].copy()
needs_fill = ~filled.isin(['S', 'C', 'Q'])
# Fail loudly instead of misaligning values if the counts ever disagree
# (for example when this cell is re-run after Embarked was already encoded).
assert needs_fill.sum() == len(emb_na), 'imputed values do not match the number of gaps'
filled[needs_fill] = emb_na

# Same numeric coding as the training data: S -> -1, C -> 0, anything else -> 1.
# assign() builds a new frame, so nothing is written into a frame that pandas
# may have flagged as a copy of the original test data.
port_codes = {'S': -1, 'C': 0}
test = test.assign(Embarked=filled.map(lambda port: port_codes.get(port, 1)))

In [None]:
# Re-check dtypes and non-null counts of `test` now that Embarked has been rewritten.
test.info()

Now that all the null values have been filled in, it's time to run the combined model on the test data.

## Predictions

In [None]:
# Run the weighted ensemble over the prepared test features and peek at the
# first ten predicted labels.
model_survived = eval(x_t=test)
model_survived[0:10]

In [None]:
# Assemble the submission file. The ids are regenerated as a consecutive run
# starting at 100000 because PassengerId was dropped from `test` earlier
# (assumes the ids in test.csv start at 100000 and follow file order — TODO
# confirm against the raw file).
# The length now comes from model_survived: the previous `knn_survived` is not
# defined in any visible cell and raised a NameError here.
first_id = 10**5
model_results = pd.DataFrame({
    'PassengerId': np.arange(first_id, first_id + len(model_survived)),
    'Survived': model_survived,
})
model_results.to_csv('tpsapr21_pipe.csv', index=False)