## 1. Import the Libraries

In [1]:
import numpy as numpy
import pandas as pd
import matplotlib as plt

## 2. Read the training and test data into respective variables

In [2]:
train = pd.read_csv("./trainT.csv")
test  = pd.read_csv("./testT.csv")

## 3. Take a paeak at the dataset

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 4. Determine the variables

In [6]:
df = train[['Survived','Pclass', 'Sex', 'Age', 'Fare']]

## 5. Encoding Gender Values 0 and 1

In [7]:
df['Sex'] = df['Sex'].apply(lambda x: 1 if x=="male" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = df['Sex'].apply(lambda x: 1 if x=="male" else 0)


In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


## 6. handling Missing Values - Data Imputation

In [9]:
df.info

<bound method DataFrame.info of      Survived  Pclass  Sex   Age     Fare
0           0       3    1  22.0   7.2500
1           1       1    0  38.0  71.2833
2           1       3    0  26.0   7.9250
3           1       1    0  35.0  53.1000
4           0       3    1  35.0   8.0500
..        ...     ...  ...   ...      ...
886         0       2    1  27.0  13.0000
887         1       1    0  19.0  30.0000
888         0       3    0   NaN  23.4500
889         1       1    1  26.0  30.0000
890         0       3    1  32.0   7.7500

[891 rows x 5 columns]>

In [10]:
df["Age"] = df["Age"].fillna(df["Age"].median())
# Why median - It is robust to outliers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].fillna(df["Age"].median())


In [11]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


## 7 Set the predictor(x) and response(y) variables

In [13]:
X = df.drop("Survived", axis=1)
Y = df["Survived"]

## 8. Split the data into training and test sets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 25)
#TODO: Cross Validation

## 9. Logistic Regression Model

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
logit = LogisticRegression()
logit.fit(X_train,Y_train)

LogisticRegression()

## 10. Predicting Y values

In [19]:
Y_pred = logit.predict(X_test)

## 11. Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix

In [21]:
conf_mat = confusion_matrix(Y_test, Y_pred)
conf_mat

array([[136,  29],
       [ 31,  72]])

## 12. Accuracy Score

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy_score(Y_test, Y_pred)

0.7761194029850746

## 13. CLassification Report

In [25]:
from sklearn.metrics import classification_report

In [26]:
report = classification_report(Y_test, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       165
           1       0.71      0.70      0.71       103

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268



## 14. Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB

In [29]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train)


GaussianNB()

## 15. Predicting Y Values

In [32]:
Y_pred_nb = gnb.predict(X_test)

## 16. Confusion Matrix

In [33]:
conf_mat_nb = confusion_matrix(Y_test, Y_pred)
conf_mat_nb

array([[130,  35],
       [ 27,  76]])

## 17. Accuracy Score

In [34]:
accuracy_score(Y_test, Y_pred_nb)

0.7686567164179104

## 18. Classification Report

In [36]:
report_nb = classification_report(Y_test, Y_pred_nb)
print(report_nb)

              precision    recall  f1-score   support

           0       0.83      0.79      0.81       165
           1       0.68      0.74      0.71       103

    accuracy                           0.77       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.77      0.77      0.77       268



## 19. Visual Comparison between Logistic Regression and Naive Bayes

In [39]:
plt.pyplot.plot(X_test,Y_test, X_test,Y_pred, X_test,Y_pred_nb)
plt.pyplot.show()