In [None]:
# List every file that Kaggle has mounted under the input directory
import os

input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))



**This notebook comprises the following, in brief:**

1. Data formatting
2. Exploratory Data Analysis
3. Model Selection
4. Metrics Evaluations
5. Model prediction and submission

In [None]:
# Data manipulation, plotting and plot-styling modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Classifier modules
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

# Metric evaluation modules
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,classification_report

In [None]:
# Load the training and test splits from the Kaggle Titanic input directory
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test  = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# (rows, columns) of the train and test frames
train.shape, test.shape

In [None]:
train.head()  # preview the first rows of the train table

In [None]:
test.head()  # preview the first rows of the test table

In [None]:
train.info()  # column dtypes and non-null counts for the train table

In [None]:
test.info()  # column dtypes and non-null counts for the test table

**From the output above we can see that both the train and test dataframes contain categorical as well as numerical variables.
We can also see that four variables have missing values across the train and test dataframes: Age, Cabin, Fare and Embarked.**

In [None]:
# Brief statistical summary of the numerical columns
train.describe()

**From the summary above we can see that the average age of the passengers was about 29 years, although some passengers were as old as 80 years.
The average fare was about 32, although some tickets cost as much as 512.**

In [None]:
# Brief summary (count, unique, top, freq) of the object-typed columns
train.describe(include=['object'])

**From the summary above, we can note that the most common port of embarkation was Southampton (S), and that there were more male than female passengers on the ship.
We have 147 unique cabin string values.**

In [None]:
# Number of missing values per column in the train table
train.isna().sum()

**We have 177 records with missing age, 2 records with missing embarked and the most records with missing values are from the Cabin variable.**

In [None]:
# Number of missing values per column in the test table
test.isna().sum()

**In the test dataframe we have 1 record with a missing Fare, 86 records with missing Age and 327 records
with missing cabin entries.**

In [None]:
# Total number of passengers in the train table
All_rows = len(train)

In [None]:
# Rows of the train table for passengers who survived, and how many there are
Survived = train.loc[train['Survived'] == 1]
len(Survived)

In [None]:
# Rows of the train table for passengers who did not survive, and how many there are
Did_Not_Survive = train.loc[train['Survived'] == 0]
len(Did_Not_Survive)

In [None]:
# Share of each outcome as a percentage of all training rows
n_survived = len(Survived)
n_not_survived = len(Did_Not_Survive)
percent_of_survived = n_survived / All_rows * 100
percent_of_Not_survive = n_not_survived / All_rows * 100

In [None]:
# Report both percentages rounded to two decimal places
print("The percentage of passengers who survived: %.2f%% "%percent_of_survived)
print("The percentage of passengers who did not survive: %.2f%%"%percent_of_Not_survive)

In [None]:
# Number of passengers of each sex
train.groupby('Sex')['Sex'].count()

Code above shows that we had more males than females.

In [None]:
# Number of passengers in each ticket class
train.groupby('Pclass')['Pclass'].count()

From the code above, we see that most passengers used the 3rd class
probably because it was less costly

In [None]:
# Number of passengers per port of embarkation (missing ports are not counted)
train.groupby('Embarked')['Embarked'].count()

**We can see from the code above that most people boarded the ship from Southampton(S)**

In [None]:
# Passenger counts for every (sex, ticket class) combination
train.groupby(['Sex', 'Pclass'])['PassengerId'].count()

**The output above shows that we had more male passengers than female passengers in the 1st class. For both females and males, the highest number of passengers was in the 3rd class. Generally, there were more male than female passengers across all the classes.**

In [None]:
# Passenger counts for every (sex, survival outcome) combination
train.groupby(['Sex', 'Survived'])['PassengerId'].count()

In [None]:
# Count of survivors / non-survivors split by sex.
fig, ax = plt.subplots()

# countplot is the axes-level function behind catplot(kind="count"): unlike the
# figure-level catplot it accepts a target `ax`, so no extra figure is created
# and nothing has to be closed afterwards.
sns.countplot(x="Survived", hue="Sex", data=train,
              palette={'male': "Blue", 'female': "Green"}, ax=ax)

ax.legend(title="Gender")
plt.show()

**From the plot above, we can see that more women survived than men.
Nearly 3/4 of the men died on the ship, whereas about 3/4 of the women survived.**

In [None]:
# Passenger counts for every (ticket class, survival outcome) combination
train.groupby(['Pclass', 'Survived'])['PassengerId'].count()

In [None]:
# Count of survivors / non-survivors split by ticket class.
fig, ax = plt.subplots()

# Axes-level countplot draws directly on `ax`; the figure-level catplot does not
# accept a target axes and would open a second figure.
sns.countplot(x="Survived", hue="Pclass", data=train,
              palette={1: "yellow", 2: "orange", 3: "red"}, ax=ax)

ax.legend(title="Passenger Class")
ax.set_title("Pclass vs. Survival for Titanic Passengers");

**From the plot above, we can see that most survivors were from the upper (1st) class, whereas most non-survivors were from the third class.**

In [None]:
# Cross-tabulate number of parents/children aboard (rows) against survival (columns)
table = pd.crosstab(index=train['Parch'], columns=train['Survived'])
table

**From the table above, we can see that the highest number of survivors were travelling without any parents or children aboard (Parch = 0).**

In [None]:
# Plot how many passengers fall into each survival class.
labels = ['Did_Not_Survive', 'Survived']
# Series.value_counts replaces the deprecated top-level pd.value_counts.
# Sorting by index (0, 1) guarantees the bars line up with `labels`
# regardless of which class happens to be more frequent.
classes = train['Survived'].value_counts().sort_index()
ax = classes.plot(kind='bar', rot=0)
ax.set_title("Survival class distribution")
ax.set_xticks(range(2))
ax.set_xticklabels(labels)
ax.set_xlabel("Class")
ax.set_ylabel("Frequency");

Generally, the plot above shows many more non-survivors than survivors.

In [None]:
# Mean survival rate per sex, one line per ticket class.
# factorplot was renamed to catplot (its default kind became "strip", so the
# point plot is requested explicitly) and `size` was renamed to `height`.
sns.catplot(x='Sex', y='Survived', hue='Pclass', kind='point',
            height=8, aspect=2, data=train);

**From the plot above, we can see that women in the 1st class had a survival rate close to 99%, whereas men in the same class had a survival rate close to 40%.
Women in the 3rd class had a survival rate of about 50%, whereas men in the same class had a survival rate below 20%.**

In [None]:
# Age distribution of survivors vs. non-survivors.
fig, ax = plt.subplots(figsize=(8,8))

# Age contains missing values; drop them explicitly rather than relying on the
# plotting backend to ignore NaN.
age_survived = train.loc[train["Survived"]==1, "Age"].dropna()
age_not_survived = train.loc[train["Survived"]==0, "Age"].dropna()

# Use one shared set of bin edges so the bars of the two groups cover the same
# age intervals and their heights can be compared directly.
age_bins = np.histogram_bin_edges(train["Age"].dropna(), bins=15)

ax.hist(age_survived, bins=age_bins, alpha=0.8, color="blue", label="survived")
ax.hist(age_not_survived, bins=age_bins, alpha=0.8, color="green", label="did not survive")

ax.set_xlabel("Age")
ax.set_ylabel("Count of passengers")

fig.suptitle("Age vs. Survival for Titanic Passengers")

ax.legend();

**The highest number of passengers who did not survive were between the ages of 15 and 30 years.
The highest number of passengers who survived were between the ages of 15 and 35 years.
Both histograms are generally skewed to the right.
We can also see that more elderly passengers died than survived.**

In [None]:
# Fare distribution of survivors vs. non-survivors.
fig, ax = plt.subplots(figsize=(8,8))

fare_survived = train.loc[train["Survived"]==1, "Fare"].dropna()
fare_not_survived = train.loc[train["Survived"]==0, "Fare"].dropna()

# Use one shared set of bin edges: with bins=15 computed per group, each group
# gets its own bin width (the groups span different fare ranges), which makes
# the bar heights impossible to compare.
fare_bins = np.histogram_bin_edges(train["Fare"].dropna(), bins=15)

ax.hist(fare_survived, bins=fare_bins, alpha=0.5, color="blue", label="survived")
ax.hist(fare_not_survived, bins=fare_bins, alpha=0.5, color="red", label="did not survive")

ax.set_xlabel("Fare")
ax.set_ylabel("Count of passengers")

fig.suptitle("Fare vs. Survival for Titanic Passengers")

ax.legend();

**The histogram above shows that most passengers who did not survive had cheaper fares.
We see that passengers who survived had paid somewhat higher fares than those who did not survive.
The distribution is skewed to the right, with only a few passengers able to afford the most expensive fares.**

In [None]:
# Passenger counts for every (siblings/spouses aboard, survival outcome) combination
train.groupby(['SibSp', 'Survived'])['PassengerId'].count()

In [None]:
# Count of survivors / non-survivors split by number of siblings/spouses aboard.
fig, ax = plt.subplots(figsize=(10,10))

# Axes-level countplot draws on `ax` directly; catplot is figure-level, does not
# accept a target axes, and would create an extra figure.
sns.countplot(x="Survived", hue="SibSp", data=train,
              palette={1: "yellow", 2: "orange", 3: "red", 4: 'green',
                       5: 'brown', 0: 'cyan', 8: 'violet'},
              ax=ax)

ax.legend(title="spouses/sib aboard ");

**From the plot above, we can see that the largest group of non-survivors had no siblings or spouses aboard.
It can also be noted that the largest group of survivors had no siblings or spouses aboard.**

In [None]:
# Passenger counts for every (parents/children aboard, survival outcome) combination
train.groupby(['Parch', 'Survived'])['PassengerId'].count()

In [None]:
# Count of survivors / non-survivors split by number of parents/children aboard.
fig, ax = plt.subplots(figsize=(10,6))

# Axes-level countplot draws on `ax` directly; catplot is figure-level, does not
# accept a target axes, and would create an extra figure.
sns.countplot(x="Survived", hue="Parch", data=train,
              palette={1: "yellow", 2: "orange", 3: "red", 4: 'green',
                       5: 'brown', 0: 'cyan', 6: 'purple'},
              ax=ax)

ax.legend(title="parents/children aboard ");

**From the plot above, we can see that there were more non-survivors than survivors among the passengers who did not travel with a parent or child.
However, among the passengers who did travel with parents or children, most survivors had exactly one parent or child aboard.**

In [None]:
# Heat map of the pairwise correlations between the numerical features.
# Restrict to numeric columns first: the frame still holds string columns and
# DataFrame.corr raises on those in recent pandas versions.
correlation_matrix = train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(8,4))
sns.heatmap(correlation_matrix, vmax=0.8, annot=True, ax=ax)
plt.show()

**From the correlation matrix, we can see that some features are positively correlated with the target column and some are negatively correlated. We will use most of the columns here.**

**Feature preprocessing and selection**

We are going to do the following:
1. Preprocess some variables of high significance, as seen in the matrix above.
2. Select features by dropping those that won't be of great significance in our modelling process.
3. Group the train and test data together so that the same preprocessing is applied to both.

In [None]:
# Columns removed from both frames before modelling
cols_to_drop= ['Name','Ticket','Cabin']

In [None]:
# Remove the unused columns from both frames (`columns=` already implies axis=1)
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

# Separate the target from the training features
y_train = train['Survived']
train = train.drop(columns=['Survived'])

In [None]:
# Group the train and test frames in a list so the same preprocessing step can
# be looped over both. The frames are not concatenated; the list holds
# references to the original objects.
data_combined = [train, test]

In [None]:
# Fill missing ages with the median age of the TRAINING data in both frames.
# Imputing the test set with a statistic learned from the training set keeps
# the preprocessing identical for both and avoids using test-set statistics.
age_median = train['Age'].median()
for frame in data_combined:
    frame['Age'] = frame['Age'].fillna(age_median)

In [None]:
# Missing ages left in the train frame (expected: 0)
train['Age'].isnull().sum()

In [None]:
# Missing ages left in the test frame (expected: 0).
# This cell used to repeat the train check, so the test frame was never verified.
test.isnull().sum()['Age']

In [None]:
# Fill missing fares with the median fare of the TRAINING data in both frames,
# so the test set is imputed with a statistic learned from the training set.
fare_median = train['Fare'].median()
for frame in data_combined:
    frame['Fare'] = frame['Fare'].fillna(fare_median)

In [None]:
# Missing fares left in the test frame (expected: 0)
test['Fare'].isnull().sum()

**Let's preprocess the embarkation variable and fill in its missing values with the most common embarkation port, 'S' (Southampton).**

In [None]:
# Replace missing embarkation ports with 'S' in both frames
for frame in data_combined:
    filled_port = frame['Embarked'].fillna('S')
    frame['Embarked'] = filled_port

In [None]:
# Missing embarkation ports left in the train frame (expected: 0)
train['Embarked'].isnull().sum()

In [None]:
# Convert Pclass to strings in both frames so that it is treated as a
# categorical variable rather than a number.
# NOTE(review): the loop variable `dataset` stays bound to the last frame (test)
# after the loop and leaks into the notebook namespace; other cells may read it,
# so its name is kept unchanged here.
for dataset in data_combined:
    dataset['Pclass']= dataset['Pclass'].astype('str')

In [None]:
train.head(2)  # first two rows of the preprocessed train frame

In [None]:
test.head(2)  # first two rows of the preprocessed test frame

In [None]:
# Select the categorical (object-typed) columns.
# The columns are read from `train` explicitly instead of from a loop variable
# left over from an earlier cell, so this cell does not depend on which loop
# happened to run last.
cat_cols = [cname for cname in train.columns
            if train[cname].dtype == "object"]

In [None]:
# One-hot encode the categorical columns of both frames.
# dtype=int gives 0/1 indicator columns regardless of the pandas version.
Train_cat_colsOH = pd.get_dummies(train[cat_cols], dtype=int)
Test_cat_colsOH = pd.get_dummies(test[cat_cols], dtype=int)

# Encoding each frame separately can yield different columns when a category is
# absent from one of them. Align the test columns to the training columns
# (same set, same order) and fill any category missing from test with 0.
Test_cat_colsOH = Test_cat_colsOH.reindex(columns=Train_cat_colsOH.columns,
                                          fill_value=0)

In [None]:
Train_cat_colsOH.head()  # preview the one-hot encoded train columns

In [None]:
Test_cat_colsOH.head()  # preview the one-hot encoded test columns

In [None]:
# Select the numerical (int64 / float64) columns.
# The columns are read from `train` explicitly instead of from a loop variable
# left over from an earlier cell.
num_cols = [cname for cname in train.columns
            if train[cname].dtype in ['int64', 'float64']]

In [None]:
# Numerical part of each frame
train_num_data = train.loc[:, num_cols]
test_num_data = test.loc[:, num_cols]

In [None]:
# Put the one-hot encoded and numerical parts side by side (column-wise)
train = pd.concat(objs=[Train_cat_colsOH, train_num_data], axis='columns')
test = pd.concat(objs=[Test_cat_colsOH, test_num_data], axis='columns')

In [None]:
train.head()  # preview the assembled train feature table

In [None]:
test.head()  # preview the assembled test feature table

In [None]:
# PassengerId is an identifier, not a feature: leave it out of the model inputs
id_column = ['PassengerId']
X_train = train.drop(columns=id_column)
X_test = test.drop(columns=id_column).copy()

In [None]:
# Shapes of the training features, training target and test features
X_train.shape , y_train.shape, X_test.shape

**Model evaluations on various models.**

In [None]:
# Logistic regression baseline.
# The features are unscaled, so the solver may stop at the default iteration cap
# before converging; the resulting warning would be hidden by the notebook's
# global warning filter. A higher cap lets the solver converge.
LOG_R_clf = LogisticRegression(max_iter=1000)
LOG_R_clf.fit(X_train, y_train)
y_pred_LR = LOG_R_clf.predict(X_test)
# Accuracy on the training data itself, as a percentage
LOG_R_score = LOG_R_clf.score(X_train, y_train) * 100
print("The Logistic Regression train Accuracy = {:.2f}".format(LOG_R_score))

In [None]:
# Support vector classifier with default settings
SVC_clf = SVC()
SVC_clf.fit(X_train, y_train)

# Accuracy on the training data itself, as a percentage
svc_train_accuracy = SVC_clf.score(X_train, y_train)
SVC_score = svc_train_accuracy * 100
y_pred_SVC = SVC_clf.predict(X_test)
print("The SVC train Accuracy = {:.2f}".format(SVC_score))

In [None]:
# k-nearest-neighbours classifier using the 3 closest training points
KNN_clf = KNeighborsClassifier(n_neighbors=3)
KNN_clf.fit(X_train, y_train)

# Accuracy on the training data itself, as a percentage
knn_train_accuracy = KNN_clf.score(X_train, y_train)
KNN_score = knn_train_accuracy * 100
y_pred_KNN = KNN_clf.predict(X_test)
print("The KNeighbors Classifier train Accuracy = {:.2f}".format(KNN_score))

In [None]:
# Random forest with 100 trees.
# A fixed random_state makes the fitted forest, its scores and the test
# predictions identical on every run of the notebook.
RF_clf = RandomForestClassifier(n_estimators=100, random_state=42)
RF_clf.fit(X_train, y_train)
prediction_train = RF_clf.predict(X_train)  # predictions on the training data
prediction_test = RF_clf.predict(X_test)    # predictions on the test data
# Accuracy on the training data itself, as a percentage
RF_score = RF_clf.score(X_train, y_train) * 100
print("The Random Forest train Accuracy = {:.2f}".format(RF_score))

In [None]:
# Gradient boosting with 100 boosting stages.
# A fixed random_state makes the fitted model and its score reproducible.
GB_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
GB_clf.fit(X_train, y_train)
y_pred_GB = GB_clf.predict(X_test)
# Accuracy on the training data itself, as a percentage
GB_score = GB_clf.score(X_train, y_train) * 100
print("Gradient Boosting Classifier train Accuracy = {:.2f}".format(GB_score))

**Visualise the train accuracy of the models so as to select the one with the best accuracy.**

In [None]:
# Create the figure and axes up front and hand the axes to DataFrame.plot:
# without `ax=`, pandas opens its own figure, leaving this one empty and the
# requested figsize unused.
fig, ax = plt.subplots(figsize=(10,8))

# Dataframe to hold the results
model_comparison = pd.DataFrame({'model': ['LR_clf', 'SVC_clf',
                                           'KNN_clf', 'RF_clf',
                                           'GB_clf'],
                                 'Accuracy': [LOG_R_score, SVC_score, KNN_score, RF_score, GB_score]})

# Horizontal bar chart of train accuracy, lowest score first
model_comparison.sort_values('Accuracy').plot(x='model', y='Accuracy',
                                              kind='barh', color='blue',
                                              edgecolor='black', ax=ax)

# Plot formatting
ax.set_ylabel('Model')
ax.set_xlabel('Accuracy Score')
ax.tick_params(axis='both', labelsize=10)
ax.set_title('Model Comparison on Train Accuracy Score', size=14);

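**We are going to use the classifier with the highest train accuracy, which is the Random Forest Classifier, and take its predictions on the test set as our final predictions. Note that accuracy measured on the training data favours models that memorise it, so a held-out or cross-validated score would be a more reliable basis for this choice.**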

In [None]:
# Precision, recall and F1 of the training-set predictions.
# NOTE(review): despite the GBC_ prefix, `prediction_train` holds the random
# forest's predictions; the names are kept because other cells read them.
GBC_precison_score = precision_score(y_true=y_train, y_pred=prediction_train)
GBC_recall_score = recall_score(y_true=y_train, y_pred=prediction_train)
GBC_f1_score = f1_score(y_true=y_train, y_pred=prediction_train)

In [None]:
# Per-class precision / recall / F1 of the training-set predictions
train_report = classification_report(y_true=y_train, y_pred=prediction_train)
print(train_report)

In [None]:
# The metric is computed from the random forest's training predictions, so the
# label names that model (it previously said "gradient boosting").
print("The random forest precision_score = {:.2f}".format(GBC_precison_score))

In [None]:
# The metric is computed from the random forest's training predictions, so the
# label names that model (it previously said "gradient boosting").
print("The random forest recall_score = {:.2f}".format(GBC_recall_score))

In [None]:
# The metric is computed from the random forest's training predictions, so the
# label names that model (it previously said "gradient boosting").
print("The random forest f1_score = {:.2f}".format(GBC_f1_score))

In [None]:
# Confusion matrix of the training-set predictions.
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns, both ordered by label (0 = did not survive, 1 = survived), so
# the column labels name the predicted class in that same order.
Label_1 = ['Predicted Did_Not_Survive', 'Predicted Survived']
Label_2 = ['True Did_Not_Survive', 'True Survived']
conf_matrix = confusion_matrix(y_train, prediction_train)
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, xticklabels=Label_1, yticklabels=Label_2, annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

**In reading the heatmap above, we treat class 0 (did not survive) as the positive class and class 1 (survived) as the negative class.
From the heatmap, we can see that the model misclassified 7 passengers as False Negatives (FN): it placed them among the passengers who survived although they did not actually survive.
The model also misclassified 11 passengers as False Positives (FP): it placed them among the passengers who did not survive although they actually survived.**

In [None]:
# Build the Kaggle submission: one row per test passenger with the predicted outcome
submission = test[["PassengerId"]].assign(Survived=prediction_test)

# Write it without the index column
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first ten submission rows, keyed by passenger
submission.set_index('PassengerId').head(10)

**This notebook is still being improved. All comments are welcome.**