In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the training split; the path is relative to the notebook's working directory.
train_data = pd.read_csv("train.csv")
# The test split is left commented out and is not loaded in this notebook.
#test_data = pd.read_csv("test.csv")

In [None]:
# Print column names, dtypes and non-null counts to spot columns with missing values.
train_data.info()

In [None]:
# Print the (rows, columns) shape of the training frame.
print(train_data.shape)

In [None]:
# Summary statistics for the numeric columns.
train_data.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns.
train_data.describe(include=['O'])

In [None]:
# Show the first ten rows as a quick sanity check of the loaded data.
train_data.head(10)

In [None]:
# Count missing values per column and express them as a percentage of all rows.
null_counts = train_data.isnull().sum()
row_counts = train_data.isnull().count()

# Both series are sorted so that the columns with the most gaps come first.
total = null_counts.sort_values(ascending=False)
percent_1 = round(null_counts / row_counts * 100).sort_values(ascending=False)

missing_data = pd.concat(
    {'Total': total, 'Percentage (%)': percent_1},
    axis=1,
)
missing_data.head(5)

In [None]:
# Mean ticket fare, rounded to three decimals for display.
average_fare = round(train_data['Fare'].mean(), 3)
print("Average Fare: ", average_fare)

In [None]:
# Mean passenger age, rounded to the nearest whole year (NaN ages are skipped by mean()).
average_age = round(train_data['Age'].mean())
print("Average Traveller Age: ", average_age)

In [None]:
# Count of passengers per value of the `Survived` column.
sns.set(rc={'figure.figsize': (7, 7)})
# Pass the column by keyword: newer seaborn releases make x/y keyword-only,
# so a positional Series would be interpreted as `data` instead of `x`.
sns.countplot(x='Survived', data=train_data)
plt.title('Survived or Not Survived')

In [None]:
# Number of passengers travelling in each ticket class (Pclass 1, 2 and 3).
pclass_column = train_data['Pclass']
first_class_count = pclass_column.eq(1).sum()
second_class_count = pclass_column.eq(2).sum()
third_class_count = pclass_column.eq(3).sum()

print(" Number of Passangers in First Class: ", first_class_count)
print(" Number of Passangers in Second Class: ", second_class_count)
print(" Number of Passangers in Third Class: ", third_class_count)

In [None]:
# Pie chart of the share of passengers in each ticket class.
labels = ['First Class', 'Second Class', 'Third Class']
# Use the counts computed in the previous cell; the original passed the
# variable *names* as string literals, which plt.pie cannot plot.
sizes = [first_class_count, second_class_count, third_class_count]
plt.pie(sizes, labels=labels, autopct='%1.2f%%')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Class Distribution')
plt.show()

In [None]:
# Bar height is the mean of `Survived` within each passenger class.
sns.barplot(data=train_data, y='Survived', x='Pclass')
plt.title('Class based Survival Rate')

In [None]:
# Age based survival in each class of Pclass:
# one histogram per (Pclass row, Survived column) combination.
# `size` was renamed to `height` in seaborn 0.9 and is rejected by later releases.
grid_data = sns.FacetGrid(train_data, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid_data.map(plt.hist, 'Age', alpha=.5, bins=20, color='orange')
grid_data.add_legend()

In [None]:
# Passenger counts per class, split by survival outcome.
# Columns are passed by name together with `data=`: newer seaborn releases
# make x/y keyword-only, so a positional Series would be taken as `data`.
sns.countplot(x='Pclass', hue='Survived', data=train_data, color='green')
plt.title('Survived and not survived within three class')
plt.xlabel('Three Classes')
plt.ylabel('Count')
plt.show()

In [None]:
# Density estimation of survived/not survived in each passenger class (1, 2 and 3)
est = sns.kdeplot(train_data.Pclass[train_data.Survived == 0], label='Not Survived', color='pink')
est = sns.kdeplot(train_data.Pclass[train_data.Survived == 1], label='Survived', color='red')
plt.title('Survived wrt Pclass ')
plt.xlabel('Passanger classes')
# A KDE curve shows an estimated density, not a count of passengers.
plt.ylabel('Density')
# Draw the legend explicitly; current seaborn no longer adds one just
# because `label=` was given, and the two curves are otherwise unlabelled.
plt.legend()
plt.show()

In [None]:
# Histograms of the numerical features, one panel per feature in a 2x2 grid.
plt.rcParams['figure.figsize'] = (20, 10)

num_features = ['Age', 'SibSp', 'Parch', 'Fare']
xaxes = num_features
yaxes = ['Counts'] * len(num_features)

fig, axes = plt.subplots(nrows=2, ncols=2)
axes = axes.ravel()  # flatten the 2x2 grid so it can be walked alongside the features

for ax, column, x_label, y_label in zip(axes, num_features, xaxes, yaxes):
    # Missing values are dropped so the histogram only bins observed values.
    observed = train_data[column].dropna()
    ax.hist(observed, bins=40, color='lightblue')
    ax.set_xlabel(x_label, fontsize=20)
    ax.set_ylabel(y_label, fontsize=20)
    ax.tick_params(axis='both', labelsize=15)

In [None]:
plt.rcParams['figure.figsize'] = (15, 7)

# import the package for visualization of the correlation
from yellowbrick.features import Rank2D

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
# NumPy array and is available on every pandas version.
# NOTE(review): 'Age' still contains NaN at this point in the notebook, which
# presumably propagates into its Pearson coefficients - confirm, and consider
# dropping or imputing the missing ages before ranking.
X = train_data[num_features].values
visualizer = Rank2D(features=num_features, algorithm='pearson')
visualizer.fit(X)
visualizer.transform(X)
# `poof()` is the deprecated name of `show()`; use `show()` to match the
# ClassificationReport cell further down in this notebook.
visualizer.show()

In [None]:
# Define function to plot a bar chart for various features in the dataset
def Bar_Chart(features):
    """Plot a stacked bar chart of value counts of `features` by survival outcome.

    Reads the module-level `train_data` frame, counts the values of the
    given column separately for rows with ``Survived == 1`` and
    ``Survived == 0``, and draws one stacked bar per outcome.

    Parameters
    ----------
    features : column label of `train_data` whose values are counted.
    """
    outcome_column = train_data['Survived']
    counts_by_outcome = pd.DataFrame([
        train_data.loc[outcome_column == 1, features].value_counts(),
        train_data.loc[outcome_column == 0, features].value_counts(),
    ])
    # Row labels become the x-axis categories of the bar chart.
    counts_by_outcome.index = ['survived', 'notsurvived']
    counts_by_outcome.plot(kind='bar', stacked=True, figsize=(11, 6))

In [None]:
# Survival outcome broken down by the values of the 'Embarked' column.
Bar_Chart('Embarked')

In [None]:
# Survival outcome broken down by the values of the 'Sex' column.
Bar_Chart('Sex')

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

female = train_data[train_data['Sex'] == 'female']
male = train_data[train_data['Sex'] == 'male']

for ax, passengers in zip(axes, (female, male)):
    # Missing ages are dropped before binning.
    ages_survived = passengers.loc[passengers['Survived'] == 1, 'Age'].dropna()
    ages_not_survived = passengers.loc[passengers['Survived'] == 0, 'Age'].dropna()
    sns.distplot(ages_survived, bins=20, label=survived, ax=ax, kde=False)
    sns.distplot(ages_not_survived, bins=40, label=not_survived, ax=ax, kde=False)
    ax.legend()

axes[0].set_title('Female')
ax = axes[1]
ax.set_title('Male')

In [None]:
# Missing-value count per column, checked before the imputation cells below.
train_data.isnull().sum()

Treating Missing Values in the dataset 

In [None]:
def fill_na_median(data, inplace=True):
    """Replace missing values in `data` with the median of `data`.

    Parameters
    ----------
    data : pandas object supporting ``median()`` and ``fillna()``.
    inplace : forwarded to ``fillna``; when true `data` is modified
        directly, otherwise a filled copy is produced.

    Returns
    -------
    Whatever ``data.fillna`` returns for the given `inplace` flag.
    """
    median_value = data.median()
    return data.fillna(median_value, inplace=inplace)
# Assign the filled column back instead of filling `train_data['Age']` in place:
# an in-place fill on a column selected from the frame is a chained assignment,
# which pandas warns about and which does not update `train_data` under
# Copy-on-Write. Downstream models need 'Age' to be free of NaN.
train_data['Age'] = fill_na_median(train_data['Age'], inplace=False)
train_data['Age'].describe()

In [None]:
def fill_na_most(data, inplace=True):
    """Replace missing values in `data` with its most frequent value.

    The original implementation always filled with the literal ``'S'``;
    the fill value is now derived from the data, so the helper also
    works for columns whose most common value is something else.

    Parameters
    ----------
    data : pandas Series to fill.
    inplace : forwarded to ``fillna``; when true `data` is modified
        directly, otherwise a filled copy is produced.

    Returns
    -------
    Whatever ``data.fillna`` returns for the given `inplace` flag.
    """
    modes = data.mode()
    # mode() is empty when every value is missing; keep the historical
    # 'S' default in that case. On ties the first (sorted) mode is used.
    fill_value = modes.iloc[0] if not modes.empty else 'S'
    return data.fillna(fill_value, inplace=inplace)
# Assign the filled column back instead of filling `train_data['Embarked']` in
# place: an in-place fill on a column selected from the frame is a chained
# assignment, which does not update `train_data` under pandas Copy-on-Write.
train_data['Embarked'] = fill_na_most(train_data['Embarked'], inplace=False)
train_data['Embarked'].describe()

In [None]:
def log_transformation(data):
    """Return `data` transformed with ``log(1 + x)``.

    Parameters
    ----------
    data : pandas object supporting ``apply``.

    Returns
    -------
    The result of applying ``numpy.log1p`` through ``data.apply``.
    """
    transformed = data.apply(np.log1p)
    return transformed
# Store the log(1 + x) transformed fare as a new column next to the raw fare.
fare_column = train_data['Fare']
train_data['Fare_log1p'] = log_transformation(fare_column)

train_data.describe()

In [None]:
# Distribution of the log-transformed fare.
fare_log_fig, fare_log_ax = plt.subplots()
fare_log_ax.hist(train_data['Fare_log1p'], bins=40)
fare_log_ax.set_xlabel('Fare_log1p', fontsize=15)
fare_log_ax.set_ylabel('Counts', fontsize=15)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to be expanded into indicator (dummy) columns.
cat_f = ['Pclass', 'Sex', "Embarked"]

# Pclass is numeric in the raw data; map it to names so get_dummies treats
# it as a category rather than leaving it as a single numeric column.
pclass_names = {1: 'First', 2: 'Second', 3: 'Third'}
train_data_cat = train_data[cat_f].replace({'Pclass': pclass_names})

train_data_cat_dummies = pd.get_dummies(train_data_cat)
train_data_cat_dummies.head()

In [None]:
# Model inputs: the numeric columns plus the dummy-encoded categorical columns.
features_model = ['Age', 'SibSp', 'Parch', 'Fare_log1p']
numeric_part = train_data[features_model]
feature = pd.concat([numeric_part, train_data_cat_dummies], axis=1)

# Target: the Survived flag relabelled with readable class names.
survival_labels = {1: 'Survived', 0: 'Not_survived'}
target = train_data['Survived'].replace(survival_labels)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.30,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, classification_report, accuracy_score, roc_curve, auc

In [None]:
# Candidate classifiers compared in the cells below. Apart from KNN's
# neighbour count and the forest's seed, library defaults are used.
rf = RandomForestClassifier(random_state=42)
gnb = GaussianNB()
adaboost = AdaBoostClassifier()
svm = SVC()
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit on the training split and predict the held-out split.
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
# Score on the held-out split: the original scored on the training data,
# which reports how well the model memorised it rather than how it generalises.
acc_knn = round(knn.score(x_test, y_test) * 100, 2)

In [None]:
# Fit on the training split and predict the held-out split.
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
# Score on the held-out split: the original scored on the training data,
# which reports how well the model memorised it rather than how it generalises.
acc_svm = round(svm.score(x_test, y_test) * 100, 2)

In [None]:
# Fit on the training split and predict the held-out split.
adaboost.fit(x_train, y_train)
y_pred = adaboost.predict(x_test)
# Score on the held-out split: the original scored on the training data,
# which reports how well the model memorised it rather than how it generalises.
acc_ada = round(adaboost.score(x_test, y_test) * 100, 2)

In [None]:
# Fit on the training split and predict the held-out split.
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
# Score on the held-out split: the original scored on the training data,
# which reports how well the model memorised it rather than how it generalises.
acc_gnb = round(gnb.score(x_test, y_test) * 100, 2)


In [None]:
# Fit on the training split and predict the held-out split.
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
# Score on the held-out split: the original scored on the training data,
# where a random forest scores close to 100% regardless of how it generalises.
acc_rf = round(rf.score(x_test, y_test) * 100, 2)

In [None]:
# Collect the accuracy of every model into one table, best score first.
model_names = ['KNN', 'Support Vector Machine',
               'AdaBoost', 'Naive Bayes', 'Random Forest']
model_scores = [acc_knn, acc_svm, acc_ada, acc_gnb, acc_rf]
results = pd.DataFrame({'Model': model_names, 'Score': model_scores})

# The score becomes the index so the table reads "score -> model".
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head()

In [None]:
# Mean accuracy of the random forest over 10-fold cross-validation on the training split.
cv_scores = cross_val_score(rf, x_train, y_train, cv=10)
print(np.mean(cv_scores))

In [None]:
from yellowbrick.classifier import ClassificationReport

# Per-class precision / recall / F1 heat map for a random forest, fitted on
# the training split and scored on the held-out split.
# The forest is seeded like `rf` above so the report is reproducible between
# runs; the original used an unseeded forest.
mu_viz = ClassificationReport(RandomForestClassifier(random_state=42), cmap='GnBu')
mu_viz.fit(x_train, y_train)
mu_viz.score(x_test, y_test)
mu_viz.show()

In [None]:
# Save the fitted random forest to disk.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib (a scikit-learn dependency) is imported directly instead.
import joblib
joblib.dump(rf, 'Titanic_RandomForest.pkl')