In [1]:
# Description: This notebook predicts whether a passenger would have survived the sinking of the Titanic


In [2]:
#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the 'titanic' dataset through seaborn's load_dataset helper
titanic = sns.load_dataset('titanic')

# Show the first 10 rows of the data (the cell's last expression is displayed)
titanic.head(10)

In [None]:
# Count the number of rows and columns in the dataset
titanic.shape


In [None]:
# Get summary statistics of the dataset
titanic.describe()


In [None]:
# Count how many passengers fall into each value of the 'survived' column
titanic['survived'].value_counts()

In [None]:
# Bar chart of the number of passengers per 'survived' value, drawn on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x='survived', ax=ax)
plt.show()


In [None]:
# Survival counts split by each of the columns 'who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked'
cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']
n_rows, n_cols = 2, 3

# One panel per column; every panel is about 3.2 x 3.2 inches
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3.2, n_rows * 3.2))

# Flattening the axes grid row by row pairs the k-th panel with cols[k]
for ax, col in zip(axs.ravel(), cols):
    sns.countplot(x=col, hue='survived', data=titanic, ax=ax)
    ax.set_title(col)
    ax.legend(title='survived', loc='upper right')

plt.tight_layout()
plt.show()


In [None]:
# Survival rate by sex: the mean of the 'survived' column within each 'sex' group
titanic.groupby('sex')[['survived']].mean()

In [None]:
# Survival rate for every sex/class combination (pivot_table aggregates with the mean by default)
titanic.pivot_table(values='survived', index='sex', columns='class')

In [None]:
# Plot the sex-by-class survival rates: one line per class, sex along the x-axis
titanic.pivot_table(values='survived', index='sex', columns='class').plot()

In [None]:
# Plot the survival rate of each class ('survived' on the y-axis, one bar per 'class')
sns.barplot(x = 'class', y = 'survived', data = titanic)

In [None]:
# Survival rate by sex, age group and class; ages are binned at the edges 0, 18 and 80
age = pd.cut(titanic['age'], bins=[0, 18, 80])
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

In [None]:
# Scatter every passenger's fare (x) against their travel class (y)
ax = plt.gca()
ax.scatter(titanic['fare'], titanic['class'], color='purple', label='Passenger Paid')
ax.set_xlabel('Price / fare')
ax.set_ylabel('Class')
ax.set_title('Price Of Each Class')
ax.legend()
plt.show()

In [None]:
# Count the missing (NaN) values in each column
titanic.isna().sum()

In [None]:
# For every column, print how often each distinct value occurs, followed by a blank line
for col_name, column in titanic.items():
    print(column.value_counts(), end='\n\n')

In [21]:
# Drop the columns that are not carried forward, then drop the rows
# that have a missing value in 'embarked' or 'age'
titanic = (
    titanic
    .drop(columns=['deck', 'embark_town', 'alive', 'class', 'who', 'alone', 'adult_male'])
    .dropna(subset=['embarked', 'age'])
)

In [None]:
# Count the new number of rows and columns now that columns and rows have been dropped
titanic.shape

In [None]:
# Look at the data type of each remaining column
titanic.dtypes

In [None]:
# Print the distinct values of the 'sex' and 'embarked' columns
for col in ('sex', 'embarked'):
    print(titanic[col].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# Encode the 'sex' and 'embarked' columns as integer codes.
# The columns are addressed by name rather than by position (2 and 7), so the step does not
# silently encode the wrong column if the column layout changes. assign() swaps in whole new
# integer columns, so the resulting dtype is integer rather than object.
# LabelEncoder numbers the sorted distinct values from 0 (e.g. 'female' -> 0, 'male' -> 1).
titanic = titanic.assign(
    sex=labelencoder.fit_transform(titanic['sex']),
    embarked=labelencoder.fit_transform(titanic['embarked']),
)

In [None]:
# Print the distinct values of 'sex' and 'embarked' again, now that they have been encoded
for col in ('sex', 'embarked'):
    print(titanic[col].unique())

In [None]:
# Look at the data types again after the encoding step
titanic.dtypes

In [29]:
# Separate the independent variables X (columns 1 through 7) from the target Y (column 0)
X = titanic.iloc[:, 1:8].to_numpy()
Y = titanic.iloc[:, 0].to_numpy()

In [30]:
# Hold out 20% of the rows for testing and train on the other 80%;
# the fixed random_state keeps the split repeatable between runs
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [31]:
# Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the per-feature mean and standard deviation from the training split only
X_train = sc.fit_transform(X_train)

# Apply the training statistics to the test split. Re-fitting the scaler here would leak
# test-set information and put the two splits on different scales.
X_test = sc.transform(X_test)

In [38]:
#Create a function with many machine learning models
def models(X_train, Y_train):
  """Fit seven scikit-learn classifiers on the same training data.

  Each classifier is fitted on (X_train, Y_train), after which its accuracy on
  that same training data is printed, one line per classifier.

  Args:
    X_train: training features, passed unchanged to every estimator's fit().
    Y_train: training labels, passed unchanged to every estimator's fit().

  Returns:
    A 7-tuple of fitted estimators in this order: logistic regression,
    k-nearest neighbours, linear-kernel SVC, RBF-kernel SVC, Gaussian naive
    Bayes, decision tree, random forest.
  """
  from sklearn.linear_model import LogisticRegression
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.svm import SVC
  from sklearn.naive_bayes import GaussianNB
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.ensemble import RandomForestClassifier

  # (report line prefix, estimator) pairs, listed in the order they are returned
  roster = [
    ('[0]Logistic Regression Training Accuracy: ',
     LogisticRegression(random_state = 0)),
    ('[1]K Neighbors Training Accuracy: ',
     KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('[2]SVC Linear Training Accuracy: ',
     SVC(kernel = 'linear', random_state = 0)),
    ('[3]SVC RBF  Training Accuracy: ',
     SVC(kernel = 'rbf', random_state = 0)),
    ('[4]Gaussian NB Training Accuracy: ',
     GaussianNB()),
    ('[5]Desicion Tree Training Accuracy: ',
     DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('[6]Random Forest Training Accuracy: ',
     RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)),
  ]

  # Train everything first ...
  for _prefix, estimator in roster:
    estimator.fit(X_train, Y_train)

  # ... then report the training accuracy of each model
  for prefix, estimator in roster:
    print(prefix, estimator.score(X_train, Y_train))

  return tuple(estimator for _prefix, estimator in roster)



In [None]:
# Train all the models on the training split; `model` is the tuple of fitted
# estimators in the order returned by models()
model = models(X_train, Y_train)

In [None]:
# Show the confusion matrix and accuracy for all the models on the test data
from sklearn.metrics import confusion_matrix

for i, clf in enumerate(model):
  # Predict once and build the matrix once; the printout and the accuracy both reuse it
  cm = confusion_matrix(Y_test, clf.predict(X_test))

  # For two classes the flattened 2x2 matrix reads TN, FP, FN, TP
  TN, FP, FN, TP = cm.ravel()

  # Accuracy = correctly classified passengers / all test passengers
  test_score = (TP + TN) / (TP + TN + FN + FP)

  print(cm)
  print('Model[{}] Testing Accuracy = "{}"'.format(i, test_score))
  print()



In [None]:
# Rank the input features by the importance scores of the random forest (model[6])
forest = model[6]
importances = (
    pd.DataFrame({
        'feature': titanic.columns[1:8],
        'importance': forest.feature_importances_.round(3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances

In [None]:
# Bar chart of the feature importances, one bar per feature
importances.plot(kind='bar')

In [None]:
# Print the random forest's predictions for the test split, a blank line,
# and then the actual values for comparison
pred = model[6].predict(X_test)
print(pred, Y_test, sep='\n\n')

In [None]:
# Individual survival prediction: describe one passenger using the feature order below
# pclass    int64    -> 1 for first class, 2 for second class, 3 for third class
# sex       int64    -> 1 for male, 0 for female
# age       float64  -> any number within the range of the dataset
# sibsp     int64    -> 0 to 8
# parch     int64    -> 0 to 6
# fare      float64  -> any number
# embarked  int64    -> 0 for C, 1 for Q, 2 for S

# my own prediction
individual_survival = [[1, 0, 22, 0, 0, 150, 0 ]]

# The models were trained on standardized features, so the raw values must pass through the
# fitted scaler `sc` first; unscaled inputs would be judged against thresholds learned on
# scaled data and give a meaningless prediction.
individual_scaled = sc.transform(individual_survival)

# print prediction
pred = model[6].predict(individual_scaled)
print(pred)

# pred is a one-element array; test its single entry explicitly
if pred[0] == 0:
  print("lol aap to mare gaye iss khel mai!!")
else:
  print('ha ha chal koi na bach gaya na ab paise nikal!')