In [None]:
#Import the standard libraries used for data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning so cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides any warnings raised by later cells
# (e.g. library deprecation or solver notices) - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load cleaned Titanic dataset

fname = "titanic_cleaned_1807.csv"
# The dataframe was saved with to_csv(), which stored its index as the first column;
# reading only columns 1..11 drops that saved index column.
df = pd.read_csv(fname, usecols=range(1, 12))

In [None]:
df.shape  # (number of rows, number of columns) of the loaded dataset

In [None]:
# Number of missing values in each column (isna is the canonical name of isnull)
df.isna().sum()

In [None]:
df.info()  # column names, dtypes and non-null counts

In [None]:
#Select only required columns and separate dependent & independent variables

cols_needed = ['Age','Family_Cnt','New_fare','Sex_male','Pclass_2','Pclass_3','Embarked_Q','Embarked_S','Cabin_ind_1']

X = df[cols_needed] # independent variables (predictors), in the order listed in cols_needed

Y = df['Survived'] # dependent variable (the label to predict)

In [None]:
# Preview the first five rows of the predictors
X.head(n=5)

In [None]:
# Preview the first five values of the target
Y.head(n=5)

In [None]:
# Pairwise correlations between the independent variables, shown to two decimals,
# to check for multi-collinearity
cormat = X.corr()
cormat.round(2)

In [None]:
# 'import' - imports an entire code library, 'from import' - imports a specific member or members of the library.

# Split the dataset into training and testing set
from sklearn.model_selection import train_test_split

# test_size=0.2 gives an 80-20 train/test split; random_state fixes the shuffle so the split is reproducible
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X, Y, test_size=0.2, random_state=420)

In [None]:
# Print the shape of each split, one per line: X train, X test, Y train, Y test

print(
    *(part.shape for part in (X_train_data, X_test_data, Y_train_data, Y_test_data)),
    sep='\n',
)

# To check if there exists a class imbalance, inspect the label counts of each split:
#Y_train_data.value_counts()
#Y_test_data.value_counts()


# Steps while using sklearn for modeling

In [None]:
# Import the Logistic Regression model from the sklearn (also known as Scikit-learn) library  

from sklearn.linear_model import LogisticRegression

In [None]:
# Create the (not yet fitted) logistic regression estimator

lr_model = LogisticRegression(max_iter=100)  # 100 is the library default, spelled out here

In [None]:
# Train the logistic regression on the training split

lr_model.fit(X=X_train_data, y=Y_train_data)

In [None]:
# Predicted class labels for the held-out test data

predictions = lr_model.predict(X=X_test_data)

In [None]:
# Display the predicted class labels for the test set.
# Logistic regression itself outputs a probability; predict() turns it into a
# binary class by applying a cutoff.

predictions

In [None]:
# Raw output of logistic regression: predicted probabilities between 0 and 1 for the test data

lr_model.predict_proba(X=X_test_data)

In [None]:
# Evaluate the model: accuracy on the training split and on the test split, rounded to 2 decimals

train_accuracy = lr_model.score(X_train_data, Y_train_data)
test_accuracy = lr_model.score(X_test_data, Y_test_data)

print('training accuracy:', round(train_accuracy, 2))
print('test accuracy:', round(test_accuracy, 2))

# Profiles

Independent Variables (Predictors):
Age 	Family_Cnt 	New_fare 	Sex_male 	Pclass_2 	Pclass_3 	Embarked_Q 	Embarked_S 	Cabin_ind_1

Profile 1: Jack, a “20 year old” “third class” “male” passenger, won a hand of poker and his ticket to the land of the free.
jack_data = [20,0,0,1,0,1,0,1,0]

Profile 2: a 36-year-old second-class male passenger with a family count of 2.
my_data = [36,2,0,1,1,0,0,1,0]



In [None]:
# define a single row of input data (values follow the order of cols_needed)
jack_data = [20,0,0,1,0,1,0,1,0]
# Wrap the row in a DataFrame carrying the predictor column names, so the model
# receives named features like the DataFrame it was fitted on instead of a bare list.
jack_row = pd.DataFrame([jack_data], columns=cols_needed)
# predict the class label
jack = lr_model.predict(jack_row)
# summarize the predicted class
print('Jack - Survived: %d' % jack[0])

In [None]:
# Profile 2 as a single row of input data (values follow the order of cols_needed)
my_data = [36,2,0,1,1,0,0,1,0]
# Pass the row as a DataFrame with the predictor column names so the model
# receives named features instead of a bare list.
my_row = pd.DataFrame([my_data], columns=cols_needed)
me = lr_model.predict(my_row)

print('Saikat - Survived: %d' % me[0])

In [None]:
# Same profile as my_data but with Sex_male flipped from 1 to 0
modified_data = [36,2,0,0,1,0,0,1,0]
# Pass the row as a DataFrame with the predictor column names so the model
# receives named features instead of a bare list.
modified_row = pd.DataFrame([modified_data], columns=cols_needed)
not_me = lr_model.predict(modified_row)

print('After tweaking data - Survived: %d' % not_me[0])

# Further analysis

In [None]:
# Accuracy Score
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the actual label
print(accuracy_score(y_true=Y_test_data, y_pred=predictions))

In [None]:
# Build the confusion matrix of actual vs. predicted test labels and print it

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=Y_test_data, y_pred=predictions)
print('Confusion matrix\n\n', cm)

In [None]:
# Label the confusion matrix (rows = actual, columns = predicted) and draw it as an annotated heatmap

predicted_labels = ['Predicted -ve:0', 'Predicted +ve:1']
actual_labels = ['Actual -ve:0', 'Actual +ve:1']
cm_matrix = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

sns.heatmap(data=cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# Accuracy = (TP+TN)/(Total) (should be the same as obtained from lr_model.score on the test set)
# Computed from the confusion matrix `cm` instead of typed-in counts, so the value
# stays correct if the split or the model changes. The diagonal holds TN and TP.

float(np.trace(cm) / cm.sum())

# Precision & Recall: When One Class is More Important

In many cases, it is more important to identify members of one class, e.g.,
Tax fraud
Credit default
Response to a promotional offer
Predicting delayed flights
However, such data typically has class imbalance.

In such cases, we will either increase or decrease the cut-off to better identify the important class for further attention

We may be willing to tolerate greater overall error in return for better identifying the important class.

Recall – prioritise this when we don’t want to miss any actual positive; wrongly flagging some negatives as positive (false positives) is acceptable.

In [None]:
# Precision = TP/(TP+FP)
# True positive/ Predicted positive

# Read the counts from the confusion matrix `cm` instead of typing them in by hand.
# For a binary problem (rows = actual, columns = predicted) ravel() yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print(tp / (tp + fp))

In [None]:
# Recall = TP/(TP+FN)
# True positive/ Actual positive

# Read the counts from the confusion matrix `cm` instead of typing them in by hand.
# For a binary problem (rows = actual, columns = predicted) ravel() yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print(tp / (tp + fn))

In [None]:
# dmba is a third-party package; if it is missing, install it first
# (e.g. `%pip install dmba` in a notebook cell, or from the Anaconda shell)

from dmba import classificationSummary

class_labels = ['not-survived', 'survived']
classificationSummary(Y_test_data, predictions, class_names=class_labels)

In [None]:
# Predicted class probabilities for every row of the test set
prediction_prob = lr_model.predict_proba(X=X_test_data)
prediction_prob

In [None]:
#Try different threshold and check precision and recall

# Second column of predict_proba = probability of survived (1); first column = probability of not survived (0)
prediction_prob = lr_model.predict_proba(X_test_data)[:, 1]

# Cutoff at 0.8: precision increases (fewer passengers are predicted as survived, so its denominator shrinks), however recall decreases
# NOTE: from here on prediction_prob holds hard 0/1 labels, not probabilities
prediction_prob = np.where(prediction_prob > 0.8, 1.0, 0.0)



In [None]:
# Summary for the 0.8 cutoff (prediction_prob holds the thresholded 0/1 labels at this point)
classificationSummary(
    Y_test_data,
    prediction_prob,
    class_names=['not-survived', 'survived'],
)

In [None]:
# Precision = TP/(TP+FP)
# True positive/ Predicted positive

# prediction_prob holds the 0/1 labels produced by the 0.8 cutoff; derive the counts
# from their confusion matrix instead of typing them in by hand.
# For a binary problem ravel() yields TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(Y_test_data, prediction_prob.astype(int)).ravel()
print(tp / (tp + fp))

In [None]:
# Recall = TP/(TP+FN)
# True positive/ Actual positive

# prediction_prob holds the 0/1 labels produced by the 0.8 cutoff; derive the counts
# from their confusion matrix instead of typing them in by hand.
# For a binary problem ravel() yields TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(Y_test_data, prediction_prob.astype(int)).ravel()
print(tp / (tp + fn))

In [None]:
from sklearn.metrics import precision_recall_curve

# The curve is traced by sweeping the cutoff over continuous scores. prediction_prob was
# overwritten with hard 0/1 labels by the threshold cell, so take fresh probabilities
# of the survived class (column 1) from the model.
survival_prob = lr_model.predict_proba(X_test_data)[:, 1]

# Generate precision recall curve values: precision, recall, thresholds
precision, recall, thresholds = precision_recall_curve(Y_test_data, survival_prob)

# Plot Precision Recall curve: recall on the x-axis, precision on the y-axis (matching the axis labels)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve')
plt.show()

# ROC Curve

The ROC curve plots the false-positive rate (FPR = 1 − specificity) on the x-axis against the true-positive rate (TPR = sensitivity) on the y-axis.

It shows whether the algorithm has differentiating ability: a model without it follows the 45-degree line. The area under the curve (AUC) summarises that ability and should be more than 0.5 for a good model.

If the threshold is increased to 0.8, will precision and recall increase?
When the threshold increases, the criterion for predicting a positive becomes stricter, so fewer cases are predicted positive. The denominator of precision (predicted positives) goes down, so precision goes up, while recall comes down.

If the threshold is decreased to 0.2, will precision and recall increase?
The reverse happens: more passengers are predicted as survived, so the denominator of precision grows and precision falls, while recall rises.
Recall increases because most passengers in class 1 will be predicted as survived, but at the same time more passengers who died will also be predicted as survived.

AUC = 0.79 means the model differentiates between the classes fairly well.



In [None]:
from sklearn.metrics import roc_curve, auc

# roc_curve sweeps a cutoff over continuous scores; hard 0/1 predictions collapse the
# curve to a single operating point, so score with the probability of the survived class.
survival_scores = lr_model.predict_proba(X_test_data)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test_data, survival_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("AUC - ",format(roc_auc))
# Plot ROC curve, with the 45-degree line of a model with no differentiating ability for reference
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Curve')
plt.show()

# F-score

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the default-cutoff predictions on the test set
print(classification_report(y_true=Y_test_data, y_pred=predictions))

# Explore: Alternative library for  Classification using Logistic Regression - Stats Model

# Explore: Alternative Algorithm for Classification using Decision Tree - DecisionTreeClassifier from sklearn.tree