# Women in Data Science
## Week 4 - Model Evaluation

### Stephen Redmond
Enterprise Insight Studio Lead

In [0]:
# Connect to my Google Drive.
# Importing the helper is not enough on its own: the drive has to be mounted
# before the '/content/drive/My Drive/...' paths used below can be read.
from google.colab import drive

drive.mount('/content/drive')



In [0]:
# List the WIDS folder on the mounted Drive to check the files are reachable
!ls '/content/drive/My Drive/WIDS'

In [0]:
# We'll do some plotting later
%matplotlib inline
import matplotlib.pyplot as plt


# Loading the DataFrame from a CSV file using pandas

In [0]:
# import the pandas library
# Traditionally, this is loaded and named as "pd"
import pandas as pd
# The numpy library is traditionally "np"
import numpy as np

# Source: the Kaggle Titanic competition, https://www.kaggle.com/c/titanic/data
# The training CSV is read from the WIDS folder on Google Drive.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/WIDS/titanic/train.csv'
)


# Data Dictionary
| Variable | Definition                                 | Key                                            |
|----------|--------------------------------------------|------------------------------------------------|
| survival | Survival                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                               | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
| sex      | Sex                                        |                                                |
| Age      | Age in years                               |                                                |
| sibsp    | # of siblings / spouses aboard the Titanic |                                                |
| parch    | # of parents / children aboard the Titanic |                                                |
| ticket   | Ticket number                              |                                                |
| fare     | Passenger fare                             |                                                |
| cabin    | Cabin number                               |                                                |
| embarked | Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

In [0]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

In [0]:
# Summarise the non-numeric (text) columns separately
text_columns = ["Name", "Sex", "Ticket", "Cabin", "Embarked"]
df[text_columns].describe()

In [0]:
# Some of these fields are less than useful:
# - Cabin has many missing values
# - Name, Ticket and PassengerId have too many unique values
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], errors='ignore')

# The Embarked field has 2 missing - let's just assume it was Southampton.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on df["Embarked"] acts on an intermediate object and
# is not guaranteed to update df (pandas warns about it, and it has no
# effect once Copy-on-Write is enabled).
df['Embarked'] = df['Embarked'].fillna('S')

# Age has NaN values ... what should we do?
# Here they are replaced with the most frequent age (the mode).
df['Age'] = df['Age'].fillna(df['Age'].mode()[0])

# Model evaluation
We have built our models. How can we evaluate their performance?

Let's start with classification

In [0]:
# Build the model - we've done this a lot already

# Create a new feature called FamilySize (siblings/spouses + parents/children)
df['FamilySize'] = df['SibSp'] + df['Parch']
# IsAlone is 1 for passengers travelling with no family aboard, otherwise 0
df['IsAlone'] = (df['FamilySize'] == 0).astype(int)

# Splitting into train and test
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so every metric further down is repeatable.
# Note: train_X / test_X hold complete rows (features AND the target column).
train_X, test_X = train_test_split(df, test_size=0.2, random_state=42)

# Train my models with a small # of features
from sklearn import tree
dt = tree.DecisionTreeClassifier(random_state=42)

# Dummy-code train and test rows together so both frames are guaranteed to
# have exactly the same columns, then cut them apart again by position.
feature_cols = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
encoded = pd.get_dummies(pd.concat([train_X, test_X])[feature_cols], drop_first=True)
n_train = len(train_X)

X = encoded.iloc[:n_train]
y = train_X["Survived"]
dt = dt.fit(X, y)

X_test = encoded.iloc[n_train:]
y_test = test_X["Survived"]

In [0]:
# Score the held-out passengers, then tabulate actual vs. predicted classes
# (margins=True adds the row and column totals)
y_pred = dt.predict(X_test)
confusion_table = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)
confusion_table


In [0]:
#!pip install scikit-plot

In [0]:
# The same confusion matrix, drawn as a chart by scikit-plot
import scikitplot as skplt

skplt.metrics.plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    title="Confusion Matrix",
)
plt.show()

In [0]:
from sklearn.metrics import classification_report

# Text summary of the classifier's test-set predictions against the truth
report_text = classification_report(y_test, y_pred)
print(report_text)

In [0]:
# Calculate the headline classification metrics.
# F1 score: F1 = 2 * (precision * recall) / (precision + recall)
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, auc, accuracy_score

acc = accuracy_score(y_test, y_pred)
resTable = [['Accuracy  :', acc]]
prec = precision_score(y_test, y_pred)
resTable.append(['Precision :', prec])
recall = recall_score(y_test, y_pred)
resTable.append(['Recall    :', recall])
f1 = f1_score(y_test, y_pred)
resTable.append(['F1 Score  :', f1])

# ROC / AUC
# roc_curve sweeps a decision threshold over a score, so give it the
# predicted probability of the positive class (second predict_proba column)
# instead of the hard 0/1 labels, which collapse the curve to a single
# operating point.
y_score = dt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
AreaUnderCurve = auc(fpr, tpr)
resTable.append(['FPR       :', fpr])
resTable.append(['TPR       :', tpr])
resTable.append(['Threshold :', thresholds])
resTable.append(['AUC       :', AreaUnderCurve])

# One line per metric. A plain loop avoids printing the list of None values
# that wrapping a print-comprehension in print() produced.
for label, value in resTable:
    print(label + '\t' + str(value))


In [0]:
# Plot the ROC Curve

fig, ax = plt.subplots()

# The model's curve, with its AUC shown in the legend
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label='ROC curve (area = %0.2f)' % AreaUnderCurve)
# Dashed diagonal for reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

# Regression Model

In [0]:
# Need to scale our data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Dummy-code train and test rows together so both end up with identical
# columns (encoding them separately can silently produce different columns),
# then separate them again by position.
reg_feature_cols = ["Survived", "Pclass", "Sex", "Age", "FamilySize"]
reg_encoded = pd.get_dummies(pd.concat([train_X, test_X])[reg_feature_cols], drop_first=True)
n_train_rows = len(train_X)

# Fit the scaler on the training rows only, then reuse it for the test rows
X = scaler.fit_transform(reg_encoded.iloc[:n_train_rows])
y = train_X["Fare"]
X_test = reg_encoded.iloc[n_train_rows:]
y_test = test_X["Fare"]
X_test_scaled = scaler.transform(X_test)

In [0]:
from sklearn.linear_model import LinearRegression

# Fit a linear model of y on the prepared training features X
reg = LinearRegression()
reg.fit(X, y)


In [0]:
# Report the fit on the training data: R^2, coefficients and intercept
summary_lines = [
    "R^2: " + str(reg.score(X, y)),
    "Coefficients: " + str(reg.coef_),
    "Intercept:" + str(reg.intercept_),
]
print("\n".join(summary_lines))

In [0]:
# Run the fitted regression on the scaled test features
y_pred = reg.predict(X=X_test_scaled)


In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# mean_squared_error returns the MSE (in squared units of the target); take
# the square root so that `rmse` really is the root-mean-squared error and
# is on the same scale as the MAE printed next to it.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(rmse, mae)

# Hyper parameter tuning
Manually or automatically changing the parameters used to train a model until we get the best results.


In [0]:
# Import the RF Classifier module
from sklearn.ensemble import RandomForestClassifier
# Seeded so that repeated runs of the search give the same scores
rf = RandomForestClassifier(random_state=42)

# Setup the data with dummy coding.
# Train and test rows are encoded together so both frames share exactly the
# same columns, then separated again by position.
clf_feature_cols = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
clf_encoded = pd.get_dummies(pd.concat([train_X, test_X])[clf_feature_cols], drop_first=True)
n_train_rows = len(train_X)

X = clf_encoded.iloc[:n_train_rows]
y = train_X["Survived"]
X_test = clf_encoded.iloc[n_train_rows:]
y_test = test_X["Survived"]

# import the GridSearch module
from sklearn.model_selection import GridSearchCV

# Setup some parameters to run through
param_grid = {'max_depth': np.arange(1, 30, 3),
              'criterion': ['gini', 'entropy']}

# Find the best combo of parameters.
# The search is given the seeded forest created above; GridSearchCV fits
# clones of it, one per parameter combination and fold.
my_tuned_tree = GridSearchCV(rf, param_grid) #, scoring='f1_weighted')
my_tuned_tree.fit(X, y)
print(my_tuned_tree.best_score_)
# Show which parameter combination achieved that score
print(my_tuned_tree.best_params_)


In [0]:
# Score the test set with the model selected by the grid search, then show
# its confusion matrix and classification report
y_pred = my_tuned_tree.predict(X_test)

skplt.metrics.plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    title="Confusion Matrix",
)
plt.show()

tuned_report = classification_report(y_test, y_pred)
print(tuned_report)

In [0]:
# What are the scoring options for the Grid Search?
import sklearn.metrics as skm

# The SCORERS dictionary was removed from recent scikit-learn releases;
# get_scorer_names() is its replacement. Fall back to the dictionary only on
# older versions that do not provide the function.
try:
    scorer_names = skm.get_scorer_names()
except AttributeError:
    scorer_names = sorted(skm.SCORERS.keys())
scorer_names