# Women in Data Science
## Week 3 - Data Understanding

### Stephen Redmond
Enterprise Insight Studio Lead

In [0]:
# Connect to my Google Drive
from google.colab import drive

# Importing the module is not enough: the Drive has to be mounted before the
# '/content/drive/...' paths used in the following cells exist.
drive.mount('/content/drive')



In [0]:
# List the contents of the WIDS folder on the Drive (shell command via the "!" prefix)
!ls '/content/drive/My Drive/WIDS'

# Loading the DataFrame from a CSV file using pandas

In [0]:
# import the pandas library
# Traditionally, this is loaded and named as "pd"
import pandas as pd
# The numpy library is traditionally "np"
import numpy as np

# Source of the data: https://www.kaggle.com/c/titanic/data
# Read the Titanic training CSV stored on my Google Drive into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/WIDS/titanic/train.csv'
)


# Data Dictionary
| Variable | Definition                                 | Key                                            |
|----------|--------------------------------------------|------------------------------------------------|
| survival | Survival                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                               | 1 = 1st, 2 = 2nd, 3 = 3rd                      |
| sex      | Sex                                        |                                                |
| Age      | Age in years                               |                                                |
| sibsp    | # of siblings / spouses aboard the Titanic |                                                |
| parch    | # of parents / children aboard the Titanic |                                                |
| ticket   | Ticket number                              |                                                |
| fare     | Passenger fare                             |                                                |
| cabin    | Cabin number                               |                                                |
| embarked | Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

In [0]:
# Peek at the first five rows as a sanity check on the load
df.head(n=5)

In [0]:
# Summary statistics for the non-numeric columns
df.loc[:, ["Name", "Sex", "Ticket", "Cabin", "Embarked"]].describe()

In [0]:
# Some of these fields are less than useful:
# - Cabin has many missing values
# - Name, Ticket and PassengerId have too many unique values
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
df = df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], errors='ignore')

# The Embarked field has 2 missing - let's just assume it was Southampton.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df["Embarked"]: the in-place form operates on a temporary selection, which
# triggers a chained-assignment warning in pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
df["Embarked"] = df["Embarked"].fillna("S")

# Age has NaN values ... what should we do?
# Here they are replaced with the most frequent age (the first mode).
df["Age"] = df["Age"].fillna(df["Age"].mode()[0])

# More fun with scikit-learn
We previously looked at some useful initial uses of sklearn, like splitting a dataset into training and test.

Let's play with more models.

In [0]:
# Create a new feature called FamilySize (siblings/spouses + parents/children aboard)
df['FamilySize'] = df['SibSp'] + df['Parch']
# IsAlone is 1 for passengers with no family aboard, otherwise 0
df['IsAlone'] = (df['FamilySize'] == 0).astype(int)

# Splitting into train and test
from sklearn.model_selection import train_test_split

# Note: train_X / test_X hold complete rows (the features AND the Survived target).
# random_state pins the shuffle so that re-running the notebook gives the same
# split, and therefore comparable scores across the models below.
train_X, test_X = train_test_split(df, test_size=0.2, random_state=0)

# Train my models with a small # of features
from sklearn import tree

feature_cols = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]

# One-hot encode once, on the full frame, and then pick out the train/test rows.
# Encoding each split separately can yield different columns if a category
# happens to be missing from one of the splits.
encoded = pd.get_dummies(df[feature_cols], drop_first=True)

X = encoded.loc[train_X.index]
y = train_X["Survived"]

# random_state makes the fitted tree reproducible as well
dt = tree.DecisionTreeClassifier(random_state=0)
dt = dt.fit(X, y)

X_test = encoded.loc[test_X.index]
y_test = test_X["Survived"]
y_pred = dt.predict(X_test)

# Confusion matrix: true labels down the rows, predicted labels across the columns
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)


In [0]:
# Calculate the F1 score: F1 = 2 * (precision * recall) / (precision + recall)
from sklearn.metrics import f1_score

f1_score(y_true=y_test, y_pred=y_pred)

# Random Forest classifier
"Ensemble" method - builds many decision trees and combines the predictions of all of them (in effect, a vote across the trees) to produce the final answer

In [0]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is built with randomness; fixing random_state makes the
# fitted model, and the scores below, repeatable from run to run.
rf = RandomForestClassifier(random_state=0)
rf.fit(X=X, y=y)

y_pred = rf.predict(X_test)

# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [0]:
# F1 score for the most recent predictions
f1_score(y_true=y_test, y_pred=y_pred)

# Regression Model

In [0]:
# Rescale the features before fitting the regression models
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# One-hot encode the training features, then fit the scaler to them and
# transform them in a single step
X = scaler.fit_transform(
    pd.get_dummies(
        train_X[["Pclass", "Sex", "Age", "Fare", "FamilySize"]],
        drop_first=True,
    )
)
y = train_X["Survived"]

# The test features are only transformed, using the scaler fitted above
X_test_scaled = scaler.transform(X_test)

In [0]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of y on X
reg = LinearRegression()
reg = reg.fit(X, y)


In [0]:
# Show the model's score on (X, y) together with its fitted coefficients and intercept
print(
    "R^2: ", reg.score(X, y),
    "\nCoefficients: ", reg.coef_,
    "\nIntercept:", reg.intercept_,
    sep="",
)

In [0]:
# Predict for the scaled test features with the linear regression model
y_pred = reg.predict(X_test_scaled)

# Confusion Matrix
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

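## Not a great result!!!

Linear regression predicts a continuous number rather than a 0/1 class, so the table above ends up with one column per distinct predicted value instead of a 2x2 confusion matrix.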

# Logistic regression

In [0]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on (X, y), then predict for the scaled test features
lr = LogisticRegression(random_state=0)
lr = lr.fit(X, y)
y_pred = lr.predict(X_test_scaled)

# Confusion Matrix
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)


In [0]:
# F1 score for the most recent predictions
f1_score(y_true=y_test, y_pred=y_pred)

In [0]:
# Does scaling change the result?
from sklearn.preprocessing import StandardScaler

# Swap in a StandardScaler and refit it on the one-hot encoded training features
scaler = StandardScaler()
X = scaler.fit_transform(
    pd.get_dummies(
        train_X[["Pclass", "Sex", "Age", "Fare", "FamilySize"]],
        drop_first=True,
    )
)
X_test_scaled = scaler.transform(X_test)

# Refit the logistic regression on the re-scaled features and predict again
lr = LogisticRegression(random_state=0)
lr = lr.fit(X, y)
y_pred = lr.predict(X_test_scaled)

# Confusion Matrix
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [0]:
# F1 score for the most recent predictions
f1_score(y_true=y_test, y_pred=y_pred)

# Neural Network using Tensorflow


In [0]:
# Need to scale our data - using a Standard Scaler rather than MinMax (why?)
from sklearn.preprocessing import StandardScaler

# Start again from the full DataFrame: one-hot encoded features and the target
X = pd.get_dummies(
    df[["Pclass", "Sex", "Age", "Fare", "FamilySize"]],
    drop_first=True,
)
y = df['Survived']

# A single train_test_split call returns all four pieces:
# train/test features and train/test targets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# sklearn transformers can usually be used in two ways:
#   1. a separate fit() followed by transform()  (used here)
#   2. a combined fit_transform(), e.g. scaler.fit_transform(X_train)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# The test data is transformed with the same scaler - note, no "fit" here!
X_test_scaled = scaler.transform(X_test)



In [0]:
# Fit a Deep Learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the input width from the data rather than hard-coding 5, so the model
# keeps working if the feature list changes.
n_features = X_train_scaled.shape[1]

# Two small hidden ReLU layers feeding a single sigmoid output unit
model = Sequential()
model.add(Dense(5, input_shape=(n_features,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split out a validation set to use.
# random_state pins which rows end up in the validation set; the training
# itself is still not seeded here, so results can vary between runs.
x_trn, x_val, y_trn, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, shuffle=True, random_state=1
)

hist = model.fit(x_trn, y_trn, epochs=5, batch_size=5, shuffle=True, validation_data=(x_val, y_val))

# Final check on the held-out test set, which was not used for fitting or validation
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=2)

print('\nTest accuracy:', test_acc)
