In [2]:
# Load necessary libraries
import numpy as np
import pandas as pd
import yellowbrick
from yellowbrick.features import Rank2D
from yellowbrick.style import set_palette
from yellowbrick.features import ParallelCoordinates
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC

ModuleNotFoundError: No module named 'yellowbrick'

In [None]:
# Load the training CSV into `df` and preview its first 5 rows

train_csv = "train.csv"
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Display the dimensions of the loaded frame as a (rows, columns) tuple

df.shape

In [None]:
# Summary statistics: default describe() first, then the object-dtype columns
numeric_summary = df.describe()
object_summary = df.describe(include=['O'])
print("Describe Data")
print(numeric_summary)
print("Summarized Data")
print(object_summary)

In [None]:
# Create histograms


%matplotlib inline
plt.rcParams['figure.figsize']=(20,10)
fig, axes = plt.subplots(nrows=2, ncols=2)
num_features = ['Age','SibSp','Parch','Fare']
xaxes=num_features
yaxes = ['Counts','Counts','Counts','Counts']
axes = axes.ravel()
for idx, ax in enumerate(axes):
    ax.hist(df[num_features[idx]].dropna(),bins=40)
    ax.set_xlabel(xaxes[idx],fontsize=20)
    ax.set_ylabel(yaxes[idx],fontsize=20)
    ax.tick_params(axis='both',labelsize=15)
plt.show()


In [None]:
# Make some bar charts



%matplotlib inline
plt.rcParams['figure.figsize']=(20,10)
fig, axes = plt.subplots(nrows=2, ncols=2)
X_Survived = df.replace({'Survived': {1:'yes',0:'no'}}).groupby('Survived').size().reset_index(name='Counts')['Survived']
Y_Survived = df.replace({'Survived':{1:'yes',0:'no'}}).groupby('Survived').size().reset_index(name='Counts')['Counts']
axes[0,0].bar(X_Survived,Y_Survived)
axes[0,0].set_title('Survived',fontsize=25)
axes[0,0].set_ylabel('Counts', fontsize=20)
axes[0,0].tick_params(axis='both',labelsize=15)

X_Pclass = df.replace({'Pclass':{1:'1st', 2:'2nd',3:'3rd'}}).groupby('Pclass').size().reset_index(name='Counts')['Pclass']
Y_Pclass = df.replace({'Pclass':{1:'1st',2:'2nd',3:'3rd'}}).groupby('Pclass').size().reset_index(name='Counts')['Counts']
axes[0,1].bar(X_Pclass,Y_Pclass)
axes[0,1].set_title('Pclass',fontsize=25)
axes[0,1].set_ylabel('Counts',fontsize=20)
axes[0,1].tick_params(axis='both',labelsize=15)

X_Sex = df.groupby('Sex').size().reset_index(name='Counts')['Sex']
Y_Sex = df.groupby('Sex').size().reset_index(name='Counts')['Counts']
axes[1,0].bar(X_Sex,Y_Sex)
axes[1,0].set_title('Sex',fontsize=25)
axes[1,0].set_ylabel('Counts',fontsize=20)
axes[1,0].tick_params(axis='both',labelsize=15)

X_Embarked = df.groupby('Embarked').size().reset_index(name='Counts')['Embarked']
Y_Embarked = df.groupby('Embarked').size().reset_index(name='Counts')['Counts']
axes[1,1].bar(X_Embarked, Y_Embarked)
axes[1,1].set_title('Embarked', fontsize=25)
axes[1,1].set_ylabel('Counts', fontsize=20)
axes[1,1].tick_params(axis='both', labelsize=15)
plt.show()

In [None]:
# Calculate the Pearson Ranking to determine correlation
# between the numeric features listed in `num_features`.


%matplotlib inline
plt.rcParams['figure.figsize']=(15,7)
# Feature matrix as a plain NumPy array.
# NOTE(review): the median fill for Age only happens in a later cell, so X may
# still contain NaN here — confirm how Rank2D handles that.
X = df[num_features].to_numpy() # this or df[num_features].values work :-)
visualizer = Rank2D(features=num_features,algorithm='pearson')
# fit, then transform, then show the figure; keep this call order.
visualizer.fit(X)
visualizer.transform(X)
plt.show()

In [None]:
%matplotlib inline
plt.rcParams['figure.figsize']=(15,7)
plt.rcParams['font.size']=50
set_palette('sns_bright')
classes=['Not-survived','Survived']
num_features = ['Age','SibSp','Parch','Fare']
df_norm=df.copy()
for feature in num_features:
    df_norm[feature] = (df[feature] - df[feature].mean(skipna=True))/(df[feature].max(skipna=True)-df[feature].min(skipna=True))
df[feature].min(skipna=True)
X = df_norm[num_features].values
y = df.Survived.values


visualizer = ParallelCoordinates(classes=classes, features=num_features)
visualizer.fit(X,y)
visualizer.transform(X)
plt.show()

In [None]:
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(nrows = 2, ncols = 2)



Sex_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}})[df['Survived']==1]['Sex'].value_counts()
Sex_not_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}})[df['Survived']==0]['Sex'].value_counts()
Sex_not_survived = Sex_not_survived.reindex(index = Sex_survived.index)



p1 = axes[0,0].bar(Sex_survived.index, Sex_survived.values)
p2 = axes[0,0].bar(Sex_not_survived.index, Sex_not_survived.values, bottom=Sex_survived.values)
axes[0,0].set_title('Sex', fontsize=25)
axes[0,0].set_ylabel('Counts', fontsize=20)
axes[0,0].tick_params(axis='both', labelsize=15)
axes[0,0].legend((p1[0], p2[0]), ('Survived', 'Not-survived'), fontsize = 15)



Pclass_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}}).replace({'Pclass': {1: '1st', 2: '2nd', 3: '3rd'}})[df['Survived']==1]['Pclass'].value_counts()
Pclass_not_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}}).replace({'Pclass': {1: '1st', 2: '2nd', 3: '3rd'}})[df['Survived']==0]['Pclass'].value_counts()
Pclass_not_survived = Pclass_not_survived.reindex(index = Pclass_survived.index)
# make the bar plot
p3 = axes[0,1].bar(Pclass_survived.index, Pclass_survived.values)
p4 = axes[0,1].bar(Pclass_not_survived.index, Pclass_not_survived.values, bottom=Pclass_survived.values)
axes[0,1].set_title('Pclass', fontsize=25)
axes[0,1].set_ylabel('Counts', fontsize=20)
axes[0,1].tick_params(axis='both', labelsize=15)
axes[0,1].legend((p3[0], p4[0]), ('Survived', 'Not-survived'), fontsize = 15)


Embarked_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}})[df['Survived']==1]['Embarked'].value_counts()
Embarked_not_survived = df.replace({'Survived': {1: 'Survived', 0: 'Not-survived'}})[df['Survived']==0]['Embarked'].value_counts()
Embarked_not_survived = Embarked_not_survived.reindex(index = Embarked_survived.index)


p5 = axes[1,0].bar(Embarked_survived.index, Embarked_survived.values)
p6 = axes[1,0].bar(Embarked_not_survived.index, Embarked_not_survived.values, bottom=Embarked_survived.values)
axes[1,0].set_title('Embarked', fontsize=25)
axes[1,0].set_ylabel('Counts', fontsize=20)
axes[1,0].tick_params(axis='both', labelsize=15)
axes[1,0].legend((p5[0], p6[0]), ('Survived', 'Not-survived'), fontsize = 15)
plt.show()

In [None]:
# Create a function to fill in the missing values with the median value

def fill_na_median(data,inplace=True):
    """Replace missing entries of `data` with the median of `data`.

    `inplace` is forwarded unchanged to `data.fillna`, and that call's
    result is returned as-is.
    """
    median_value = data.median()
    return data.fillna(median_value, inplace=inplace)

In [None]:
# Assign the filled column back: an in-place fillna on the temporary
# `df['Age']` selection is a chained update that does not reliably reach `df`.
df['Age'] = fill_na_median(df['Age'], inplace=False)

In [None]:
# Summary statistics of Age, printed after the fill step in the cell above
print(df['Age'].describe())

In [None]:
# Create a function to fill in the missing values with the most represented value

def fill_na_most(data,inplace=True):
    """Fill missing entries of `data` with its most frequent non-null value.

    Parameters
    ----------
    data : pandas Series to fill.
    inplace : forwarded to `data.fillna`; when False a new, filled Series is
        returned and `data` is left untouched.

    Returns
    -------
    Whatever `data.fillna` returns for the given `inplace` flag. If `data`
    has no non-null values there is nothing to fill with: the data is left
    unchanged (None for `inplace=True`, an unmodified copy otherwise).

    When several values tie for most frequent, the first one returned by
    `Series.mode` is used.
    """
    modes = data.mode(dropna=True)
    if modes.empty:
        # Every value is missing, so no "most represented" value exists.
        return None if inplace else data.copy()
    return data.fillna(modes.iloc[0], inplace=inplace)

In [None]:
# Assign the filled column back: an in-place fillna on the temporary
# `df['Embarked']` selection is a chained update that does not reliably reach `df`.
df['Embarked'] = fill_na_most(df['Embarked'], inplace=False)

In [None]:
# Summary of Embarked, printed after the fill step in the cell above
print(df['Embarked'].describe())

In [None]:
# Create a function to return the log of the data

def log_transformation(data):
    """Return `data` with `np.log1p` (log(1 + x)) applied through `data.apply`."""
    transformed = data.apply(np.log1p)
    return transformed

In [None]:
# apply this function to the Fare feature as it is skewed
# (the result is stored as a new `Fare_log1p` column; `Fare` itself is kept)

df['Fare_log1p'] = log_transformation(df['Fare'])

In [None]:
# Check to see what that new feature looks like
# (describe() covers the whole frame, so Fare_log1p appears beside the other columns)

print(df.describe())

In [None]:
# Compare the raw Fare distribution with its log1p-transformed counterpart


plt.rcParams['figure.figsize']=(10,5)
fig, axes = plt.subplots(nrows = 1, ncols = 2)

# left panel: original Fare, right panel: Fare_log1p (same binning and styling)
for ax, column in zip(axes, ('Fare', 'Fare_log1p')):
    ax.hist(df[column], bins=40)
    ax.set_xlabel(column, fontsize=20)
    ax.set_ylabel('Counts', fontsize=20)
    ax.tick_params(axis='both', labelsize=15)
plt.show()

In [None]:
# Convert categorical features into numbers (one dummy column per category)

cat_features = ['Pclass', 'Sex','Embarked']
# Pclass is relabelled with strings first so get_dummies encodes it like the
# other categorical columns
pclass_labels = {1:'1st', 2:'2nd',3:'3rd'}
df_cat = df[cat_features].replace({'Pclass': pclass_labels})

df_cat_dummies = pd.get_dummies(df_cat)

In [None]:
# Preview the first 8 rows of the dummy-encoded categorical features
print(df_cat_dummies.head(8))

In [None]:
# numerical features used by the model
features_model = ['Age', 'SibSp', 'Parch', 'Fare_log1p']

# concatenate the dummy variables with the numerical features above
# (axis=1 joins column-wise; `df` and `df_cat_dummies` are the frames built in
# the earlier cells)
data_model_X = pd.concat([df[features_model], df_cat_dummies], axis=1)

In [None]:
# create the target variable: Survived relabelled from 1/0 to readable class names
data_model_y = df['Survived'].replace({1: 'Survived', 0: 'Not_survived'})

In [None]:
# Hold out 30% of the rows as a validation set; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    data_model_X,
    data_model_y,
    random_state=11,
    test_size=0.3,
)

In [None]:
# Report the number of rows in each split
n_train, n_val = len(X_train), len(X_val)
print("No. of samples in training set: ", n_train)
print("No. of samples in validation set:", n_val)

In [None]:
# Class balance (survived vs. not survived) in the training and validation targets

for heading, labels in (
    ('No. of survived and not-survived in the training set:', y_train),
    ('No. of survived and not-survived in the validation set:', y_val),
):
    print('\n')
    print(heading)
    print(labels.value_counts())

In [None]:
# Instantiate the model: a LogisticRegression with its default settings.
# The same `model` object is passed to each visualizer in the cells below.

model = LogisticRegression()

In [None]:
# Create a confusion matrix for `model` on the validation split

# Class labels spelled as in the target built earlier ('Not_survived' / 'Survived')
classes = ['Not_survived','Survived']
# percent=False — presumably shows raw counts rather than percentages; TODO confirm
cm = ConfusionMatrix(model, classes=classes, percent=False)
# fit on the training split, then score on the validation split
cm.fit(X_train, y_train)
cm.score(X_val, y_val)
# enlarge the text labels already present on the axes after scoring
for label in cm.ax.texts:
    label.set_size(20)
# NOTE(review): newer Yellowbrick releases rename poof() to show() — confirm
# against the installed version.
cm.poof()

In [None]:
# Visualize the Precision, Recall, & F1 score

# Figure size and font size are set globally through rcParams, so they also
# affect any figure drawn afterwards.
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 20
# Reuses `model` and the `classes` list defined in the confusion-matrix cell
visualizer = ClassificationReport(model, classes=classes)
# fit on the training split, then score on the validation split
visualizer.fit(X_train, y_train)
visualizer.score(X_val, y_val)
# NOTE(review): newer Yellowbrick releases rename poof() to show() — confirm
# against the installed version.
g = visualizer.poof()

In [None]:
# create the ROC AUC visual for `model` on the validation split
# (no `classes` list is passed here, unlike the two visualizers above)

visualizer = ROCAUC(model)

# fit on the training split, then score on the validation split
visualizer.fit(X_train, y_train)
visualizer.score(X_val, y_val)
# NOTE(review): newer Yellowbrick releases rename poof() to show() — confirm
# against the installed version.
g = visualizer.poof()