In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import operator

# Correlation matrixes
import scipy.stats as ss
import itertools
import math
from collections import Counter

# SelectKBest
from functools import partial
from sklearn.feature_selection import SelectKBest, chi2

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import graphviz 


In [None]:
# Read the hurricane survival dataset from the notebook's input directory
raw_data = pd.read_csv(
    filepath_or_buffer='../input/wmo-hurricane-survival-dataset/World_MO_Hurricane_Survival.csv'
)

# Preview the first five rows of the loaded frame
raw_data.head(n=5)

# Data Preparation

**To start, we need to clean the NaNs out of the data.**

In [None]:
# Count the missing values per feature before deciding how to treat them
print("NaNs in the features:")
print(raw_data.isnull().sum())

print("\nNumber of rows in data the before cleaning :")
print(raw_data.shape[0])

# Author's rationale: the total number of rows is much bigger than the number
# of missing values, so removing the rows that contain NaNs should not
# dramatically influence the result. Every row with at least one NaN is dropped.
raw_data = raw_data.dropna(axis=0, how='any')
print("\nNumber of rows in data the after cleaning :")
print(raw_data.shape[0])

**Next, check the types of the features.**

In [None]:
# Show the dtype pandas assigned to each column of the cleaned frame
raw_data.dtypes

**As we can see, almost all of the columns were read with the "object" dtype, so we convert them to proper types.**

In [None]:
############ Converting the objects to proper data types ############

# The ID is not needed for prediction
raw_data = raw_data.drop(columns=['ID'])

# DOB to Age: the model should see the age of the person, not the date of birth.
# The age is approximated as (current year - birth year), so it depends on the
# day the notebook is run and ignores whether the birthday has already passed.
raw_data['DOB'] = pd.to_datetime('today').year - pd.to_datetime(raw_data['DOB'], format='%m/%d/%Y').dt.year

# Create a column for the age category.
# The labels describe right-closed decades ('21-30' ends at 30, '31-40' starts
# at 31), so the bins have to be right-closed too; with right=False an age of 30
# would be labelled '31-40'. include_lowest=True keeps an age of exactly 20 in
# the first group instead of turning it into NaN.
bins = [20, 30, 40, 50, 60, 70]
labels = ['21-30', '31-40', '41-50', '51-60', '61-70']
raw_data['AgeGroup'] = pd.cut(raw_data['DOB'], bins=bins, labels=labels,
                              right=True, include_lowest=True)

# I used this guide: https://pbpython.com/categorical-encoding.html
# Object columns to category: the string columns are converted to the pandas
# 'category' dtype so they can be inspected and encoded later
for col in ['M_STATUS', 'SALARY', 'EDU_DATA', 'EMP_DATA', 'REL_ORIEN', 'FAV_TV', 'PREF_CAR', 'GENDER',
            'FAV_CUIS', 'FAV_MUSIC', 'ENDU_LEVEL', 'FAV_SPORT', 'FAV_COLR', 'NEWS_SOURCE', 'DIST_FRM_COAST',
            'MNTLY_TRAVEL', 'GEN_MOVIES', 'FAV_SUBJ', 'ALCOHOL', 'FAV_SUPERHERO']:
    raw_data[col] = raw_data[col].astype('category')

# 'Dist_Coast' has no description in the original dataset and its values are not
# self-explanatory, while 'DIST_FRM_COAST' already carries the distance from the
# coast, so 'Dist_Coast' is removed.
raw_data = raw_data.drop(columns=['Dist_Coast'])

# Class label to boolean: survived = True, not survived = False
# NOTE(review): assumes the raw labels are exactly 'x' and 'y'; Series.map turns
# any other value into NaN - confirm against the CSV.
mapping_bool_dict = {'x': True, 'y': False}
raw_data['Class'] = raw_data['Class'].map(mapping_bool_dict)

# Rename the features to more suitable names (returns a new frame instead of
# mutating in place)
raw_data = raw_data.rename(columns={
    'DOB': 'Age', 'M_STATUS': 'Marital_Status', 'SALARY': 'Salary', 'EDU_DATA': 'Education',
    'EMP_DATA': 'Employment', 'REL_ORIEN': 'Religion', 'FAV_TV': 'Fav_TV_Show',
    'PREF_CAR': 'Fav_Car', 'GENDER': 'Gender', 'FAV_CUIS': 'Fav_Cuisine', 'FAV_MUSIC': 'Fav_Music',
    'ENDU_LEVEL': 'Endurance', 'FAV_SPORT': 'Fav_Sport', 'FAV_COLR': 'Fav_Color',
    'NEWS_SOURCE': 'Fav_News_Source', 'DIST_FRM_COAST': 'Distance_From_Coast',
    'MNTLY_TRAVEL': 'Monthly_Travel_Distance', 'GEN_MOVIES': 'Fav_Movies_Genre',
    'FAV_SUBJ': 'Fav_Subject', 'ALCOHOL': 'Fav_Alcohol', 'FAV_SUPERHERO': 'Fav_Superhero',
    'Class': 'Is_Survived',
})


raw_data.dtypes

In [None]:
# Check if the class is balanced: describe() is applied separately to the rows
# of each 'Is_Survived' value, so the two groups can be compared side by side
raw_data.groupby('Is_Survived').describe()

**As we can see, the number of positive samples is approximately the same as the number of negative samples, so the data is balanced.**

In [None]:
#### Label Encoding ####
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so the integer codes written below are also visible through `raw_data`.
label_encoding_raw_data = raw_data
for col in label_encoding_raw_data.select_dtypes(include='category').columns:
    # Replace the categorical column by its integer category codes
    label_encoding_raw_data[col] = label_encoding_raw_data[col].cat.codes
label_encoding_raw_data

In [None]:
###### Feature Statistics ######
# Survival counts per category value for the first nine features, drawn on a
# 3x3 grid. One loop replaces nine copy-pasted subplot/countplot pairs.
plt.figure(figsize=(25,10))

grid_features = ['AgeGroup', 'Marital_Status', 'Salary',
                 'Education', 'Employment', 'Religion',
                 'Fav_TV_Show', 'Fav_Car', 'Gender']

for position, feature in enumerate(grid_features, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=feature, hue='Is_Survived', data=label_encoding_raw_data)

# Render the figure without echoing the repr of the last Axes object
plt.show()

In [None]:
###### Feature Statistics ######
# Survival counts per category value for the next nine features, drawn on a
# 3x3 grid. One loop replaces nine copy-pasted subplot/countplot pairs.
plt.figure(figsize=(25,10))

grid_features = ['Fav_Cuisine', 'Fav_Music', 'Endurance',
                 'Fav_Sport', 'Fav_Color', 'Fav_News_Source',
                 'Distance_From_Coast', 'Monthly_Travel_Distance', 'Fav_Movies_Genre']

for position, feature in enumerate(grid_features, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=feature, hue='Is_Survived', data=label_encoding_raw_data)

# Render the figure without echoing the repr of the last Axes object
plt.show()

In [None]:
###### Feature Statistics ######
# Survival counts per category value for the last three features, drawn side by
# side. One loop replaces three copy-pasted subplot/countplot pairs.
plt.figure(figsize=(25,10))

grid_features = ['Fav_Subject', 'Fav_Alcohol', 'Fav_Superhero']

for position, feature in enumerate(grid_features, start=1):
    plt.subplot(1, 3, position)
    sns.countplot(x=feature, hue='Is_Survived', data=label_encoding_raw_data)

# Render the figure without echoing the repr of the last Axes object
plt.show()


In [None]:
# Sources:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://stackoverflow.com/questions/51859894/how-to-plot-a-cramer-s-v-heatmap-for-categorical-features
# https://github.com/shakedzy/dython/blob/master/dython/nominal.py

############ Getting Correlation Matrices: Cramér’s V and Theil’s U ############
# A correlation matrix built on dummy variables would be very large,
# so the matrices are computed directly on the categorical columns instead

# Collects, per feature name, the list of scores the feature obtained against
# the target; later cells append to and sum these lists
all_scores = {}

# Association between two categorical variables with Cramér’s V
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series)
        The two variables to cross-tabulate; they are passed to ``pd.crosstab``.

    Returns
    -------
    float
        The symmetric association strength. ``0.0`` is returned for degenerate
        inputs (a variable with a single level, fewer than two observations, or
        a bias-corrected denominator that is not positive) instead of the NaN
        that a 0/0 division would otherwise produce.

    Notes
    -----
    ``scipy.stats.chi2_contingency`` applies Yates' continuity correction to
    2x2 tables by default, so the value for a binary variable compared with
    itself can be slightly below 1.
    """
    # Local names deliberately differ from the imported sklearn
    # `confusion_matrix` and `chi2` to avoid shadowing them.
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # A single-level variable carries no association information
    if r < 2 or k < 2 or n < 2:
        return 0.0

    chi2_stat = ss.chi2_contingency(contingency)[0]
    phi2 = chi2_stat / n

    # Bias correction of phi^2 and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # e.g. every observation is its own category
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Conditional entropy, the building block of Theil’s U
def conditional_entropy(x,
                        y,
                        nan_strategy='replace',
                        nan_replace_value=0.0,
                        log_base: float = math.e):
    """Return the conditional entropy of ``x`` given ``y``.

    Parameters
    ----------
    x, y : iterable
        Paired observations; expected to have the same length.
    nan_strategy : str
        ``'replace'`` substitutes ``nan_replace_value`` for every missing
        entry, ``'drop'`` removes every pair in which either entry is missing.
        Any other value leaves the inputs untouched.
    nan_replace_value
        Replacement used by the ``'replace'`` strategy.
    log_base : float
        Base of the logarithm (natural logarithm by default).

    Returns
    -------
    float
        Sum over the observed (x, y) pairs of ``p(x, y) * log(p(y) / p(x, y))``.
    """
    def _is_missing(v):
        # None, or NaN (the only value that is not equal to itself)
        return v is None or v != v

    if nan_strategy == 'replace':
        x = np.array([nan_replace_value if _is_missing(v) else v for v in x])
        y = np.array([nan_replace_value if _is_missing(v) else v for v in y])
    elif nan_strategy == 'drop':
        complete_pairs = [(a, b) for a, b in zip(x, y)
                          if not (_is_missing(a) or _is_missing(b))]
        x = [a for a, _ in complete_pairs]
        y = [b for _, b in complete_pairs]

    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0.0
    for xy, xy_count in xy_counter.items():
        p_xy = xy_count / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy, log_base)
    return entropy

def theils_u(x, y):
    """Return Theil's U of ``x`` given ``y``.

    The result is ``(S(x) - S(x|y)) / S(x)``, where ``S(x)`` is the entropy of
    the empirical distribution of ``x`` and ``S(x|y)`` comes from
    ``conditional_entropy``. When ``x`` has zero entropy, 1 is returned.
    The measure is not symmetric in its arguments.
    """
    entropy_x_given_y = conditional_entropy(x, y)

    # Empirical distribution of x
    value_counts = Counter(x)
    n_samples = sum(value_counts.values())
    probabilities = [count / n_samples for count in value_counts.values()]

    entropy_x = ss.entropy(probabilities)
    if entropy_x == 0:
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x

cols = list(raw_data)
n_cols = len(cols)
corrM_cramers_v = np.zeros((n_cols, n_cols))
corrM_theils_u = np.zeros((n_cols, n_cols))

# Diagonal: every column compared with itself
for idx1, col1 in enumerate(cols):
    corrM_cramers_v[idx1, idx1] = cramers_v(raw_data[col1], raw_data[col1])
    corrM_theils_u[idx1, idx1] = theils_u(raw_data[col1], raw_data[col1])

# Off-diagonal entries; enumerate() supplies the positions directly instead of
# a linear cols.index() lookup per pair
for (idx1, col1), (idx2, col2) in itertools.combinations(enumerate(cols), 2):
    score_cramers_v = cramers_v(raw_data[col1], raw_data[col2])
    score_theils_u = theils_u(raw_data[col1], raw_data[col2])

    # Cramér's V is symmetric, so one value fills both triangles
    corrM_cramers_v[idx1, idx2] = score_cramers_v
    corrM_cramers_v[idx2, idx1] = score_cramers_v

    # Theil's U is NOT symmetric: entry [i, j] holds U(cols[i] | cols[j]), so
    # the mirrored entry has to be computed with the arguments swapped rather
    # than copied from the upper triangle
    corrM_theils_u[idx1, idx2] = score_theils_u
    corrM_theils_u[idx2, idx1] = theils_u(raw_data[col2], raw_data[col1])

    # NOTE(review): the Theil's U value stored here is U(col1 | col2), i.e. its
    # direction relative to 'Is_Survived' depends on the column order. It is
    # kept as-is so the downstream feature ranking does not change - confirm
    # which direction is intended.
    if col1 == 'Is_Survived':
        all_scores[col2] = [score_cramers_v, score_theils_u]
    elif col2 == 'Is_Survived':
        all_scores[col1] = [score_cramers_v, score_theils_u]


corr_cramers_v = pd.DataFrame(corrM_cramers_v, index=cols, columns=cols)
corr_theils_u = pd.DataFrame(corrM_theils_u, index=cols, columns=cols)


In [None]:
# Draw the Cramér's V matrix as an annotated heatmap on a dedicated axes
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(data=corr_cramers_v, annot=True, ax=ax)
ax.set_title("Cramer V Correlation between Variables")

In [None]:
# Draw the Theil's U matrix as an annotated heatmap on a dedicated axes
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(data=corr_theils_u, annot=True, ax=ax)
ax.set_title("Theil’s U Correlation between Variables")

In [None]:
############ Select best features with SelectKBest ############

def select_k_best_with_plot(X, y, k_plot, max_bars=10):
    """Score the features with a chi-squared test, record the scores and plot the best ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; the chi-squared scorer requires non-negative values.
    y : array-like
        Target labels.
    k_plot : int
        Number of top-ranked features whose normalized score is appended to the
        module-level ``all_scores`` dictionary. Clamped to the number of
        features in ``X``.
    max_bars : int, optional
        Maximum number of features shown in the bar chart (10 by default).

    Side effects
    ------------
    Appends one score per selected feature to ``all_scores`` (a missing key is
    created) and displays a bar chart.
    """
    selector = SelectKBest(chi2, k='all').fit(X, y)

    # Normalization: -log10(p-value), scaled so the best feature scores 1.
    # p-values that underflowed to exactly 0 are clipped to the smallest
    # positive float so they do not turn into inf (and then NaN after scaling).
    pvalues = np.clip(selector.pvalues_, np.finfo(float).tiny, None)
    scores = -np.log10(pvalues)
    top_score = scores.max()
    if top_score > 0:
        scores = scores / top_score

    # Feature positions ordered from the highest to the lowest score
    indices = np.argsort(scores)[::-1]

    # Record the top `k_plot` feature names and their scores
    k_plot = min(k_plot, len(indices))
    features = []
    for i in indices[:k_plot]:
        name = X.columns[i]
        features.append(name)
        all_scores.setdefault(name, []).append(scores[i])

    # Plot only as many bars as there are recorded features, so the names and
    # the heights always have the same length
    n_bars = min(max_bars, len(features))
    f, ax = plt.subplots(figsize=(25,10)) # set the size that you'd like (width, height)
    ax.bar(features[:n_bars], scores[indices[:n_bars]], align='center',
           label='Normalized -log10(p-value)')
    ax.legend(fontsize = 14)
    plt.show()


In [None]:
# SelectKBest on the label-encoded data: the target column is split off from
# the feature columns before scoring
y = label_encoding_raw_data.loc[:, 'Is_Survived']
X = label_encoding_raw_data.drop('Is_Survived', axis=1)
select_k_best_with_plot(X=X, y=y, k_plot=21)

In [None]:
##### Get score to the feature #####

# Total score of a feature = sum of everything recorded for it in all_scores
# (the correlation-matrix scores and the SelectKBest score)
final_scores = {name: sum(scores) for name, scores in all_scores.items()}

# Order the features from the highest total score to the lowest
final_scores = dict(sorted(final_scores.items(), key=lambda item: item[1], reverse=True))

# The best 10 features
list(final_scores)[:10]

In [None]:
#### Relevant Features ####

# Features from the articles that stay in the final data whatever their score
always_keep = {'Gender', 'Endurance'}

# Features that will not be used for modelling: everything ranked below the top
# 10, except the always-kept ones. Filtering (instead of list.remove) also works
# when 'Gender' or 'Endurance' already made it into the top 10.
to_delete_features = [name for name in list(final_scores)[10:] if name not in always_keep]

# get relevant features to df
data = raw_data.drop(columns=to_delete_features)

# print final features
features = data.drop(columns=['Is_Survived'])
list(features.columns)

In [None]:
#### One Hot Encoding ####

# Every remaining column except the numeric age and the target is expanded into
# dummy columns. Filtering (instead of list.remove) does not fail when 'Age'
# was dropped by the feature selection.
non_categorical = ('Age', 'Is_Survived')
category_cols = [name for name in data.columns if name not in non_categorical]

# drop_first=True omits the first level of each feature
data = pd.get_dummies(data, columns=category_cols, drop_first=True)
data.columns

# Modeling and Evaluation

In [None]:
# Prepare train test
y = data['Is_Survived']
X = data.drop(columns=['Is_Survived'])

# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
##### Random Forest Gini #####
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported metrics are reproducible
rf = RandomForestClassifier(n_estimators = 1000,
                                    max_depth=115,
                                    min_samples_split=4,
                                    min_samples_leaf=1,
                                    criterion='gini',
                                    random_state=42
                                   )
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest Gini")
print("Accuracy: ", accuracy_score(y_test, rf_pred))
print("Recall: ", recall_score(y_test, rf_pred))
print("Precision: ", precision_score(y_test, rf_pred))
print("F-Score: ", f1_score(y_test, rf_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=[7, 6])
plt.title('Random Forest Gini', fontsize = 15) 

cm = confusion_matrix(y_test,rf_pred)
sns.heatmap(cm, annot=cm, fmt='g', xticklabels=['Predicted: No','Predicted: Yes'], yticklabels=['Actual: No','Actual: Yes'], cmap='copper')


In [None]:
##### Random Forest Entropy #####
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported metrics are reproducible
rf = RandomForestClassifier(n_estimators = 1000,
                                    max_depth=115,
                                    min_samples_split=4,
                                    min_samples_leaf=1,
                                    criterion='entropy',
                                    random_state=42
                                   )
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest Entropy")
print("Accuracy: ", accuracy_score(y_test, rf_pred))
print("Recall: ", recall_score(y_test, rf_pred))
print("Precision: ", precision_score(y_test, rf_pred))
print("F-Score: ", f1_score(y_test, rf_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=[7, 6])
plt.title('Random Forest Entropy', fontsize = 15) 

cm = confusion_matrix(y_test,rf_pred)
sns.heatmap(cm, annot=cm, fmt='g', xticklabels=['Predicted: No','Predicted: Yes'], yticklabels=['Actual: No','Actual: Yes'], cmap='copper')


In [None]:
##### Logistic Regression #####

# The model is fitted once; a second identical fit() call on the same data was
# redundant work
lr = LogisticRegression()
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)

print("Logistic Regression")
print("Accuracy: ", accuracy_score(y_test, lr_pred))
print("Recall: ", recall_score(y_test, lr_pred))
print("Precision: ", precision_score(y_test, lr_pred))
print("F-Score: ", f1_score(y_test, lr_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=[7, 6])
plt.title('Logistic Regression', fontsize = 15)

cm = confusion_matrix(y_test,lr_pred)
sns.heatmap(cm, annot=cm, fmt='g', xticklabels=['Predicted: No','Predicted: Yes'], yticklabels=['Actual: No','Actual: Yes'], cmap='copper')


In [None]:
##### SVM #####

# random_state fixes the data shuffling used by the solver so the reported
# metrics are reproducible
svm_model = svm.LinearSVC(C=0.01, random_state=42)
svm_model.fit(X_train, y_train)

svm_model_pred = svm_model.predict(X_test)

print("SVM")
print("Accuracy: ", accuracy_score(y_test, svm_model_pred))
print("Recall: ", recall_score(y_test, svm_model_pred))
print("Precision: ", precision_score(y_test, svm_model_pred))
print("F-Score: ", f1_score(y_test, svm_model_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=[7, 6])
plt.title('SVM', fontsize = 15)

cm = confusion_matrix(y_test,svm_model_pred)
sns.heatmap(cm, annot=cm, fmt='g', xticklabels=['Predicted: No','Predicted: Yes'], yticklabels=['Actual: No','Actual: Yes'], cmap='copper')



In [None]:
##### Decision Tree #####

# random_state fixes the feature permutation used when searching for splits so
# the fitted tree and the reported metrics are reproducible
dt = tree.DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

dt_pred = dt.predict(X_test)

print("Decision Tree")
print("Accuracy: ", accuracy_score(y_test, dt_pred))
print("Recall: ", recall_score(y_test, dt_pred))
print("Precision: ", precision_score(y_test, dt_pred))
print("F-Score: ", f1_score(y_test, dt_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=[7, 6])
plt.title('Decision Tree', fontsize = 15)

cm = confusion_matrix(y_test,dt_pred)
sns.heatmap(cm, annot=cm, fmt='g', xticklabels=['Predicted: No','Predicted: Yes'], yticklabels=['Actual: No','Actual: Yes'], cmap='copper')

In [None]:
##### Decision Tree plot #####
# Export the fitted tree as DOT source, labelling the nodes with the feature names
dot_data = tree.export_graphviz(dt, out_file=None, 
                      feature_names=list(X),   
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  
# Render the graph to disk and open it in the default viewer
graph.render("df_tree",view=True)
# Write the DOT source to "df_tree"; the context manager closes the file even
# if the write fails
with open("df_tree","w+") as f:
    f.write(dot_data)
graph