# ****DATA LOADING****

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [None]:
df_train = pd.read_csv(r"../input/spaceship-titanic/train.csv")
df_train.head(10)

# ****DESCRIPTIVE ANALYSIS****

In [None]:
df_train.info()

In [None]:
df_train.shape

In [None]:
df_train.describe()

# ****DATA CLEANING****

****HANDLING NULL VALUES****

In [None]:
# Print, for every column, the share of rows (in percent) whose value is missing.
for feature, column in df_train.items():
    percentage_of_null_values = column.isnull().sum() / len(column) * 100
    print(f'There is a total number of {percentage_of_null_values} % null values in {feature}')

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values one column at a time:
#   - object columns receive their most frequent value,
#   - float64 columns receive their median.
for feature in df_train.columns:
    if df_train[feature].dtypes == 'object':
        imp = SimpleImputer(strategy = 'most_frequent', missing_values = np.nan)
        df_train[[feature]] = imp.fit_transform(df_train[[feature]])
    # Kept as a separate `if`: the dtype is looked up again after the branch above has run.
    if df_train[feature].dtypes == 'float64':
        imp2 = SimpleImputer(strategy = 'median', missing_values = np.nan)
        df_train[[feature]] = imp2.fit_transform(df_train[[feature]])

In [None]:
# Same report as before the imputation: percentage of missing rows per column.
for feature, column in df_train.items():
    percentage_of_null_values = column.isnull().sum() / len(column) * 100
    print(f'There is a total number of {percentage_of_null_values} % null values in {feature}')

****UNIQUE VALUES ANALYSIS****

In [None]:
# Count the distinct values of every column in one pass, then print one line per column.
for feature, unique_values in df_train.nunique().items():
    print(f'The number of unique values in {feature} is {unique_values}')

# ****FEATURE EXTRACTION AND FEATURE ENGINEERING****

****DROPPING UNNECESSARY FEATURES WHICH DON'T CONTRIBUTE MUCH TO THE DATASETS****

In [None]:
df_train = df_train.drop_duplicates(subset = 'PassengerId')

In [None]:
df_train = df_train.drop(columns = 'PassengerId')

In [None]:
df_train.loc[df_train['Name'].duplicated() == True].head(10)

In [None]:
# Remove the rows named 'Alraium Disivering' whose RoomService, FoodCourt and
# ShoppingMall spending are all zero.
rows_to_drop = df_train[
    (df_train['RoomService'] == 0)
    & (df_train['FoodCourt'] == 0)
    & (df_train['ShoppingMall'] == 0)
    & (df_train['Name'] == 'Alraium Disivering')
].index
df_train.drop(index = rows_to_drop, inplace = True)

In [None]:
df_train = df_train.drop(columns = 'Name')

****FEATURE ENGINEERING THE CABIN FEATURE****

This step splits the cabin code into separate features to add more variance to our dataset.

In [None]:
# Keep only the first and the last character of the cabin string as two new
# columns (CabinDeck, CabinSide), then discard the raw Cabin column.
cabin_text = df_train['Cabin'].str
df_train = df_train.assign(
    CabinDeck = cabin_text.get(0),
    CabinSide = cabin_text.get(-1),
).drop(columns = 'Cabin')

In [None]:
labels = ['Young People', 'Millenials', 'Middle-Ages', 'Old People']

# Cut Age into four bins (lowest edge included) and name them with `labels`.
age_bands = pd.cut(df_train['Age'], bins = 4, labels = labels, include_lowest = True)

# Store the band as a plain object column in place of the numeric Age column.
df_train = df_train.assign(AgeGroup = age_bands.astype('object')).drop(columns = 'Age')

****SEPARATING NUMERICAL AND CATEGORICAL VARIABLES****

In [None]:
# Partition the column names by dtype: object -> categorical, float64 -> numerical.
# Columns of any other dtype end up in neither list.
column_dtypes = df_train.dtypes
categorical_variables = [name for name, dtype in column_dtypes.items() if dtype == 'object']
numerical_variables = [name for name, dtype in column_dtypes.items() if dtype == 'float64']

In [None]:
# Show the distinct levels of every categorical column.
for cat in categorical_variables:
    uniques = pd.unique(df_train[cat])
    print(f'There are {uniques} in {cat}')

# ****UNIVARIATE ANALYSIS****

****NUMERICAL VARIABLES****

*HANDLING OUTLIERS*

In [None]:
# One filled density plot per numerical feature.
for num in numerical_variables:
    fig, ax = plt.subplots(figsize = (14,10))
    # `fill` is the current name of the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x = df_train[num], fill = True, ax = ax)
    ax.set_title(f'Distribution of {num}', size = 18)

Conclusion: The distributions are highly right-skewed because the samples are zero-inflated. We need a different approach for these features.

The best solution is data discretization. But first, we need to check whether each of these features contributes considerably to predicting our target variable, using an ANOVA test later in the project.

In [None]:
df_train[numerical_variables].plot(subplots = True, kind = 'box', layout = (4,4), figsize = (12,14), patch_artist =True)

****CATEGORICAL VARIABLES****

In [None]:
# Draw one count plot per categorical feature. The bar order and the title are
# chosen from the number of distinct levels in the column:
#   fewer than 4 levels -> seaborn's default bar order
#   exactly 8 levels    -> the letters A-G followed by T
#   exactly 4 levels    -> the four age bands in the order they were defined
# Columns with any other number of levels are not plotted.
for cat in categorical_variables:
    n_levels = df_train[cat].nunique()
    if n_levels < 4:
        order = None
        title = f'Distribution of {cat} values'
    elif n_levels == 8:
        order = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
        title = f'Distribution of {cat}'
    elif n_levels == 4:
        order = ['Young People', 'Millenials', 'Middle-Ages', 'Old People']
        title = f'Distribution of {cat}'
    else:
        continue

    plt.figure(figsize = (14,10))
    # `order` has to be handed to countplot explicitly; building the list alone
    # has no effect on how the bars are arranged.
    sns.countplot(x = df_train[cat], palette = 'ch:s=-.2,r=.6', order = order)
    plt.xlabel(f'{cat}', size = 16)
    plt.title(title, size = 28)

In [None]:
# Class balance of the target column.
fig, ax = plt.subplots(figsize = (14,10))
sns.countplot(x = df_train['Transported'], palette = 'ch:s=-.2,r=.6', ax = ax)
ax.set_xlabel('Transported', size = 16)
ax.set_title('Distribution of Transported values', size = 28)
plt.show()

# **CORRELATION ANALYSIS**

In [None]:
corr = df_train[numerical_variables].corr()
corr 

In [None]:
sns.clustermap(data = corr, cmap = 'rocket', annot = True)

Because the data are zero-inflated, these continuous variables don't show any specific relationships with each other.

**CORRELATION ANALYSIS BETWEEN INDEPENDENT VARIABLES AND TARGET VARIABLES**

**CHI-SQUARE TEST**

In [None]:
from itertools import product

import scipy.stats as ss

target_var = ["Transported"]

# Every (categorical feature, target) pair to be tested.
list1 = list(product(categorical_variables, target_var))

# For each pair, cross-tabulate the two columns and keep element [1] of the
# chi2_contingency result, which is stored below as the p-value.
result = [
    (feature, target, ss.chi2_contingency(pd.crosstab(df_train[feature], df_train[target]))[1])
    for feature, target in list1
]

chi_2_table = pd.DataFrame(result, columns = ['Independent Variables', 'Target Variables', 'p_value'])

def refine_this(x):
    """Return 'Yes' when the value `x` is at most 0.05, otherwise 'No'."""
    return 'Yes' if x <= 0.05 else 'No'

# Flag each row 'Yes'/'No' from its p-value, then display the table.
chi_2_table['Selection'] = chi_2_table['p_value'].map(refine_this)

chi_2_table

We choose all the variables for building our models

To perform data discretization properly on the continuous zero-inflated features, we must first use an ANOVA test to check how important each of them is for predicting our target variable.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One formula string per numerical feature, of the form '<feature>~Transported'.
anva_test = [num + '~' + 'Transported' for num in numerical_variables]

# Fit an OLS model for each of the first five formulas ...
anv1, anv2, anv3, anv4, anv5 = [
    ols(anva_test[k], data = df_train).fit() for k in range(5)
]

# ... and keep the transposed type-2 ANOVA table of each fit.
anova1, anova2, anova3, anova4, anova5 = [
    sm.stats.anova_lm(fitted, typ = 2).T
    for fitted in (anv1, anv2, anv3, anv4, anv5)
]

In [None]:
result = pd.concat([anova1, anova2, anova3, anova4, anova5], axis=1, join='inner')

In [None]:
# Label the concatenated ANOVA tables. The tables were built in the order of
# `numerical_variables`, so the labels are derived from that same list instead of
# being typed by hand (a hand-written order can silently attach one feature's
# statistics to another feature's name).
# Each feature contributes two columns: its own name and 'Residuals<k>'.
result.columns = [
    label
    for k, num in enumerate(numerical_variables, start = 1)
    for label in (num, f'Residuals{k}')
]

In [None]:
result

Because the FoodCourt feature has no significant impact on predicting our target variable, we will perform data discretization on it to improve its usefulness.

In [None]:
# Bin FoodCourt spending into labelled bands. pd.cut uses right-closed intervals
# by default, so a spend of exactly 0 lies outside the first bin and is left as
# NaN here (the following cell relabels those NaNs as 'Zero').
# The top edge is open-ended (np.inf) so that no amount above 15000 can fall
# outside the last band and end up as NaN too.
df_train['FoodCourtBySpending'] = pd.cut(x = df_train['FoodCourt'], bins = [0, 76, 10000, 15000, np.inf], labels = ['Small','Average','Above Average', 'High'])

In [None]:
# Convert the categorical band to plain objects, then name the still-missing
# entries 'Zero'.
df_train['FoodCourtBySpending'] = (
    df_train['FoodCourtBySpending']
    .astype('object')
    .fillna('Zero')
)

After performing data discretization, we will perform chi-square test to test its importance on predicting the target_variable

In [None]:
feats = ('FoodCourtBySpending', 'Transported')

# Cross-tabulate the new band against the target and keep element [1] of the
# chi2_contingency result as the p-value.
contingency = pd.crosstab(df_train[feats[0]], df_train[feats[1]])
p_value = ss.chi2_contingency(contingency)[1]

chi2_table = pd.DataFrame(
    [(feats[0], feats[1], p_value)],
    columns = ['FoodCourtBySpending', 'Transported', 'p_value'],
)

chi2_table['Selection'] = chi2_table['p_value'].apply(refine_this)

chi2_table

So, we will keep FoodCourtBySpending as our new feature.

In [None]:
df_train = df_train.drop(columns = ['FoodCourt'])

# **BUILDING OUR MODELS**

**DATA PREPROCESSING**

In [None]:
X = df_train.select_dtypes(include = ['object', 'float'], exclude = 'bool')
y = df_train['Transported']

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
X = pd.get_dummies(X, columns = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide', 'AgeGroup'])

In [None]:
oe = OrdinalEncoder()

X[['CryoSleep', 'VIP']] = oe.fit_transform(X[['CryoSleep', 'VIP']])

In [None]:
categories_in_order = ['Zero', 'Small', 'Average', 'Above Average', 'High']

# A single feature is encoded, so `categories` is a one-element list holding the
# level order for that feature; each level is mapped to its position in the list.
oe2 = OrdinalEncoder(categories = [categories_in_order])

X['FoodCourtBySpending'] = oe2.fit_transform(X[['FoodCourtBySpending']]).ravel()

In [None]:
# Encode the target with its own encoder so that `oe` keeps what it learned from
# CryoSleep/VIP, and flatten the (n, 1) encoder output into the 1-D label vector
# that the estimators' fit methods expect.
y = OrdinalEncoder().fit_transform(np.array(y).reshape(-1,1)).ravel()

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is deliberately not called `ss`: that name is the alias of
# scipy.stats used by the chi-square cells, and rebinding it makes those cells
# fail when they are re-run in the same kernel.
scaler = StandardScaler()

spending_columns = ['RoomService', 'ShoppingMall', 'Spa', 'VRDeck']

# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test rows influence the scaling statistics - consider fitting
# on the training part only.
X[spending_columns] = scaler.fit_transform(X[spending_columns])

Our dataset after preprocessing:

In [None]:
X

**SEPARATING OUR DATASETS INTO TRAINING AND TESTING**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

**BUILDING AND TESTING PRE-MODELS**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
# (short name, unfitted estimator with default settings) pairs to compare.
models = [
    ('LR', LogisticRegression()),
    ('DTC', DecisionTreeClassifier()),
    ('Naive_Bayes', GaussianNB()),
    ('RDT', RandomForestClassifier()),
]

In [None]:
models

In [None]:
results = []
names = []

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# One shuffled 10-fold splitter with a fixed seed, built once and shared by all
# models.
kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)

for name, model in models:
    # These are classifiers, so they are scored with accuracy (higher is better)
    # rather than with a regression metric such as negative mean squared error.
    cv_results = cross_val_score(model, X_train, y_train.ravel(), cv = kfold, scoring = 'accuracy')
    results.append(cv_results)
    names.append(name)

In [None]:
# Box plot of the per-fold scores, one box per model.
# The axes is created first and everything is drawn on it explicitly, so the
# title, boxes and tick labels all end up on the same axes.
fig, ax = plt.subplots(figsize = (10,10))
ax.boxplot(results)
ax.set_title('Algorithm Comparison')
# Boxes are placed at positions 1..N by default; derive N from `names` instead
# of hard-coding four ticks.
ax.set_xticks(list(range(1, len(names) + 1)))
ax.set_xticklabels(names, rotation = 45)
plt.show()

From the previous comparison, we can see that RandomForest and LogisticRegression are the two best-performing models. We will choose both of them to evaluate and tune.

# **TUNING AND EVALUATING OUR MODELS**

**LOGISTIC REGRESSION**

In [None]:
lr = LogisticRegression()

lr.fit(X_train, y_train)

lrprediction = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, lrprediction))

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix takes (y_true, y_pred): rows are then the actual classes and
# columns the predicted ones, which is what the 'Actual' (y axis) / 'Predicted'
# (x axis) labels drawn by plot_confusion_matrix claim.
cm = confusion_matrix(y_test, lrprediction)

def plot_confusion_matrix(cm, classes, title = "Confusion_Matrix"):
    """Draw `cm` as an annotated seaborn heatmap.

    cm      -- matrix of integer counts (cell annotations use the 'd' format).
    classes -- tick labels used on both the x and the y axis.
    title   -- text placed above the heatmap.

    The x axis is labelled 'Predicted' and the y axis 'Actual'.
    Nothing is returned.
    """
    heatmap_ax = sns.heatmap(cm, annot = True, fmt = 'd', xticklabels = classes, yticklabels = classes)
    heatmap_ax.set_title(title, size = 22)
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('Actual')

plot_confusion_matrix(cm, classes = lr.classes_, title = "Confusion_Matrix")

The number of right predictions on whether our passengers were transported or not is much more important. So we will focus more on precision score of our models

In [None]:
# Rebuild the matrix here in scikit-learn's (y_true, y_pred) orientation so the
# formulas below do not depend on how `cm` was last assigned elsewhere in the
# notebook: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, lrprediction)

# Per-class counts (one entry per class).
TP = np.diag(cm)                   # predicted as the class and actually the class
FP = cm.sum(axis = 0) - TP         # column total minus hits: predicted as the class, actually another
FN = cm.sum(axis = 1) - TP         # row total minus hits: actually the class, predicted as another
TN = cm.sum() - (FP + FN + TP)     # everything that does not involve the class

*SENSITIVITY SCORE*

In [None]:
Sensitivity = TP / (TP + FN)
print('Sensitivity of this model is', Sensitivity)

*PRECISION SCORE*

In [None]:
Precision = TP / (TP + FP)
print('Precision of this model is', Precision)

*FALSE POSITIVE RATE*

In [None]:
FPR = FP / (FP + TN)
print('False Positive Rate of this model is', FPR)

*FALSE NEGATIVE RATE*

In [None]:
FNR = FN / (FN + TP)
print('False Negative Rate of this model is', FNR)

*WE WILL OPTIMIZE OUR MODEL WITH THE INVERSE REGULARIZATION PARAMETER*

In [None]:
from sklearn.metrics import log_loss

# 20 values of the inverse regularization strength, log-spaced from 1e-5 to 1e5.
C_list = np.geomspace(1e-5, 1e5, num = 20)

# One entry per value of C, in the same order as C_list.
CA = []
log_loss_score = []
Precision_scores = []

# NOTE(review): every candidate is scored on X_test/y_test, so the C picked from
# this table is tuned on the test set - consider a validation split or
# cross-validation on the training data instead.
for c in C_list:
    lr2 = LogisticRegression(C = c)
    # np.ravel hands fit() a 1-D label vector whether y_train is 1-D or a column.
    lr2.fit(X_train, np.ravel(y_train))

    CA_score = lr2.score(X_test, y_test)
    CA.append(CA_score)
    print(f'The Accuracy Score for this model with c = {c} is {CA_score}')

    predict_prob = lr2.predict_proba(X_test)
    ll = log_loss(y_test, predict_prob)
    log_loss_score.append(ll)
    print(f'The Log Loss Score for this model with c = {c} is', ll)

    # (y_true, y_pred) order: rows = actual classes, columns = predicted classes.
    cm = confusion_matrix(y_test, lr2.predict(X_test))

    # A fresh figure per iteration; otherwise every heatmap is painted onto the
    # same axes, on top of the previous ones.
    plt.figure()
    plot_confusion_matrix(cm, classes = lr2.classes_, title = "Confusion_Matrix")
    plt.show()

    # With rows = actual and columns = predicted, column total minus the diagonal
    # is the false-positive count, so TP / (TP + FP) is the per-class precision.
    TP = np.diag(cm)
    FP = cm.sum(axis = 0) - TP
    Precision = TP / (TP + FP)
    # Index 1 is the second class in lr2.classes_.
    Precision_scores.append(Precision[1])
    print(f'The precision score for this model with c = {c} is', Precision[1])

In [None]:
# One row per value of C. The columns are built directly from the 1-D
# sequences, so every cell holds a plain number and nothing depends on the sweep
# having exactly 20 entries.
df_outcomes = pd.DataFrame({
    'Inverse Regularization': C_list,
    'Accuracy Score': CA,
    'Log Loss Score': log_loss_score,
    'Precision': Precision_scores,
})

# Lowest log loss first.
df_outcomes.sort_values(by = ['Log Loss Score'], ascending = True)

The best model is:

In [None]:
lr2 = LogisticRegression(random_state = 10, C = 0.1623776739188721)

lr2.fit(X_train, y_train)

print(classification_report(y_test, lr2.predict(X_test)))

In [None]:
# confusion_matrix takes (y_true, y_pred); this order puts the actual classes on
# the rows, matching the 'Actual' / 'Predicted' axis labels of the plot and the
# call used for the random forest further down.
plot_confusion_matrix(confusion_matrix(y_test, lr2.predict(X_test)), classes = lr2.classes_)

**RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

In [None]:
plot_confusion_matrix(cm = confusion_matrix(y_test, rf.predict(X_test)), classes = rf.classes_)

*WE WILL USE GRIDSEARCHCV TO OPTIMIZE OUR MODEL*

In [None]:
# Candidate values for each random-forest hyper-parameter used in the grid search.

# Number of trees in the forest: 10 evenly spaced values from 100 to 200,
# truncated to integers.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 10)]

# Number of features considered at each split.
# 'auto' is intentionally not listed: for a classifier it was only an alias of
# 'sqrt', and current scikit-learn releases no longer accept it, so 'log2' is
# searched as the actual alternative.
max_features = ['sqrt', 'log2']

# Maximum depth of each tree (None = no limit).
max_depth = [None, 2, 4]

# Minimum number of samples needed to split a node.
min_samples_split = [2, 4]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1,2]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

In [None]:
random_grid = {'n_estimators' : n_estimators,
             'max_features' : max_features,
             'max_depth' : max_depth,
             'min_samples_split' : min_samples_split,
             'min_samples_leaf' : min_samples_leaf,
             'bootstrap' : bootstrap}
print(random_grid)

In [None]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 10, verbose = 2, n_jobs = 4)

In [None]:
rf_grid.fit(X_train, y_train.ravel())

In [None]:
rf_grid.best_params_

In [None]:
rf_grid_best = RandomForestClassifier(bootstrap = True,
 max_depth = None,
 max_features = 'sqrt',
 min_samples_leaf = 2,
 min_samples_split = 4,
 n_estimators = 111)

In [None]:
rf_grid_best.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf_grid_best.predict(X_test)))

THEN, WE WILL CHOOSE THESE TWO MODELS TO PREDICT

In [None]:
# Candidate final models. Three estimators are constructed below, but none of
# them is assigned to a name or fitted here; as bare expressions, only the last
# one is shown as the cell output.

# Tuned random forest.
# NOTE(review): n_estimators is 200 here but 111 in the `rf_grid_best` cell
# above - confirm which value the grid search actually selected.
RandomForestClassifier(bootstrap = True, max_depth = None, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 4, n_estimators = 200)

# Random forest with default settings.
RandomForestClassifier()

# Logistic regression with the C value chosen in the regularization sweep.
LogisticRegression(random_state = 10, C = 0.1623776739188721)