In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the competition's train and test splits from the shared data folder.
DATA_DIR = "data/Tabular Playground Series - Apr 2021"
org_train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
org_test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
org_train_data.head(5)

In [None]:
org_test_data.head(5)

In [None]:
# Check for Null data

In [None]:
org_train_data.isnull().sum()

In [None]:
org_test_data.isnull().sum()

In [None]:
# Check % of Null data

In [None]:
def missingdata(data):
    """Summarise and plot the share of missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns "Total" (number of nulls) and "Percent" (nulls as a percentage
        of all rows), ordered by descending null count. Empty when nothing is
        missing, in which case no figure is drawn.
    """
    null_counts = data.isnull().sum()
    ms = pd.DataFrame(
        {"Total": null_counts, "Percent": null_counts / len(data) * 100}
    )
    ms = ms[ms["Percent"] > 0].sort_values("Total", ascending=False)

    # Nothing to plot when no column has missing values.
    if ms.empty:
        return ms

    fig, ax = plt.subplots(figsize=(8, 6))
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # the data vectors as positional arguments.
    sns.barplot(x=ms.index, y=ms["Percent"], color="green", alpha=0.8, ax=ax)
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("Features", fontsize=15)
    ax.set_ylabel("Percent of missing values", fontsize=15)
    ax.set_title("Percent missing data by feature", fontsize=15)
    return ms

In [None]:
missingdata(org_train_data)

In [None]:
missingdata(org_test_data)

In [None]:
# Drop unnecessary columns. Cabin has 70% Null values in both Train and Test data.

In [None]:
# Remove the row identifier and the mostly-empty Cabin column from the
# training frame, then preview the result.
org_train_data = org_train_data.drop(columns=["PassengerId", "Cabin"])
org_train_data.head(10)

In [None]:
# Keep the test PassengerIds aside for the submission file (pop removes the
# column from the frame), then drop Cabin as was done for the training data.
org_test_pssg_id = org_test_data.pop("PassengerId")
org_test_data = org_test_data.drop(columns=["Cabin"])
org_test_data.head(10)

In [None]:
def fill_age(data):
    """Fill missing ``Age`` values with the median age of the travel group.

    Passengers are split into four non-overlapping groups according to
    whether they travel with siblings/spouses (``SibSp > 0`` or ``== 0``) and
    with parents/children (``Parch > 0`` or ``== 0``). Inside each group every
    null ``Age`` is replaced by the median of that group's ages.

    The frame is modified in place and is also returned.
    """
    has_parch = data["Parch"] > 0
    no_parch = data["Parch"] == 0
    has_sibsp = data["SibSp"] > 0
    no_sibsp = data["SibSp"] == 0

    # The groups are disjoint, so filling one group never changes the median
    # computed for another.
    travel_groups = (
        has_parch & has_sibsp,  # siblings/spouse and parents/children
        no_parch & has_sibsp,  # siblings/spouse only
        no_parch & no_sibsp,  # travelling alone
        has_parch & no_sibsp,  # parents/children only
    )

    for in_group in travel_groups:
        group_median = data.loc[in_group, "Age"].median()
        needs_fill = in_group & data["Age"].isnull()
        data.loc[needs_fill, "Age"] = group_median

    return data

In [None]:
# Fill Age for Train and Test Dataset
org_train_data = fill_age(org_train_data)
org_test_data = fill_age(org_test_data)

In [None]:
# Fill missing Fare values in both frames with the most frequent training fare.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection operates on an intermediate object and is not guaranteed to
# update the parent frame.
train_fare_mode = org_train_data["Fare"].mode()[0]
org_train_data["Fare"] = org_train_data["Fare"].fillna(train_fare_mode)
org_test_data["Fare"] = org_test_data["Fare"].fillna(train_fare_mode)

In [None]:
# Replace the raw Ticket column with a 0/1 flag telling whether a ticket
# value was recorded, for both the train and the test frame.
for frame in (org_train_data, org_test_data):
    ticket_missing = frame["Ticket"].isnull()
    frame["Has_Ticket"] = np.where(ticket_missing, 0, 1)
    frame.drop("Ticket", axis=1, inplace=True)

In [None]:
# Fill missing Embarked values in both frames with the most frequent training
# port. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection operates on an intermediate
# object and is not guaranteed to update the parent frame.
train_embarked_mode = org_train_data["Embarked"].mode()[0]
org_train_data["Embarked"] = org_train_data["Embarked"].fillna(train_embarked_mode)
org_test_data["Embarked"] = org_test_data["Embarked"].fillna(train_embarked_mode)

In [None]:
# Apply One Hot Encoding to Train Dataset

In [None]:
# Sex
def encode_sex(data):
    """One-hot encode the ``Sex`` column, dropping the first category."""
    sex_column = data["Sex"]
    sex_dummies = pd.get_dummies(sex_column, drop_first=True)
    return sex_dummies


train_sex = encode_sex(org_train_data)
test_sex = encode_sex(org_test_data)

In [None]:
# Pclass
def encode_pclass(data):
    """One-hot encode the ``Pclass`` column, dropping the first category.

    The dummy columns are prefixed (``Pclass_2``, ``Pclass_3``) so that their
    labels are strings. Without a prefix the labels are the integer class
    values, which leaves the feature frame with mixed string/integer column
    names once these columns are concatenated with the others.
    """
    return pd.get_dummies(data["Pclass"], prefix="Pclass", drop_first=True)


train_pclass = encode_pclass(org_train_data)
test_pclass = encode_pclass(org_test_data)

In [None]:
# Embarked
def encode_embarked(data):
    """One-hot encode the ``Embarked`` column of ``data``, dropping the first category.

    The encoding is computed from the frame that is passed in, so the train
    and test frames each get dummies derived from their own rows.
    """
    return pd.get_dummies(data["Embarked"], drop_first=True)


train_embarked = encode_embarked(org_train_data)
test_embarked = encode_embarked(org_test_data)

In [None]:
# Attach the one-hot encoded Sex, Embarked and Pclass columns to each frame,
# in the same order for train and test.
train_parts = [org_train_data, train_sex, train_embarked, train_pclass]
test_parts = [org_test_data, test_sex, test_embarked, test_pclass]
org_train_data = pd.concat(train_parts, axis=1)
org_test_data = pd.concat(test_parts, axis=1)

In [None]:
# The raw categorical columns are now redundant with their one-hot encodings;
# remove them from both frames.
org_train_data = org_train_data.drop(columns=["Pclass", "Sex", "Embarked"])
org_test_data = org_test_data.drop(columns=["Pclass", "Sex", "Embarked"])

In [None]:
# Drop the free-text Name column from both frames.
org_train_data = org_train_data.drop(columns=["Name"])
org_test_data = org_test_data.drop(columns=["Name"])

In [None]:
org_train_data.head(5)

In [None]:
org_test_data.head(5)

In [None]:
# Separate the label (Survived) from the predictor columns.
training_target = org_train_data["Survived"]
training_feature = org_train_data.drop(columns=["Survived"])

In [None]:
# Pairwise correlation matrix of the predictors, drawn on a 20x12 inch figure.
fig, ax = plt.subplots(figsize=(20, 12))
feature_correlation = training_feature.corr()
sns.heatmap(feature_correlation, annot=True, cmap="RdYlGn", linewidths=0.2, ax=ax)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the predictors using statistics learned from the training data.
# fit_transform/transform return bare NumPy arrays, so the results are wrapped
# back into DataFrames: later cells rely on the column labels
# (x_train.columns, x_train["male"]) and on DataFrame methods such as .info().
standard_scaler = StandardScaler()

# Use string labels throughout; the un-prefixed Pclass dummies may carry
# integer labels.
feature_columns = training_feature.columns.astype(str)

training_feature = pd.DataFrame(
    standard_scaler.fit_transform(training_feature.to_numpy(dtype=float)),
    columns=feature_columns,
    index=training_feature.index,
)

# The models are trained on scaled features, so the held-out test frame has to
# go through the same fitted scaler before it is passed to any predict() call.
# Selecting by feature_columns enforces the training column order and fails
# loudly if a column is missing.
test_feature = org_test_data.copy()
test_feature.columns = test_feature.columns.astype(str)
org_test_data = pd.DataFrame(
    standard_scaler.transform(test_feature[feature_columns].to_numpy(dtype=float)),
    columns=feature_columns,
    index=org_test_data.index,
)

In [None]:
training_feature

In [None]:
training_target

In [None]:
training_feature.info()

In [None]:
org_test_data.info()

**Model**

Now we are ready to train a model and predict the required solution. There are many predictive modelling algorithms to choose from, so we must understand the type of problem and the solution requirement in order to narrow the choice down to a few models that we can evaluate. Our problem is a binary classification problem: we want to identify the relationship between the output (Survived or not) and the other variables or features (Gender, Age, Port...). We are also performing supervised learning, because we train our model on a dataset in which the outcome is known. With these two criteria - supervised learning plus classification - we can narrow down our choice of models to a few. These include:

- Logistic Regression
- KNN
- Support Vector Machines
- Naive Bayes classifier
- Decision Tree
- Random Forest
- Linear Discriminant Analysis
- Ada Boost Classifier
- Gradient Boosting Classifier

We also compare the classifiers above: for each one we evaluate the mean accuracy with a shuffled k-fold cross-validation procedure and plot a confusion matrix built from the cross-validated predictions.

## Split Data to Test and Train

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import (
    KFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    training_feature, training_target, test_size=0.2, random_state=42
)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
x_train

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with scikit-learn's default settings.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)
lr_prediction = logistic_regression.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
lr_holdout_accuracy = round(accuracy_score(y_test, lr_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the Logistic Regression is {lr_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=22)
lr_cv_score = cross_val_score(
    logistic_regression,
    training_feature,
    training_target,
    scoring="accuracy",
    cv=kfold,
)
lr_cv_accuracy = round(lr_cv_score.mean() * 100, 2)
print(f"The cross validated score for Logistic Regression is: {lr_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(
    logistic_regression, training_feature, training_target, cv=kfold
)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("Logistic Regression Confusion Matrix", y=1.05, size=15)

## LogisticRegression HyperParameter Tuning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# The grid searches both L1 and L2 penalties. The default solver does not
# support penalty="l1", so every L1 candidate would fail to fit; "liblinear"
# supports both penalties.
logistic_regression = LogisticRegression(solver="liblinear")

kfold = KFold(n_splits=10, shuffle=True, random_state=22)

param_grid = {
    "penalty": ["l2", "l1"],
    "C": [100, 10, 1.0, 0.1, 0.01],
}

lr_model = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid,
    cv=kfold,
    n_jobs=4,
    scoring="accuracy",
)

lr_model.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(lr_model.best_score_)

# Estimator with the winning parameters (shown as the cell output).
lr_model.best_estimator_

## LogisticRegression with Best Parameters

In [None]:
from sklearn.linear_model import LogisticRegression

# Tuned model: L1 penalty with C=1.0. solver="warn" and multi_class="warn"
# were placeholder defaults of old scikit-learn releases and are rejected by
# current ones; "liblinear" is named explicitly because it supports the L1
# penalty. Parameters left out keep their library defaults.
logistic_regression = LogisticRegression(
    C=1.0,
    penalty="l1",
    solver="liblinear",
    max_iter=100,
    tol=0.0001,
)

logistic_regression.fit(x_train, y_train)
lr_prediction = logistic_regression.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
lr_holdout_accuracy = round(accuracy_score(y_test, lr_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the Logistic Regression is {lr_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=22)
lr_cv_score = cross_val_score(
    logistic_regression, training_feature, training_target, cv=kfold, scoring="accuracy"
)
lr_cv_accuracy = round(lr_cv_score.mean() * 100, 2)
print(f"The cross validated score for Logistic Regression is: {lr_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(
    logistic_regression, training_feature, training_target, cv=kfold
)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("Logistic Regression Confusion Matrix", y=1.05, size=15)

## RandomForestClassifier Hyper-Parameter Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search over the number of trees only: 500, 600 and 700.
n_estimator = range(500, 800, 100)
param_grid = {"n_estimators": n_estimator}

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=22)

random_forest = RandomForestClassifier()
rf_classisier_cv = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
rf_classisier_cv.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(rf_classisier_cv.best_score_)

# Estimator with the winning parameters (shown as the cell output).
rf_classisier_cv.best_estimator_

## RandomForestClassifier with Best Parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

# max_features="sqrt" is the explicit spelling of what "auto" meant for
# classifiers; the "auto" alias is no longer accepted by current scikit-learn.
random_forest = RandomForestClassifier(
    criterion="gini",
    n_estimators=700,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features="sqrt",
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
random_forest.fit(x_train, y_train)
rf_prediction = random_forest.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
rf_holdout_accuracy = round(accuracy_score(y_test, rf_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the Random Forest is {rf_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=22)
rf_cv_score = cross_val_score(
    random_forest, training_feature, training_target, cv=kfold, scoring="accuracy"
)
rf_cv_accuracy = round(rf_cv_score.mean() * 100, 2)
print(f"The cross validated score for Random Forest is: {rf_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(random_forest, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("Random Forest Confusion Matrix", y=1.05, size=15)

## Support Vector Machines

In [None]:
# from sklearn.svm import SVC, LinearSVC

# svc = SVC()
# svc.fit(x_train, y_train)
# svc_prediction = svc.predict(x_test)

# print('-------------- The Accuracy of the model ----------------------------')
# print('The Accuracy of the SVC is', round(accuracy_score(y_test, svc_prediction) * 100, 2))

# # k=10, split the data into 10 equal parts
# kfold = KFold(n_splits=10, shuffle=True, random_state=22)

# svc_cv_score = cross_val_score(svc, training_feature, training_target, cv=kfold, scoring='accuracy')

# print('The cross validated score for SVC is:', round(svc_cv_score.mean() * 100, 2))

# y_pred = cross_val_predict(svc, training_feature, training_target, cv=kfold)

# sns.heatmap(confusion_matrix(training_target, y_pred), annot=True, fmt='3.0f', cmap="summer")

# plt.title('SVC Confusion Matrix', y=1.05, size=15)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# max_features="sqrt" is the explicit spelling of what "auto" meant for
# classifiers; the "auto" alias is no longer accepted by current scikit-learn.
dt_classifier = DecisionTreeClassifier(
    criterion="gini", min_samples_split=10, min_samples_leaf=1, max_features="sqrt"
)
dt_classifier.fit(x_train, y_train)
dt_prediction = dt_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
dt_holdout_accuracy = round(accuracy_score(y_test, dt_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the Decision Tree is {dt_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
dt_cv_score = cross_val_score(
    dt_classifier, training_feature, training_target, cv=kfold, scoring="accuracy"
)
dt_cv_accuracy = round(dt_cv_score.mean() * 100, 2)
print(f"The cross validated score for Decision Tree is: {dt_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(dt_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("Decision Tree Confusion Matrix", y=1.05, size=15)

## Decision Tree HyperParameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths and numbers of features considered per split.
param_grid = {"max_depth": [5, 10, 15], "max_features": [5, 7, 10]}

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

decision_tree = DecisionTreeClassifier()
dt_model = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
)
dt_model.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(dt_model.best_score_)

# Estimator with the winning parameters (shown as the cell output).
dt_model.best_estimator_

## Decision Tree with Best Parameters

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tuned model. Only the parameters chosen by the search (plus the split
# criterion) are spelled out; min_impurity_split and presort were removed
# from scikit-learn and would raise a TypeError, and everything else keeps
# its library default.
dt_classifier = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    max_features=10,
    min_samples_leaf=1,
    min_samples_split=2,
    splitter="best",
)
dt_classifier.fit(x_train, y_train)
dt_prediction = dt_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
dt_holdout_accuracy = round(accuracy_score(y_test, dt_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the Decision Tree is {dt_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
dt_cv_score = cross_val_score(
    dt_classifier, training_feature, training_target, cv=kfold, scoring="accuracy"
)
dt_cv_accuracy = round(dt_cv_score.mean() * 100, 2)
print(f"The cross validated score for Decision Tree is: {dt_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(dt_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("Decision Tree Confusion Matrix", y=1.05, size=15)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Baseline AdaBoost with scikit-learn's default settings.
ada_classifier = AdaBoostClassifier()
ada_classifier.fit(x_train, y_train)
ada_prediction = ada_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
ada_holdout_accuracy = round(accuracy_score(y_test, ada_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the ADA Boost Classifier is {ada_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
ada_cv_score = cross_val_score(
    ada_classifier,
    training_feature,
    training_target,
    scoring="accuracy",
    cv=kfold,
)
ada_cv_accuracy = round(ada_cv_score.mean() * 100, 2)
print(f"The cross validated score for ADA Boost Classifier is: {ada_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(ada_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("ADA Boost Classifier Confusion Matrix", y=1.05, size=15)

## AdaBoost Hyperparameter Tuning

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Candidate ensemble sizes and learning rates.
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.5],
}

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

ada_classifier = AdaBoostClassifier()
ada_model = GridSearchCV(
    estimator=ada_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
ada_model.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(ada_model.best_score_)

# Estimator with the winning parameters (shown as the cell output).
ada_model.best_estimator_

## AdaBoostClassifier with Best Parameters

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Tuned model. Only the parameters chosen by the search are spelled out:
# base_estimator was renamed to estimator and algorithm="SAMME.R" was dropped
# in recent scikit-learn releases, so passing either one fails there.
ada_classifier = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=200,
)
ada_classifier.fit(x_train, y_train)
ada_prediction = ada_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
ada_holdout_accuracy = round(accuracy_score(y_test, ada_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the ADA Boost Classifier is {ada_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
ada_cv_score = cross_val_score(
    ada_classifier, training_feature, training_target, cv=kfold, scoring="accuracy"
)
ada_cv_accuracy = round(ada_cv_score.mean() * 100, 2)
print(f"The cross validated score for ADA Boost Classifier is: {ada_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(ada_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("ADA Boost Classifier Confusion Matrix", y=1.05, size=15)

## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Baseline LDA with scikit-learn's default settings.
lda_classifier = LinearDiscriminantAnalysis()
print(lda_classifier)

lda_classifier.fit(x_train, y_train)
lda_prediction = lda_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
lda_holdout_accuracy = round(accuracy_score(y_test, lda_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the LDA Classifier is {lda_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
lda_cv_score = cross_val_score(
    lda_classifier,
    training_feature,
    training_target,
    scoring="accuracy",
    cv=kfold,
)
lda_cv_accuracy = round(lda_cv_score.mean() * 100, 2)
print(f"The cross validated score for LDA Classifier is: {lda_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(lda_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("LDA Classifier Confusion Matrix", y=1.05, size=15)

## LDA HyperParameters Tuning

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

# Candidate solvers, shrinkage strengths and tolerances.
param_grid = {
    "solver": ["lsqr", "eigen"],
    "shrinkage": [0, 0.5, 1.0],
    "tol": [0.0001, 0.001, 0.01, 0.1],
}

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

lda_classifier = LinearDiscriminantAnalysis()
lda_model = GridSearchCV(
    estimator=lda_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
lda_model.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(lda_model.best_score_)

# Estimator with the winning parameters (shown as the cell output).
lda_model.best_estimator_

## LDA with Best Parameters

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA configured with the parameters selected by the grid search.
lda_params = {
    "n_components": None,
    "priors": None,
    "shrinkage": 0,
    "solver": "lsqr",
    "store_covariance": False,
    "tol": 0.0001,
}
lda_classifier = LinearDiscriminantAnalysis(**lda_params)
print(lda_classifier)

lda_classifier.fit(x_train, y_train)
lda_prediction = lda_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
lda_holdout_accuracy = round(accuracy_score(y_test, lda_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the LDA Classifier is {lda_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
lda_cv_score = cross_val_score(
    lda_classifier,
    training_feature,
    training_target,
    scoring="accuracy",
    cv=kfold,
)
lda_cv_accuracy = round(lda_cv_score.mean() * 100, 2)
print(f"The cross validated score for LDA Classifier is: {lda_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(lda_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("LDA Classifier Confusion Matrix", y=1.05, size=15)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient boosting with scikit-learn's default settings.
gb_classifier = GradientBoostingClassifier()
print(gb_classifier)

gb_classifier.fit(x_train, y_train)
gb_prediction = gb_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
gb_holdout_accuracy = round(accuracy_score(y_test, gb_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the GB Classifier is {gb_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
gb_cv_score = cross_val_score(
    gb_classifier,
    training_feature,
    training_target,
    scoring="accuracy",
    cv=kfold,
)
gb_cv_accuracy = round(gb_cv_score.mean() * 100, 2)
print(f"The cross validated score for GB Classifier is: {gb_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(gb_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("GB Classifier Confusion Matrix", y=1.05, size=15)

## Gradient Boost HyperParameter Tuning

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gb_classifier = GradientBoostingClassifier()

# The loss is left at its default: the grid only ever listed one loss, and the
# "deviance" name it used is no longer accepted by current scikit-learn.
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "learning_rate": [0.01, 0.1, 1.0],
    "max_depth": [4, 8],
    "min_samples_leaf": [100, 150],
    "max_features": [5, 10],
}

# 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

gb_model = GridSearchCV(
    gb_classifier,
    param_grid=param_grid,
    cv=kfold,
    scoring="accuracy",
    n_jobs=4,
    verbose=1,
)

gb_model.fit(x_train, y_train)

# Best mean cross-validated accuracy found by the search.
print(gb_model.best_score_)

# Estimator with the winning parameters (shown as the cell output).
gb_model.best_estimator_

In [None]:
gb_model.best_estimator_.feature_importances_

In [None]:
# Pair each feature with its importance in the tuned gradient-boosting model.
# Column labels fall back to positional indices when x_train is a plain array.
feature_names = list(getattr(x_train, "columns", range(x_train.shape[1])))
d = {
    "Feature": feature_names,
    "Importance": gb_model.best_estimator_.feature_importances_,
}
df = pd.DataFrame(d).sort_values("Importance", ascending=False)
df

## Gradient Boost with Best Parameters

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Tuned model. Only the tuned parameters are spelled out; everything else
# keeps its library default. Compared with the pasted estimator repr:
#   * loss="deviance", min_impurity_split and presort are no longer accepted
#     by current scikit-learn and are dropped (the default loss is used);
#   * max_features="auto" is written as "sqrt", which is what "auto" meant for
#     this classifier.
gb_classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=8,
    max_features="sqrt",
    min_samples_leaf=150,
    n_estimators=100,
)
print(gb_classifier)

gb_classifier.fit(x_train, y_train)
gb_prediction = gb_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
gb_holdout_accuracy = round(accuracy_score(y_test, gb_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the GB Classifier is {gb_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
gb_cv_score = cross_val_score(
    gb_classifier, training_feature, training_target, cv=kfold, scoring="accuracy"
)
gb_cv_accuracy = round(gb_cv_score.mean() * 100, 2)
print(f"The cross validated score for GB Classifier is: {gb_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(gb_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("GB Classifier Confusion Matrix", y=1.05, size=15)

## Model Evaluation Scores

In [None]:
# Mean cross-validated accuracy of each classifier, shown best first.
cv_scores_by_model = {
    "LogisticRegression": lr_cv_score,
    "RandomForestClassifier": rf_cv_score,
    "DecisionTreeClassifier": dt_cv_score,
    "AdaBoostClassifier": ada_cv_score,
    "LinearDiscriminantAnalysis": lda_cv_score,
    "GradientBoostingClassifier": gb_cv_score,
}
models = pd.DataFrame(
    {
        "Model": list(cv_scores_by_model),
        "Score": [fold_scores.mean() for fold_scores in cv_scores_by_model.values()],
    }
)
models.sort_values(by="Score", ascending=False)

# Applying Feature Selection 

In [None]:
x_train = pd.DataFrame(x_train)
x_train

### Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

# Mutual information between each feature and the target, labelled by column.
importances = mutual_info_classif(x_train, y_train)
feat_importances = pd.Series(data=importances, index=x_train.columns)

In [None]:
feat_importances.sort_values(ascending=False)

In [None]:
feat_importances.plot(kind="bar", color="teal")
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest

sel = SelectKBest(mutual_info_classif, k=2).fit(x_train, y_train)
x_train.columns[sel.get_support()]

In [None]:
x_train_ig = sel.transform(x_train)
x_test_ig = sel.transform(x_test)
x_train_ig.shape, x_test_ig.shape

### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting evaluated on the features kept by the mutual-information
# selector. Only non-default parameters are spelled out: loss="deviance",
# min_impurity_split and presort are no longer accepted by current
# scikit-learn, and max_features="auto" is written as "sqrt", which is what
# "auto" meant for this classifier.
gb_classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    max_features="sqrt",
    min_samples_leaf=200,
    n_estimators=300,
)
print(gb_classifier)

# Train and score on the selected columns (x_train_ig / x_test_ig) rather than
# on the full feature set, so this section measures the effect of the selection.
gb_classifier.fit(x_train_ig, y_train)
gb_prediction = gb_classifier.predict(x_test_ig)

gb_holdout_accuracy = round(accuracy_score(y_test, gb_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the GB Classifier is {gb_holdout_accuracy}")

# `sel` is the SelectKBest(mutual_info_classif) selector fitted just before
# x_train_ig was built; apply it to the full training set so that the
# cross-validation uses the same reduced feature set.
training_feature_ig = sel.transform(training_feature)

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
gb_cv_score = cross_val_score(
    gb_classifier, training_feature_ig, training_target, cv=kfold, scoring="accuracy"
)
gb_cv_accuracy = round(gb_cv_score.mean() * 100, 2)
print(f"The cross validated score for GB Classifier is: {gb_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(
    gb_classifier, training_feature_ig, training_target, cv=kfold
)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("GB Classifier Confusion Matrix", y=1.05, size=15)

# Chi Square

In [None]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# chi2 only accepts non-negative inputs, but x_train was standardised and
# therefore contains negative values. Rescale each feature to [0, 1] for the
# test; this is only used for scoring, x_train itself is left untouched.
x_train_nonneg = MinMaxScaler().fit_transform(x_train)

# chi2 returns (statistics, p-values); the p-values are kept, so smaller means
# a stronger association with the target.
importances = chi2(x_train_nonneg, y_train)
feat_importances = pd.Series(importances[1], index=x_train.columns)
feat_importances.sort_values(ascending=True)

In [None]:
feat_importances.plot(kind="bar", color="teal")
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler

# chi2 requires non-negative inputs, so the selector is fitted on a [0, 1]
# rescaled copy of x_train (same labels and index). The fitted selector only
# records which columns to keep, so it can then be applied to the standardised
# data as-is.
x_train_nonneg = pd.DataFrame(
    MinMaxScaler().fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
sel = SelectKBest(chi2, k=5).fit(x_train_nonneg, y_train)
x_train.columns[sel.get_support()]

In [None]:
x_train_chi = sel.transform(x_train)
x_test_chi = sel.transform(x_test)
x_train_chi.shape, x_test_chi.shape

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting evaluated on the features kept by the chi-square
# selector. Only non-default parameters are spelled out: loss="deviance",
# min_impurity_split and presort are no longer accepted by current
# scikit-learn, and max_features="auto" is written as "sqrt", which is what
# "auto" meant for this classifier.
gb_classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    max_features="sqrt",
    min_samples_leaf=200,
    n_estimators=300,
)
print(gb_classifier)

gb_classifier.fit(x_train_chi, y_train)
gb_prediction = gb_classifier.predict(x_test_chi)

gb_holdout_accuracy = round(accuracy_score(y_test, gb_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the GB Classifier is {gb_holdout_accuracy}")

# `sel` is the SelectKBest(chi2) selector fitted just before x_train_chi was
# built; apply it to the full training set so that the cross-validation uses
# the same reduced feature set as the hold-out evaluation above.
training_feature_chi = sel.transform(training_feature)

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
gb_cv_score = cross_val_score(
    gb_classifier, training_feature_chi, training_target, cv=kfold, scoring="accuracy"
)
gb_cv_accuracy = round(gb_cv_score.mean() * 100, 2)
print(f"The cross validated score for GB Classifier is: {gb_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(
    gb_classifier, training_feature_chi, training_target, cv=kfold
)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("GB Classifier Confusion Matrix", y=1.05, size=15)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting refitted on the full feature set. Only non-default
# parameters are spelled out: loss="deviance", min_impurity_split and presort
# are no longer accepted by current scikit-learn, and max_features="auto" is
# written as "sqrt", which is what "auto" meant for this classifier.
gb_classifier = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    max_features="sqrt",
    min_samples_leaf=200,
    n_estimators=300,
)
print(gb_classifier)

gb_classifier.fit(x_train, y_train)
gb_prediction = gb_classifier.predict(x_test)

# Hold-out accuracy on the split produced by train_test_split.
gb_holdout_accuracy = round(accuracy_score(y_test, gb_prediction) * 100, 2)
print("-------------- The Accuracy of the model ----------------------------")
print(f"The Accuracy of the GB Classifier is {gb_holdout_accuracy}")

# 10-fold cross-validation on the full training set (shuffled, fixed seed).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
gb_cv_score = cross_val_score(
    gb_classifier, training_feature, training_target, cv=kfold, scoring="accuracy"
)
gb_cv_accuracy = round(gb_cv_score.mean() * 100, 2)
print(f"The cross validated score for GB Classifier is: {gb_cv_accuracy}")

# Confusion matrix built from out-of-fold predictions over the same folds.
y_pred = cross_val_predict(gb_classifier, training_feature, training_target, cv=kfold)
confusion_ax = sns.heatmap(
    confusion_matrix(training_target, y_pred), annot=True, fmt="3.0f", cmap="summer"
)
confusion_ax.set_title("GB Classifier Confusion Matrix", y=1.05, size=15)

In [None]:
submission = pd.read_csv(
    "data/Tabular Playground Series - Apr 2021/sample_submission.csv"
)

In [None]:
y_pred = gb_classifier.predict(org_test_data)

In [None]:
# Pair each test PassengerId with its predicted label and write the
# submission file without the index column.
export_df = pd.DataFrame({"PassengerId": org_test_pssg_id, "Survived": y_pred})
export_df.to_csv(
    "data/Tabular Playground Series - Apr 2021/my_gb_2_classifier_submission.csv",
    index=False,
)

In [None]:
export_df

### PCA 

In [None]:
x_train.head(10)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_train)
x_pca

In [None]:
import matplotlib.pyplot as plt

# Training rows projected on the first two principal components, coloured by
# the target label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=y_train, cmap="viridis")
ax.set_xlabel("First Principal Component")
ax.set_ylabel("Second Principal Component")
ax.set_title("Scatter plot for Second principal component and First principal component")
plt.show()

In [None]:
pca.explained_variance_ratio_

# ANN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, regularizers  # Importing sequential model
from tensorflow.keras.layers import (
    BatchNormalization,  # Importing layers
    Dense,
    Dropout,
    Flatten,
)

In [None]:
x_train_ig.shape[1], x_test_ig.shape[1]

In [None]:
def annModel(x_train, y_train, x_test, y_test, batch, epochs):
    """Build, compile and train a small dense network for binary classification.

    The network has an L2-regularised ReLU input layer as wide as the number
    of input columns, two 5-unit ReLU layers separated by batch normalisation,
    and a single sigmoid output unit. It is trained with Adam on binary
    cross-entropy, using ``(x_test, y_test)`` as validation data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : validation features and labels.
    batch : batch size passed to ``fit``.
    epochs : number of training epochs.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    n_features = x_train.shape[1]

    layers = [
        Dense(
            n_features,
            activation="relu",
            input_dim=n_features,
            kernel_regularizer=regularizers.l2(0.01),
        ),
        Dense(5, activation="relu"),
        BatchNormalization(),
        Dense(5, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    model = Sequential(layers)
    model.summary()

    model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])

    validation_pair = (x_test, y_test)
    model.fit(
        x_train,
        y_train,
        validation_data=validation_pair,
        batch_size=batch,
        epochs=epochs,
        verbose=1,
    )

    return model

In [None]:
annModel(x_train_ig, y_train, x_test_ig, y_test, 10, 10)

In [None]:
annModel(x_train_chi, y_train, x_test_chi, y_test, 10, 10)

In [None]:
x_train["male"].shape

In [None]:
ann_model = annModel(x_train, y_train, x_test, y_test, 10, 10)
ann_model

In [None]:
org_test_data.head()

In [None]:
# Sequential.predict_classes is no longer available in current Keras. For a
# single sigmoid output the class is obtained by thresholding the predicted
# probability at 0.5, which yields the same (n_samples, 1) int32 array.
y_pred = (ann_model.predict(org_test_data) > 0.5).astype("int32")

In [None]:
y_pred

In [None]:
org_test_pssg_id.head()

In [None]:
submission = pd.read_csv(
    "data/Tabular Playground Series - Apr 2021/sample_submission.csv"
)

In [None]:
submission.head()

In [None]:
# Start from the test PassengerIds, add the predicted label as a second
# column and write the submission file without the index column.
export_df = org_test_pssg_id.to_frame(name="PassengerId")
export_df["Survived"] = y_pred
submission_path = "data/Tabular Playground Series - Apr 2021/my_submission.csv"
export_df.to_csv(submission_path, index=False)

In [None]:
export_df