In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# To train a Random Forest Model which predicts Titanic survivors

In [None]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

In [None]:
# Columns that are not required by the rest of the notebook.
COLUMNS_TO_DROP = ["PassengerId", "Name", "Ticket"]

# Read the CSV and remove the unneeded columns in one chained expression.
titanic = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv").drop(columns=COLUMNS_TO_DROP)


In [None]:
# Convert every numeric column to Python float, i.e. NumPy float64
# (note: astype(float) does not produce float32).
numeric_cols = titanic.select_dtypes(include="number").columns
titanic = titanic.astype({name: float for name in numeric_cols})

In [None]:
# Add a small number to every numeric column except the "Survived" label to
# avoid divide-by-zero errors.
# NOTE(review): re-running this cell on its own adds the offset a second time.
feature_cols = [
    name
    for name in titanic.select_dtypes(include="number").columns
    if name != "Survived"
]
epsilon_shifted = {name: titanic[name] + 1e-10 for name in feature_cols}
titanic = titanic.assign(**epsilon_shifted)

In [None]:
# Extract summary information from the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
titanic.info()
print(titanic.describe())
# Joint frequency of each value combination across all object-dtype columns.
print(titanic.select_dtypes(include="object").value_counts())
# Per-column frequencies of the individual categorical columns.
print(titanic["Sex"].value_counts())
print(titanic["Cabin"].value_counts())
print(titanic["Embarked"].value_counts())

In [None]:
# Visualize data: histograms with 50 bins each on a 12x8 figure.
hist_kwargs = dict(bins=50, figsize=(12, 8))
titanic.hist(**hist_kwargs)
plt.show()

In [None]:
# Check for correlations: correlate the numeric columns with each other, then
# list each column's correlation with "Survived" from highest to lowest.
numeric_view = titanic.select_dtypes(include="number")
corr_matrix = numeric_view.corr()
survived_corr = corr_matrix["Survived"].sort_values(ascending=False)
print(survived_corr)

In [None]:
from pandas.plotting import scatter_matrix
# Create a scatter matrix of the label and the listed passenger attributes.
attributes = ["Survived", "Fare", "Parch", "SibSp", "Age", "Pclass"]
selected = titanic.loc[:, attributes]
scatter_matrix(selected, figsize=(12, 8))
plt.show()


# Create stratified train and test set based on Sex and Embarked

In [None]:
from sklearn.model_selection import train_test_split

# Build the stratification key (Sex concatenated with Embarked) as a standalone
# Series instead of a scratch column on `titanic`. This keeps the source frame
# untouched and removes the need for in-place drops on the split results,
# which can raise SettingWithCopyWarning in pandas versions before 3.0.
strat_key = titanic["Sex"].astype(str) + titanic["Embarked"].astype(str)

# Train/test split: hold out 10% of the rows, stratified on the key above.
train_set, test_set = train_test_split(
    titanic, test_size=0.1, stratify=strat_key, random_state=42)

# The shapes now reflect the real columns only (no scratch column).
print(train_set.shape)
print(test_set.shape)

In [None]:
# Divide the train and test sets into model inputs (everything except
# "Survived") and labels (an independent copy of the "Survived" column).
train_inputs, train_labels = train_set.drop(columns="Survived"), train_set["Survived"].copy()
test_inputs, test_labels = test_set.drop(columns="Survived"), test_set["Survived"].copy()


In [None]:
#Write custom class to detect Cluster Similarity
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to their RBF similarity to K-Means centers.

    ``fit`` runs K-Means on the input; ``transform`` returns one column per
    cluster holding the RBF-kernel similarity between each sample and that
    cluster's center.

    Parameters
    ----------
    n_clusters : number of K-Means clusters (and of output columns).
    n_init : forwarded to ``KMeans`` as ``n_init``.
    gamma : forwarded to ``rbf_kernel`` as ``gamma``.
    random_state : forwarded to ``KMeans`` as ``random_state``.
    """

    def __init__(self, n_clusters=10, n_init=10, gamma=1.0, random_state=None):
        # Hyper-parameters are stored unchanged under their own names.
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit K-Means on ``X`` (optionally weighted) and return ``self``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of every row of ``X`` to each fitted center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is accepted but unused."""
        labels = []
        for idx in range(self.n_clusters):
            labels.append(f"Cluster {idx} similarity")
        return labels

In [None]:
#custom functions for ratio pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector

def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    Both columns are selected with list indices, so each operand (and the
    result) keeps a trailing axis of length one.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio step: always ``["ratio"]``.

    Both arguments are accepted to match the callback signature used by
    ``FunctionTransformer(feature_names_out=...)`` and are otherwise ignored.
    """
    output_names = ["ratio"]  # feature names out
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median-impute, take the column ratio, standardize.

    A fresh pipeline object is created on every call.
    """
    steps = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)

# Log pipeline: median-impute, take the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Cluster-similarity transformer (10 clusters, fixed seed).
cluster_simil = ClusterSimilarity(n_clusters=10, n_init=10, gamma=1., random_state=42)

# Default numeric handling: median-impute, then standardize.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical handling: fill with the most frequent value, then one-hot
# encode, ignoring categories not seen during fitting.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Each ratio feature gets its own freshly built ratio pipeline.
ratio_specs = [
    ("FareByPclass", ["Fare", "Pclass"]),
    ("AgeBySibSp", ["Age", "SibSp"]),
    ("AgeByParch", ["Age", "Parch"]),
]
transformers = [(label, ratio_pipeline(), cols) for label, cols in ratio_specs]
transformers += [
    ("log", log_pipeline, ["Age", "Fare"]),
    ("geo", cluster_simil, ["Fare"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
]

# Columns not claimed by any transformer above go through the default
# numeric pipeline.
preprocessing_complex = ColumnTransformer(transformers, remainder=default_num_pipeline)

# Let's Transform Data

In [None]:
# Prepare training data: fit the preprocessing on the training inputs and
# transform them in one step.
train_inputs_processed = preprocessing_complex.fit_transform(train_inputs)

processed_shape = train_inputs_processed.shape
print(processed_shape)

# Final expression of the cell: the generated output feature names.
preprocessing_complex.get_feature_names_out()

In [None]:
#using random search 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# End-to-end model: preprocessing followed by a seeded random forest.
forest = RandomForestClassifier(random_state=42)
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing_complex),
    ("random_forest", forest),
])

# Sample the forest's max_features uniformly from the integers 2..199.
# NOTE(review): the upper bound may exceed the number of preprocessed
# features in some folds -- confirm how RandomForestClassifier treats that.
param_distribs = {"random_forest__max_features": randint(low=2, high=200)}

# 50 random draws, each scored by 3-fold cross-validated accuracy.
rnd_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distribs,
    n_iter=50,
    cv=3,
    scoring="accuracy",
    random_state=42,
)

rnd_search.fit(train_inputs, train_labels)

In [None]:
# Get the best model from the random search.
final_model = rnd_search.best_estimator_

# Get the important features: pair each importance with its feature name and
# print the pairs from most to least important.
forest_step = final_model["random_forest"]
preprocessing_step = final_model["preprocessing"]
feature_importance = forest_step.feature_importances_
feature_names = preprocessing_step.get_feature_names_out()
importance_ranking = sorted(zip(feature_importance, feature_names), reverse=True)
print(importance_ranking)



In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Predict on the held-out test inputs with the tuned pipeline.
final_predictions = final_model.predict(test_inputs)

# Score the predictions against the true test labels.
final_accuracy = accuracy_score(test_labels, final_predictions)
final_f1_score = f1_score(test_labels, final_predictions)
cm = confusion_matrix(test_labels, final_predictions)

# Report the scalar metrics with two decimals.
for template, score in (
    ("Final Accuracy is %.2f", final_accuracy),
    ("Final f1_score is %.2f", final_f1_score),
):
    print(template % score)

# Confusion matrix, a blank separator line, then the raw predictions.
print("Final Confusion Matrix is", cm, "", "Final Predictions are", final_predictions, sep="\n")

