# My Playground

In [122]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import math
import pickle
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
start_time = time.time()

In [123]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, precision_score, plot_roc_curve
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier

In [124]:
df_train2 = pd.read_csv('data/train.csv')
df_test2 = pd.read_csv('data/test.csv')
df = pd.concat([df_train2, df_test2], axis=0)

# Preprocessing

In [125]:
df.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
64,956,,1,"Ryerson, Master. John Borie",male,13.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C


In [126]:
title_mapping = {
    'Mr.': 'Mr.',
    'Mrs.': 'Mrs.',
    'Miss.': 'Miss',
    'Ms.': 'Miss',
    'Master.': 'Master.',
    'Mlle.': 'Miss',
    'Mme.': 'Miss',
    'Dr.': 'Special_title',
    'Sir': 'Special_title',
    'Col.': 'Special_title',
    'Capt.': 'Special_title',
    'Don.': 'Special_title',
    'Major.': 'Special_title',
    'Jonkheer.': 'Special_title',
    'Rev.': 'Special_title',
    'Countess.': 'Special_title',
    'Lady.': 'Special_title',
}
df['Title'] = df['Name'].apply(lambda name: next((title_mapping[title] for title in title_mapping if title in name), None))

In [127]:
median_age_titles = df.groupby('Title')['Age'].transform('median').round(0)
df['Age'] = df['Age'].fillna(median_age_titles)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df['Fare'].fillna(8.0, inplace=True)
df['Title'].fillna(df['Title'].mode()[0], inplace=True)

In [128]:
df.drop(columns=['PassengerId', 'Ticket', 'SibSp', 'Parch', 'Cabin', 'Name', 'Age'], inplace=True)

**==================================================================================================================**

In [129]:
idx_split = len(df_train2)

df_train = df.iloc[:idx_split]
df_test = df.iloc[idx_split:].drop(columns=['Survived'])

X = df_train.drop(columns=['Survived'])
y = df_train['Survived']
X_validation = df_test.copy()

print(X.shape)
print(y.shape)
print(X_validation.shape)

(891, 5)
(891,)
(418, 5)


In [130]:
X.sample()

Unnamed: 0,Pclass,Sex,Fare,Embarked,Title
713,3,male,9.4833,S,Mr.


In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
algorithms_scaled_data = [
    LogisticRegression(),
    SVC(random_state=42),
    #KNeighborsClassifier()
]

algorithms_unscaled_data = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42),
    #AdaBoostClassifier(random_state=42)
]

algorithms = algorithms_scaled_data + algorithms_unscaled_data

In [133]:
pipelines = {}
base_models = []
num_features = X.select_dtypes(include=['number']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

for algorithm in algorithms:
    
    # extract algorithm name as a string
    algorithm_name = algorithm.__class__.__name__
    
    # Create a column transformer for handling categorical and numerical features separately 
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_features), 
            ('cat', OneHotEncoder(drop='first', handle_unknown="ignore"), cat_features) 
        ]
    )
    
    # make pipelines for algorithms need scaling and onehotencoding (defined in preprocessor)
    if algorithm in algorithms_scaled_data:
        pipeline = Pipeline(steps = [
                ('preprocessor', preprocessor),
                (algorithm_name, algorithm)
            ])

    # make pipelines for tree based algorithms (no need for scaling and onehot encoding, but only ordinalencoding)
    else:
        pipeline = Pipeline(steps=[
                ("ordinalencoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
                (algorithm_name, algorithm)
            ])
    
    # REUSABLE models
    # fill the dictionary with algo names and their corresponding pipelines for using later
    pipelines[algorithm_name] = pipeline
    base_models.append((algorithm_name, pipeline)) 
    
# For Example
#pipelines['LogisticRegression']

In [134]:
from sklearn.ensemble import StackingClassifier

In [135]:
meta_model = XGBClassifier(random_state=42)
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

In [136]:
stacked_model.fit(X_train, y_train)

In [137]:
y_pred = stacked_model.predict(X_test)

In [138]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacked Ensemble Accuracy: {accuracy}")

Stacked Ensemble Accuracy: 0.8044692737430168


In [139]:
model_scores = pd.DataFrame(columns=['Algorithm', 'Train_Accuracy', 'Test_Accuracy', 'Cross_Validate_Accuracy'])
algorithm_score = {}

# iterate through our stored pipelines in pipeline dictionary
for model_name, pipeline in pipelines.items():
    
    # fit the pipelines
    pipeline.fit(X_train, y_train)

    # make predictions
    y_pred = pipeline.predict(X_test)
    y_train_pred = pipeline.predict(X_train)
    
    # calculate the scores
    accuracy_test = accuracy_score(y_test, y_pred)
    accuracy_train = accuracy_score(y_train, y_train_pred)
    cv_score = cross_validate(pipeline, X_train, y_train, scoring='accuracy')['test_score'].mean()  
    
    # fill the scores dictionary with scores
    algorithm_score['Algorithm'] = model_name
    algorithm_score['Train_Accuracy'] = accuracy_train
    algorithm_score['Test_Accuracy'] = accuracy_test
    algorithm_score['Cross_Validate_Accuracy'] = cv_score
    
    # convert scores dictionary to dataframe
    model_scores = model_scores.append(algorithm_score, ignore_index=True)
    model_scores = model_scores.sort_values(by='Cross_Validate_Accuracy', ascending=False)
    
model_scores

Unnamed: 0,Algorithm,Train_Accuracy,Test_Accuracy,Cross_Validate_Accuracy
4,XGBClassifier,0.91573,0.832402,0.827312
0,SVC,0.841292,0.810056,0.818822
1,RandomForestClassifier,0.921348,0.815642,0.817483
2,DecisionTreeClassifier,0.921348,0.837989,0.813247
3,LogisticRegression,0.804775,0.793296,0.79348


In [140]:
XGBClassifier()

In [141]:
xgb_model = pipelines['XGBClassifier']

xgb_param_grid = {
    'XGBClassifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'XGBClassifier__n_estimators': [50, 100, 200],
    'XGBClassifier__max_depth': [3, 5, 7, 8, 9],
    'XGBClassifier__subsample': [0.8, 0.9, 1.0],
    'XGBClassifier__colsample_bytree': [0.8, 0.9, 1.0, 1.2],
}

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [142]:
xgb_grid_search.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [None]:
xgb_grid_search.best_params_

In [None]:
best_model = xgb_grid_search.best_estimator_
best_model

In [None]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)