In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint
from random import randrange, uniform

import warnings
warnings.filterwarnings('ignore')

In [49]:
from sklearn import set_config

set_config(display='diagram')

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
df = pd.read_csv('train.csv')

In [8]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [9]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
df.duplicated().sum()

0

In [11]:
class TotalFamily(BaseEstimator, TransformerMixin):
    
    def __init__(self) -> None:
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        
        # Calculate total family members and create a new column for it
        total_family_members = X[:, 5] + X[:, 6] + 1
        X = np.column_stack((X, total_family_members))
        X = np.delete(X, [5, 6], axis=1)
        return X
    


class IndividualFare(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X = X.copy()
        
        individual_fare = X[:, 5] / X[:, 6]
        
        X = np.column_stack((X, individual_fare))
        X = np.delete(X, [5], axis=1)
        
        return X

In [12]:
drop_cols = [2, 7, 9]
impute_col = [3]
impute_emb = [7]
encode_col = [0, 4]

In [13]:
drop_column_transformer = ColumnTransformer(transformers=[('drop_col', 'drop', drop_cols)], remainder='passthrough')
impute_column_transformer = ColumnTransformer(transformers=[('imputer', SimpleImputer(strategy='mean'), impute_col)], remainder='passthrough')
impute_emp_column_transformer = ColumnTransformer(transformers=[('imputer', SimpleImputer(strategy='most_frequent'), impute_emb)], remainder='passthrough')
encoding_column_transformer = ColumnTransformer(transformers=[('encode', OneHotEncoder(sparse_output=False, drop='first'), encode_col)], remainder='passthrough')

In [14]:
pipeline = Pipeline([
    ('drop', drop_column_transformer),
    ('impute_age', impute_column_transformer),
    ('impute_emb', impute_emp_column_transformer),
    ('family', TotalFamily()),
    ('fare', IndividualFare()),
    ('encode', encoding_column_transformer)
])

In [15]:
X,y = df.drop(columns='Survived'), df.Survived

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Logistic Regression

In [17]:
logistic_pipeline = Pipeline([
    ('pipeline', pipeline),
    ('logistic', LogisticRegression(n_jobs=-1))
])

In [18]:
logistic_param_list = {
    'logistic': [LogisticRegression()],
    'logistic__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'logistic__C': range(0, 13),
    'logistic__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
}

In [19]:
logistic_grid_search = GridSearchCV(estimator=logistic_pipeline, param_grid=logistic_param_list, cv=5)

In [20]:
logistic_model = logistic_grid_search.fit(X_train, y_train)

In [53]:
logistic_model

In [21]:
logistic_model.best_score_

0.7991825076332117

In [22]:
logistic_model.best_params_

{'logistic': LogisticRegression(),
 'logistic__C': 1,
 'logistic__penalty': 'l1',
 'logistic__solver': 'liblinear'}

### Gradient Bosster Classfier

In [23]:
gradient_pipeline = Pipeline([
    ('pipeline', pipeline),
    ('gradient', GradientBoostingClassifier())
])

In [42]:
random_param_dist = {
    'gradient__n_estimators': [50, 100, 200, 300, 400, 500],  # Example values, adjust as needed
    'gradient__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'gradient__max_depth': [3, 4, 5, 6, 7, 8, 9],  # Example values, adjust as needed
    'gradient__min_samples_split': [2, 3, 4, 5, 6, 7],  # Example values, adjust as needed
    'gradient__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]  # Example values, adjust as needed
}

In [43]:
gradient_random_search = RandomizedSearchCV(estimator=gradient_pipeline, param_distributions=random_param_dist, cv=5, verbose=0)

In [44]:
random_model = gradient_random_search.fit(X_train, y_train)

In [52]:
random_model

In [45]:
random_model.best_score_

0.8131685216192259

In [46]:
random_model.best_params_

{'gradient__n_estimators': 50,
 'gradient__min_samples_split': 4,
 'gradient__min_samples_leaf': 3,
 'gradient__max_depth': 4,
 'gradient__learning_rate': 0.1}