In [192]:
# Imports
import os
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import (
    ensemble, gaussian_process, linear_model, naive_bayes,
    neighbors, svm, tree, discriminant_analysis
)
from xgboost import XGBClassifier

# Suppress warnings
warnings.filterwarnings("ignore")

In [193]:
# Suppress warnings
warnings.filterwarnings("ignore")

# Load data
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train = pd.read_csv('input/train_cleaned.csv')
#test = pd.read_csv('input/test_cleaned.csv')
#train = pd.read_csv('input/combined_cleaned.csv')

input/test.csv
input/train_cleaned.csv
input/test_cleaned.csv
input/titanic_data.csv
input/combined_cleaned.csv
input/train.csv


In [194]:
train.sample(5)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,Title,Family_size
259,50,S,26.0,260,2,female,1,Mrs,Small
109,22,Q,24.15,110,3,female,1,Miss,Small
164,1,S,39.6875,165,3,male,0,Master,Large
18,31,S,18.0,19,3,female,0,Mrs,Small
562,28,S,13.5,563,2,male,0,Mr,Alone


In [196]:
print(train.dtypes)

Age              int64
Embarked        object
Fare           float64
PassengerId      int64
Pclass           int64
Sex             object
Survived         int64
Title           object
Family_size     object
dtype: object


In [126]:
df.sample(5)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,Title,Family_size
965,35,C,211.5,966,1,female,0,Miss,Alone
709,25,C,15.2458,710,3,male,1,Master,Small
1293,22,C,59.4,1294,1,female,0,Miss,Small
1298,50,C,211.5,1299,1,male,0,Mr,Small
1108,57,S,164.8667,1109,1,male,0,Mr,Small


In [127]:
# Prepare target and drop unnecessary columns
train['Survived'] = train['Survived'].astype('int64')
train.drop("PassengerId", axis=1, inplace=True)

In [128]:
# Split features and target
X = train.drop("Survived", axis=1)
y = train["Survived"]

In [129]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [130]:
# Data Transformation Pipelines
num_cat_transformation = ColumnTransformer([
    ('scaling', MinMaxScaler(), [0, 2]),
    ('onehot_encoding1', OneHotEncoder(), [1, 3]),
    ('ordinal_encoding', OrdinalEncoder(), [4]),
    ('onehot_encoding2', OneHotEncoder(), [5, 6])
], remainder='passthrough')

In [131]:
bins = ColumnTransformer([
    ('Kbins', KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='quantile'), [0, 2]),
], remainder='passthrough')

In [132]:
# Function to create a pipeline for each model
def create_pipeline(algo):
    return Pipeline([
        ('num_cat_transformation', num_cat_transformation),
        ('bins', bins),
        ('classifier', algo)
    ])

In [133]:
# List of classification algorithms to evaluate
algorithms = [
    # Ensemble
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLMs
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVMs
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # XGBoost
    XGBClassifier()
]

In [134]:
# Evaluate all models with cross-validation
model_names = []
cv_accuracies = []

for algo in algorithms:
    pipeline = create_pipeline(algo)
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    model_names.append(algo.__class__.__name__)
    cv_accuracies.append(scores.mean())

# Show cross-validation results
model_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': cv_accuracies
}).sort_values(by='Accuracy', ascending=False)

print(model_df)

                            Model  Accuracy
8               RidgeClassifierCV  0.828622
3      GradientBoostingClassifier  0.828602
19     LinearDiscriminantAnalysis  0.827214
6            LogisticRegressionCV  0.822988
5       GaussianProcessClassifier  0.822978
16                      LinearSVC  0.821600
0              AdaBoostClassifier  0.820221
15                          NuSVC  0.808914
13           KNeighborsClassifier  0.807515
21                  XGBClassifier  0.804806
14                            SVC  0.804718
4          RandomForestClassifier  0.800552
11                    BernoulliNB  0.799094
1               BaggingClassifier  0.796375
17         DecisionTreeClassifier  0.779474
2            ExtraTreesClassifier  0.773899
10                     Perceptron  0.768157
18            ExtraTreeClassifier  0.766946
12                     GaussianNB  0.766768
9                   SGDClassifier  0.713671
7     PassiveAggressiveClassifier  0.711849
20  QuadraticDiscriminantAnalysi

In [None]:
algo1 = ensemble.RandomForestClassifier(
     criterion='gini',
     n_estimators=1750,
     max_depth=7,
     min_samples_split=6,
     min_samples_leaf=6,
     max_features='sqrt',
     oob_score=True,
     random_state=42,
     n_jobs=-1,
     verbose=1
 )

algo2 = ensemble.GradientBoostingClassifier(
     max_depth=1,
     max_features='sqrt',
     n_estimators=3,
     random_state=42,
     warm_start=True
 )

lda = discriminant_analysis.LinearDiscriminantAnalysis()

xgb_model = XGBClassifier()

In [108]:
#Fit and evaluate XGBoost pipeline
xgb_pipeline = Pipeline([
     ('num_cat_transformation', num_cat_transformation),
     ('bins', bins),

     ('classifier', lda)
     ])

In [109]:
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)
print("XGBoost Test Accuracy:", accuracy_score(y_test, y_pred))

XGBoost Test Accuracy: 0.7621951219512195


In [160]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('ridge', linear_model.RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), cv=5)),
        ('lda', discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')),
        ('gb', ensemble.GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=4, random_state=42)),
    ],
    voting='hard'
)

ensemble_pipeline = Pipeline([
    ('num_cat_transformation', num_cat_transformation),
    ('bins', bins),
    ('classifier', voting_clf)
])

ensemble_pipeline.fit(X_train, y_train)
y_pred_ensemble = ensemble_pipeline.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_ensemble))


Voting Classifier Accuracy: 0.8212290502793296


In [161]:
import joblib

joblib.dump(ensemble_pipeline, 'model.joblib')

['model.joblib']

In [197]:
import pandas as pd

custom_input = pd.DataFrame([{
    'Age': 25,
    'Sex': 'male',
    'Fare': 50,
    'Embarked': 'C',
    'Pclass': 1,
    'Title': 'Mr',
    'Family_size': 'Small',
}])


In [198]:
y_pred = grid.best_estimator_.predict(custom_input)
print("Prediction for custom input:", y_pred)


Prediction for custom input: [0]


Prediction for custom input: [0]
