In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

In [2]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
# test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)

# train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
# test_df = test_df.drop(['Name'], axis=1)

combine = [train_df, test_df]

In [3]:
# Enhanced feature engineering
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    # Create family size feature
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    # Create is_alone feature
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0
    
    # Extract title from name
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Group rare titles
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Extract ticket prefix and ticket length
    dataset['TicketPrefix'] = dataset['Ticket'].str.extract('([A-Za-z]+)', expand=False)
    dataset['TicketPrefix'] = dataset['TicketPrefix'].fillna('XXX')
    dataset['TicketLen'] = dataset['Ticket'].str.len()

    dataset['Age'] = pd.to_numeric(dataset['Age'], errors='coerce')
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    
    # Create age bands
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=[0, 16, 32, 48, 64, np.inf], labels=[0, 1, 2, 3, 4])
    dataset['FamilySizeCategory'] = pd.cut(dataset['FamilySize'], bins=[0, 1, 4, np.inf], labels=['Small', 'Medium', 'Large'])
    dataset['FamilySizeCategory'] = pd.Categorical(dataset['FamilySizeCategory']).codes

    
    # Create fare bands
    dataset['FareBand'] = pd.qcut(dataset['Fare'], 4)

In [4]:
# Advanced imputation
imputer = KNNImputer(n_neighbors=5)
train_df[['Age', 'Fare']] = imputer.fit_transform(train_df[['Age', 'Fare']])
test_df[['Age', 'Fare']] = imputer.transform(test_df[['Age', 'Fare']])

In [5]:
# Feature scaling
scaler = StandardScaler()
numeric_features = ['Age', 'Fare', 'FamilySize', 'TicketLen']
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

In [6]:
# Encode categorical variables
categorical_features = ['Sex', 'Embarked', 'Title', 'TicketPrefix', 'AgeBand', 'FareBand']
for feature in categorical_features:
    train_df[feature] = pd.Categorical(train_df[feature]).codes
    test_df[feature] = pd.Categorical(test_df[feature]).codes

In [7]:
# for dataset in combine:
#     dataset['Age*Class'] = dataset['Age'] * dataset['Pclass']
#     dataset['Fare*Class'] = dataset['Fare'] * dataset['Pclass']
#     dataset['Age*Title'] = dataset['Age'] * dataset['Title']

In [8]:
# Feature selection
selector = SelectKBest(f_classif, k=11)
X = train_df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_df['Survived']
X_selected = selector.fit_transform(X, y)

In [9]:
X_train = train_df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 15), (891,), (418, 15))

In [10]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


81.82

In [11]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

81.26

In [12]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

87.54

In [13]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

78.79

In [14]:
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

78.68

In [15]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc



81.14

In [16]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

66.78

In [17]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

98.65

In [18]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

98.65

In [19]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,98.65
8,Decision Tree,98.65
1,KNN,87.54
2,Logistic Regression,81.82
0,Support Vector Machines,81.26
7,Linear SVC,81.14
4,Naive Bayes,78.79
5,Perceptron,78.68
6,Stochastic Gradient Decent,66.78


In [20]:
# Create a pipeline with Random Forest
pipelines = {
    'Support Vector Machines': Pipeline([('classifier', SVC(random_state=42))]),
    'KNN': Pipeline([('classifier', KNeighborsClassifier())]),
    'Logistic Regression': Pipeline([('classifier', LogisticRegression(random_state=42))]),
    'Random Forest': Pipeline([('classifier', RandomForestClassifier(n_estimators=100, random_state=42))]),
    'Naive Bayes': Pipeline([('classifier', GaussianNB())]),
    'Perceptron': Pipeline([('classifier', Perceptron(random_state=42))]),
    'Stochastic Gradient Descent': Pipeline([('classifier', SGDClassifier(random_state=42))]),
    'Linear SVC': Pipeline([('classifier', LinearSVC(random_state=42))]),
    'Decision Tree': Pipeline([('classifier', DecisionTreeClassifier(random_state=42))])
}

In [21]:
# Define parameter grid for GridSearchCV
param_grid = {
    'Random Forest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    'LightGBM': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

for model_name, pipeline in pipelines.items():
    # Perform GridSearchCV
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if model_name in param_grid:
        grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=cv, scoring='accuracy', n_jobs=-1)
    else:
        grid_search = GridSearchCV(pipeline, {}, cv=cv, scoring='accuracy', n_jobs=-1)
    
    grid_search.fit(X_selected, y)
    
    # Get best model and its score
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    
    # print(f"{model_name} Accuracy: {best_score:.4f}")
    # print(f"Best Parameters: {grid_search.best_params_}")



In [22]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from scipy.stats import randint, uniform

# Remove the 'classifier__' prefix from the best parameters
rf_params = {k.replace('classifier__', ''): v for k, v in grid_search.best_params_.items()}

# Define parameter distributions for RandomizedSearchCV
rf_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(5, 30)),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10)
}

xgb_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

lgb_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 100),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform RandomizedSearchCV for each classifier
rf_random = RandomizedSearchCV(RandomForestClassifier(**rf_params), param_distributions=rf_param_dist, n_iter=50, cv=cv, random_state=42, n_jobs=-1)
xgb_random = RandomizedSearchCV(XGBClassifier(), param_distributions=xgb_param_dist, n_iter=50, cv=cv, random_state=42, n_jobs=-1)
lgb_random = RandomizedSearchCV(LGBMClassifier(verbose=-1), param_distributions=lgb_param_dist, n_iter=50, cv=cv, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
rf_random.fit(X_selected, y)
xgb_random.fit(X_selected, y)
lgb_random.fit(X_selected, y)

# Create ensemble with tuned models
ensemble = VotingClassifier(
    estimators=[
        ('rf', rf_random.best_estimator_),
        ('xgb', xgb_random.best_estimator_),
        ('lgb', lgb_random.best_estimator_)
    ],
    voting='soft'
)

# Evaluate ensemble
ensemble_scores = cross_val_score(ensemble, X_selected, y, cv=cv, scoring='accuracy')
print(f"Tuned Ensemble Model Average Accuracy: {ensemble_scores.mean():.4f}")

# Print best parameters for each model
print("Best RandomForest Parameters:", rf_random.best_params_)
print("Best XGBoost Parameters:", xgb_random.best_params_)
print("Best LightGBM Parameters:", lgb_random.best_params_)


Tuned Ensemble Model Average Accuracy: 0.8541
Best RandomForest Parameters: {'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 104}
Best XGBoost Parameters: {'colsample_bytree': 0.6624074561769746, 'learning_rate': 0.05679835610086079, 'max_depth': 5, 'n_estimators': 187, 'subsample': 0.7334834444556088}
Best LightGBM Parameters: {'colsample_bytree': 0.7394663949166917, 'learning_rate': 0.03885296532742623, 'max_depth': 3, 'n_estimators': 227, 'num_leaves': 58, 'subsample': 0.9746919954946938}
