In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

combine = [train_df, test_df]

# Enhanced feature engineering
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    # Create family size feature
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    # Create is_alone feature
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0
    
    # Extract title from name
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    # Group rare titles
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Extract ticket prefix and ticket length
    dataset['TicketPrefix'] = dataset['Ticket'].str.extract('([A-Za-z]+)', expand=False)
    dataset['TicketPrefix'] = dataset['TicketPrefix'].fillna('XXX')
    dataset['TicketLen'] = dataset['Ticket'].str.len()

    dataset['Age'] = pd.to_numeric(dataset['Age'], errors='coerce')
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    
    # Create age bands
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=[0, 16, 32, 48, 64, np.inf], labels=[0, 1, 2, 3, 4])
    dataset['FamilySizeCategory'] = pd.cut(dataset['FamilySize'], bins=[0, 1, 4, np.inf], labels=['Small', 'Medium', 'Large'])
    dataset['FamilySizeCategory'] = pd.Categorical(dataset['FamilySizeCategory']).codes
    
    # Create fare bands
    dataset['FareBand'] = pd.qcut(dataset['Fare'], 4)
    
    # Create fare per person feature
    dataset['FarePerPerson'] = dataset['Fare'] / dataset['FamilySize']
    
    # Extract deck information from Cabin
    dataset['Deck'] = dataset['Cabin'].str[0]
    dataset['Deck'] = dataset['Deck'].fillna('Unknown')

# Advanced imputation
imputer = KNNImputer(n_neighbors=5)
train_df[['Age', 'Fare']] = imputer.fit_transform(train_df[['Age', 'Fare']])
test_df[['Age', 'Fare']] = imputer.transform(test_df[['Age', 'Fare']])

# Feature scaling
scaler = StandardScaler()
numeric_features = ['Age', 'Fare', 'FamilySize', 'TicketLen', 'FarePerPerson']
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Encode categorical variables
categorical_features = ['Sex', 'Embarked', 'Title', 'TicketPrefix', 'AgeBand', 'FareBand', 'Deck']
for feature in categorical_features:
    train_df[feature] = pd.Categorical(train_df[feature]).codes
    test_df[feature] = pd.Categorical(test_df[feature]).codes

# Feature selection
selector = SelectKBest(f_classif, k=15)
X = train_df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_df['Survived']
X_selected = selector.fit_transform(X, y)

# Create a pipeline with Random Forest
pipelines = {
    'Support Vector Machines': Pipeline([('classifier', SVC(random_state=42))]),
    'KNN': Pipeline([('classifier', KNeighborsClassifier())]),
    'Logistic Regression': Pipeline([('classifier', LogisticRegression(random_state=42))]),
    'Random Forest': Pipeline([('classifier', RandomForestClassifier(n_estimators=100, random_state=42))]),
    'Naive Bayes': Pipeline([('classifier', GaussianNB())]),
    'Perceptron': Pipeline([('classifier', Perceptron(random_state=42))]),
    'Stochastic Gradient Descent': Pipeline([('classifier', SGDClassifier(random_state=42))]),
    'Linear SVC': Pipeline([('classifier', LinearSVC(random_state=42))]),
    'Decision Tree': Pipeline([('classifier', DecisionTreeClassifier(random_state=42))]),
    'XGBoost': Pipeline([('classifier', XGBClassifier(random_state=42))]),
    'LightGBM': Pipeline([('classifier', LGBMClassifier(random_state=42))]),
    'CatBoost': Pipeline([('classifier', CatBoostClassifier(random_state=42, verbose=0))])
}

# Define parameter grid for GridSearchCV
param_grid = {
    'Random Forest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    'LightGBM': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

for model_name, pipeline in pipelines.items():
    # Perform GridSearchCV
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    if model_name in param_grid:
        grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=cv, scoring='accuracy', n_jobs=-1)
    else:
        grid_search = GridSearchCV(pipeline, {}, cv=cv, scoring='accuracy', n_jobs=-1)
    
    grid_search.fit(X_selected, y)
    
    # Get best model and its score
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    
    print(f"{model_name} Accuracy: {best_score:.4f}")
    # print(f"Best Parameters: {grid_search.best_params_}")

# Ensemble model with best models
rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=42)
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
lgb_model = LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)

ensemble = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model), ('lgb', lgb_model)],
    voting='soft'
)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

ensemble_scores = cross_val_score(ensemble, X_selected, y, cv=cv, scoring='accuracy')
print(f"Ensemble Model Average Accuracy: {ensemble_scores.mean():.4f}")

Support Vector Machines Accuracy: 0.8159
KNN Accuracy: 0.8282
Logistic Regression Accuracy: 0.8092


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.8428
Naive Bayes Accuracy: 0.7542
Perceptron Accuracy: 0.6788
Stochastic Gradient Descent Accuracy: 0.7721




Linear SVC Accuracy: 0.8159
Decision Tree Accuracy: 0.7767
XGBoost Accuracy: 0.8406
[LightGBM] [Info] Number of positive: 308, number of negative: 494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 388
[LightGBM] [Info] Number of data points in the train set: 802, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384040 -> initscore=-0.472436
[LightGBM] [Info] Start training from score -0.472436
[LightGBM] [Info] Number of positive: 308, number of negative: 494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.[LightGBM] [Info] Number of positive: 308, number of negative: 494
[Light