In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier

In [19]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [24]:
spaceship_data = pd.read_csv('train.csv')
spaceship_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [25]:
def check_missing_values(data):
    """
    Check for missing values in the dataset.
    """
    missing_values = data.isnull().sum()
    return missing_values[missing_values > 0]

def fill_missing_values(data):
    """
    Fill missing values in the dataset.
    - Drop rows with missing values in critical columns.
    - Fill categorical columns with the mode.
    - Fill numerical columns with the median.
    """
    # Drop rows with missing values in critical columns
    # data = data.dropna(subset=['HomePlanet', 'CryoSleep', 'Cabin', 'Destination'])

    # Fill missing categorical values with the mode
    for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']:
        data[column].fillna(data[column].mode()[0], inplace=True)

    # Fill missing numerical values with the median
    for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        data[column].fillna(data[column].median(), inplace=True)

    return data

def feature_engineering(data):
    """
    Perform feature engineering on the dataset.
    - Split 'Cabin' into 'Deck', 'CabinNumber', and 'Side'.
    - Create 'TotalSpend' feature.
    - Create 'AgeGroup' feature.
    """
    # Split 'Cabin' into 'Deck', 'CabinNumber', and 'Side'
    data[['Deck', 'CabinNumber', 'Side']] = data['Cabin'].astype(str).str.split('/', expand=True)
    data['CabinNumber'] = pd.to_numeric(data['CabinNumber'], errors='coerce')
    data = data.drop(columns=['Cabin'])

    # Create 'TotalSpend' feature
    data['TotalSpend'] = (
        data['RoomService'] +
        data['FoodCourt'] +
        data['ShoppingMall'] +
        data['Spa'] +
        data['VRDeck']
    )

    # Create age groups
    bins = [0, 12, 18, 35, 60, 100]
    labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)

    return data

def encode_categorical_features(data):
    """
    Encode categorical features using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    for column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name', 'Deck', 'Side', 'AgeGroup']:
        data[column] = label_encoder.fit_transform(data[column].astype(str))
    
    return data

def check_target_balance(data, target_column):
    """
    Check the balance of the target variable.
    """
    target_balance = data[target_column].value_counts(normalize=True)
    return target_balance

def preprocess_data(data, target_column='Transported'):
    """
    Perform the full preprocessing pipeline on the dataset.
    """
    # Step 1: Check and fill missing values
    data = fill_missing_values(data)

    # Step 2: Perform feature engineering
    data = feature_engineering(data)

    # Step 3: Encode categorical variables
    data = encode_categorical_features(data)

    # Step 4: Check target variable balance
    target_balance = check_target_balance(data, target_column)
    print("Target Balance:\n", target_balance)

    return data

def prediction_results(model, data_path='test.csv', output_path='predictions.csv'):
    """
    Process the dataset, make predictions using the provided model, and save the results to a CSV file.
    
    Parameters:
    - data_path: Path to the dataset to be processed.
    - model: Trained machine learning model to use for predictions.
    - output_path: Path to save the prediction results.
    """
    # Load the dataset
    data = pd.read_csv(data_path)
    data['Transported'] = False
    data = preprocess_data(data)
    passenger_ids = data['PassengerId']
    data = data.drop(columns=['PassengerId', 'Transported'])
    predictions = model.predict(data)
    
    # Ensure predictions are a 2D array
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)
    
    # Convert float predictions to 'True' or 'False' if needed
    if predictions.dtype == np.float32:
        predictions = np.where(predictions > 0.5, 'True', 'False')
    # Create a DataFrame with PassengerId and Transported predictions
    output = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': predictions.flatten()})
    output.to_csv(output_path, index=False)
    
    return output

check_missing_values(spaceship_data)
spaceship_data = preprocess_data(spaceship_data)

# Split the dataset into features and target
X = spaceship_data.drop(columns=['PassengerId', 'Transported'])
y = spaceship_data['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Target Balance:
 Transported
True     0.503624
False    0.496376
Name: proportion, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

# Models
1. Random Forrest

In [4]:
# Define the parameter grid for RandomForest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)
best_params, accuracy_best, report_best

Fitting 3 folds for each of 324 candidates, totalling 972 fits


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
219 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\L

({'max_depth': 20,
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 200},
 0.7872340425531915,
 '              precision    recall  f1-score   support\n\n       False       0.79      0.78      0.78       861\n        True       0.79      0.79      0.79       878\n\n    accuracy                           0.79      1739\n   macro avg       0.79      0.79      0.79      1739\nweighted avg       0.79      0.79      0.79      1739\n')

### Predictions

In [5]:
prediction_results(model=best_model)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

Target Balance:
 Transported
False    1.0
Name: proportion, dtype: float64


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


2. Ensemble methods (Stacking), improved from 80% to 80.36%

In [6]:
# 1. Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
report_gb = classification_report(y_test, y_pred_gb)

# 2. XGBoost
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

# 3. Stacking Classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=3)
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
report_stack = classification_report(y_test, y_pred_stack)

# Results
print("Gradient Boosting Accuracy:", accuracy_gb)
print(report_gb)
print("XGBoost Accuracy:", accuracy_xgb)
print(report_xgb)
print("Stacking Classifier Accuracy:", accuracy_stack)
print(report_stack)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Gradient Boosting Accuracy: 0.7924094307073031
              precision    recall  f1-score   support

       False       0.82      0.74      0.78       861
        True       0.77      0.84      0.80       878

    accuracy                           0.79      1739
   macro avg       0.80      0.79      0.79      1739
weighted avg       0.80      0.79      0.79      1739

XGBoost Accuracy: 0.7975848188614146
              precision    recall  f1-score   support

       False       0.82      0.75      0.79       861
        True       0.78      0.84      0.81       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739

Stacking Classifier Accuracy: 0.7987349051178838
              precision    recall  f1-score   support

       False       0.82      0.76      0.79       861
        True       0.78      0.83      0.81       878

    accuracy                           0.80      

In [7]:
prediction_results(model=stacking_model)

Target Balance:
 Transported
False    1.0
Name: proportion, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


4. Gradient boost with grid search

In [8]:
# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}


gb_model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_gb_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_gb = best_gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {accuracy_gb}")

# Classification report
report_gb = classification_report(y_test, y_pred_gb)
report_gb

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Accuracy: 0.7981598619896493


'              precision    recall  f1-score   support\n\n       False       0.80      0.78      0.79       861\n        True       0.79      0.81      0.80       878\n\n    accuracy                           0.80      1739\n   macro avg       0.80      0.80      0.80      1739\nweighted avg       0.80      0.80      0.80      1739\n'

In [9]:
prediction_results(model=best_gb_model)

Target Balance:
 Transported
False    1.0
Name: proportion, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


### Neural networks

In [6]:
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())  # Add Batch Normalization for faster convergence
model.add(Dropout(0.4))  # Increased dropout rate to prevent overfitting
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with a lower learning rate
optimizer = Adam(learning_rate=0.0005)  # Lower learning rate for more fine-tuned updates
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping with increased patience
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, validation_split=0.2, epochs=200, batch_size=32, callbacks=[early_stopping], verbose=2)

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


174/174 - 3s - 17ms/step - accuracy: 0.6700 - loss: 0.6436 - val_accuracy: 0.7872 - val_loss: 0.4874
Epoch 2/200
174/174 - 0s - 2ms/step - accuracy: 0.7322 - loss: 0.5727 - val_accuracy: 0.7973 - val_loss: 0.4731
Epoch 3/200
174/174 - 0s - 2ms/step - accuracy: 0.7480 - loss: 0.5514 - val_accuracy: 0.7973 - val_loss: 0.4757
Epoch 4/200
174/174 - 0s - 3ms/step - accuracy: 0.7570 - loss: 0.5309 - val_accuracy: 0.7958 - val_loss: 0.4825
Epoch 5/200
174/174 - 0s - 2ms/step - accuracy: 0.7643 - loss: 0.5254 - val_accuracy: 0.8081 - val_loss: 0.4636
Epoch 6/200
174/174 - 0s - 3ms/step - accuracy: 0.7625 - loss: 0.5287 - val_accuracy: 0.8066 - val_loss: 0.4725
Epoch 7/200
174/174 - 0s - 2ms/step - accuracy: 0.7665 - loss: 0.5294 - val_accuracy: 0.8001 - val_loss: 0.4765
Epoch 8/200
174/174 - 0s - 2ms/step - accuracy: 0.7694 - loss: 0.5146 - val_accuracy: 0.8037 - val_loss: 0.4665
Epoch 9/200
174/174 - 0s - 3ms/step - accuracy: 0.7712 - loss: 0.5125 - val_accuracy: 0.7980 - val_loss: 0.4727
Epo

In [26]:
prediction_results(model=model)

Target Balance:
 Transported
False    1.0
Name: proportion, dtype: float64
[1m119/134[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 863us/step

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 881us/step
float32
[['True']
 ['False']
 ['True']
 ...
 ['True']
 ['True']
 ['True']]


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


5. Stacking with nn

In [36]:
# Define RandomForest parameter grid and model
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# Define Gradient Boosting parameter grid and model
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
gb_model = GradientBoostingClassifier(random_state=42)
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
gb_grid_search.fit(X_train, y_train)
best_gb_model = gb_grid_search.best_estimator_

# Define the Keras model for Neural Network
def create_nn_model():
    model = Sequential()
    model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap Keras model with KerasClassifier
nn_model = KerasClassifier(build_fn=create_nn_model, epochs=200, batch_size=32, validation_split=0.2, verbose=2, callbacks=[EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)])

# Define Stacking Classifier with the best models
stacking_model = StackingClassifier(
    estimators=[
        ('rf', best_rf_model),
        ('gb', best_gb_model),
        ('nn', nn_model)
    ],
    final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    cv=3
)

# Fit the Stacking model
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print(f"Stacking Model Accuracy: {accuracy_stack}")

# Classification report
report_stack = classification_report(y_test, y_pred_stack)
print(report_stack)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
195 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\L

Fitting 3 folds for each of 27 candidates, totalling 81 fits


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
174/174 - 7s - 43ms/step - accuracy: 0.6852 - loss: 0.6174 - val_accuracy: 0.7965 - val_loss: 0.4835
Epoch 2/200
174/174 - 1s - 4ms/step - accuracy: 0.7322 - loss: 0.5724 - val_accuracy: 0.7879 - val_loss: 0.4851
Epoch 3/200
174/174 - 1s - 4ms/step - accuracy: 0.7465 - loss: 0.5478 - val_accuracy: 0.8030 - val_loss: 0.4771
Epoch 4/200
174/174 - 1s - 4ms/step - accuracy: 0.7494 - loss: 0.5466 - val_accuracy: 0.8081 - val_loss: 0.4596
Epoch 5/200
174/174 - 1s - 4ms/step - accuracy: 0.7656 - loss: 0.5236 - val_accuracy: 0.8095 - val_loss: 0.4659
Epoch 6/200
174/174 - 1s - 4ms/step - accuracy: 0.7631 - loss: 0.5280 - val_accuracy: 0.8023 - val_loss: 0.4691
Epoch 7/200
174/174 - 1s - 4ms/step - accuracy: 0.7701 - loss: 0.5211 - val_accuracy: 0.8023 - val_loss: 0.4652
Epoch 8/200
174/174 - 1s - 4ms/step - accuracy: 0.7713 - loss: 0.5149 - val_accuracy: 0.8059 - val_loss: 0.4668
Epoch 9/200
174/174 - 1s - 3ms/step - accuracy: 0.7719 - loss: 0.5155 - val_accuracy: 0.8066 - val_loss

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


116/116 - 3s - 24ms/step - accuracy: 0.6591 - loss: 0.6570 - val_accuracy: 0.7791 - val_loss: 0.4965
Epoch 2/200
116/116 - 0s - 3ms/step - accuracy: 0.7128 - loss: 0.5822 - val_accuracy: 0.7877 - val_loss: 0.4865
Epoch 3/200
116/116 - 0s - 3ms/step - accuracy: 0.7322 - loss: 0.5615 - val_accuracy: 0.7877 - val_loss: 0.4895
Epoch 4/200
116/116 - 0s - 3ms/step - accuracy: 0.7546 - loss: 0.5479 - val_accuracy: 0.7909 - val_loss: 0.4808
Epoch 5/200
116/116 - 0s - 3ms/step - accuracy: 0.7522 - loss: 0.5424 - val_accuracy: 0.7931 - val_loss: 0.4809
Epoch 6/200
116/116 - 0s - 3ms/step - accuracy: 0.7530 - loss: 0.5365 - val_accuracy: 0.7942 - val_loss: 0.4908
Epoch 7/200
116/116 - 0s - 3ms/step - accuracy: 0.7594 - loss: 0.5363 - val_accuracy: 0.7802 - val_loss: 0.4926
Epoch 8/200
116/116 - 0s - 3ms/step - accuracy: 0.7700 - loss: 0.5217 - val_accuracy: 0.7834 - val_loss: 0.4952
Epoch 9/200
116/116 - 0s - 3ms/step - accuracy: 0.7716 - loss: 0.5149 - val_accuracy: 0.7909 - val_loss: 0.4866
Epo

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


116/116 - 2s - 21ms/step - accuracy: 0.6276 - loss: 0.6933 - val_accuracy: 0.7802 - val_loss: 0.5033
Epoch 2/200
116/116 - 0s - 3ms/step - accuracy: 0.7193 - loss: 0.5883 - val_accuracy: 0.7866 - val_loss: 0.4957
Epoch 3/200
116/116 - 0s - 3ms/step - accuracy: 0.7335 - loss: 0.5630 - val_accuracy: 0.7888 - val_loss: 0.4832
Epoch 4/200
116/116 - 0s - 3ms/step - accuracy: 0.7503 - loss: 0.5494 - val_accuracy: 0.7866 - val_loss: 0.4836
Epoch 5/200
116/116 - 0s - 3ms/step - accuracy: 0.7592 - loss: 0.5396 - val_accuracy: 0.7845 - val_loss: 0.4908
Epoch 6/200
116/116 - 0s - 3ms/step - accuracy: 0.7457 - loss: 0.5482 - val_accuracy: 0.7899 - val_loss: 0.4797
Epoch 7/200
116/116 - 0s - 3ms/step - accuracy: 0.7565 - loss: 0.5349 - val_accuracy: 0.7877 - val_loss: 0.4721
Epoch 8/200
116/116 - 0s - 3ms/step - accuracy: 0.7681 - loss: 0.5154 - val_accuracy: 0.7888 - val_loss: 0.4791
Epoch 9/200
116/116 - 0s - 3ms/step - accuracy: 0.7648 - loss: 0.5274 - val_accuracy: 0.7888 - val_loss: 0.4809
Epo

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


116/116 - 3s - 25ms/step - accuracy: 0.6686 - loss: 0.6418 - val_accuracy: 0.7726 - val_loss: 0.5199
Epoch 2/200
116/116 - 0s - 3ms/step - accuracy: 0.7246 - loss: 0.5757 - val_accuracy: 0.7780 - val_loss: 0.5098
Epoch 3/200
116/116 - 0s - 3ms/step - accuracy: 0.7433 - loss: 0.5503 - val_accuracy: 0.7672 - val_loss: 0.5167
Epoch 4/200
116/116 - 0s - 3ms/step - accuracy: 0.7562 - loss: 0.5314 - val_accuracy: 0.7748 - val_loss: 0.5204
Epoch 5/200
116/116 - 0s - 3ms/step - accuracy: 0.7602 - loss: 0.5359 - val_accuracy: 0.7769 - val_loss: 0.5235
Epoch 6/200
116/116 - 0s - 3ms/step - accuracy: 0.7589 - loss: 0.5305 - val_accuracy: 0.7791 - val_loss: 0.5167
Epoch 7/200
116/116 - 0s - 3ms/step - accuracy: 0.7683 - loss: 0.5184 - val_accuracy: 0.7834 - val_loss: 0.5189
Epoch 8/200
116/116 - 0s - 3ms/step - accuracy: 0.7678 - loss: 0.5089 - val_accuracy: 0.7748 - val_loss: 0.5175
Epoch 9/200
116/116 - 0s - 3ms/step - accuracy: 0.7697 - loss: 0.5111 - val_accuracy: 0.7737 - val_loss: 0.5139
Epo

In [37]:
prediction_results(model=stacking_model)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

Target Balance:
 Transported
False    1.0
Name: proportion, dtype: float64
134/134 - 0s - 1ms/step
bool
[[ True]
 [False]
 [ True]
 ...
 [ True]
 [ True]
 [ True]]


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
