In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Functions

In [21]:
def check_missing_values(data):
    """
    Check for missing values in the dataset.
    """
    missing_values = data.isnull().sum()
    return missing_values[missing_values > 0]

def fill_missing_values(data):
    """
    Fill missing values in the dataset.
    - Drop rows with missing values in critical columns.
    - Fill categorical columns with the mode.
    - Fill numerical columns with the median.
    """
    # Drop rows with missing values in critical columns
    data = data.dropna(subset=['HomePlanet', 'CryoSleep', 'Cabin', 'Destination'])

    # Fill missing categorical values with the mode
    for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']:
        data[column].fillna(data[column].mode()[0], inplace=True)

    # Fill missing numerical values with the median
    for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        data[column].fillna(data[column].median(), inplace=True)

    return data

def feature_engineering(data):
    """
    Perform feature engineering on the dataset.
    - Split 'Cabin' into 'Deck', 'CabinNumber', and 'Side'.
    - Create 'TotalSpend' feature.
    - Create 'AgeGroup' feature.
    """
    # Split 'Cabin' into 'Deck', 'CabinNumber', and 'Side'
    data[['Deck', 'CabinNumber', 'Side']] = data['Cabin'].astype(str).str.split('/', expand=True)
    data['CabinNumber'] = pd.to_numeric(data['CabinNumber'], errors='coerce')
    data = data.drop(columns=['Cabin'])

    # Create 'TotalSpend' feature
    data['TotalSpend'] = (
        data['RoomService'] +
        data['FoodCourt'] +
        data['ShoppingMall'] +
        data['Spa'] +
        data['VRDeck']
    )

    # Create age groups
    bins = [0, 12, 18, 35, 60, 100]
    labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)

    return data

def encode_categorical_features(data):
    """
    Encode categorical features using LabelEncoder.
    """
    label_encoder = LabelEncoder()
    for column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name', 'Deck', 'Side', 'AgeGroup']:
        data[column] = label_encoder.fit_transform(data[column].astype(str))
    
    return data

def check_target_balance(data, target_column):
    """
    Check the balance of the target variable.
    """
    target_balance = data[target_column].value_counts(normalize=True)
    return target_balance

def preprocess_data(data, target_column='Transported'):
    """
    Perform the full preprocessing pipeline on the dataset.
    """
    # Step 1: Check and fill missing values
    data = fill_missing_values(data)

    # Step 2: Perform feature engineering
    data = feature_engineering(data)

    # Step 3: Encode categorical variables
    data = encode_categorical_features(data)

    # Step 4: Check target variable balance
    target_balance = check_target_balance(data, target_column)
    print("Target Balance:\n", target_balance)

    return data

check_missing_values(spaceship_data)
spaceship_data = preprocess_data(spaceship_data)

In [22]:
file_path = 'train.csv'
spaceship_data = pd.read_csv(file_path)
print(len(spaceship_data))
spaceship_data.head()

8693


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Data wrangling
1. Check missing values

In [27]:
missing_values = spaceship_data.isnull().sum()
missing_values[missing_values > 0]

AgeGroup    166
dtype: int64

2. Fill in/ drop missing values

In [4]:
spaceship_data = spaceship_data.dropna(subset=['HomePlanet', 'CryoSleep', 'Cabin', 'Destination'])

# Fill missing categorical values with the mode
for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']:
    spaceship_data[column].fillna(spaceship_data[column].mode()[0], inplace=True)

# Fill missing numerical values with the median
for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    spaceship_data[column].fillna(spaceship_data[column].median(), inplace=True)

missing_values_after = spaceship_data.isnull().sum()
missing_values_after[missing_values_after > 0]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spaceship_data[column].fillna(spaceship_data[column].mode()[0], inplace=True)
  spaceship_data[column].fillna(spaceship_data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spaceship_data[column].fillna(spaceship_data[column].median(), inplace=Tru

Series([], dtype: int64)

3. Feature engineering (improves random forrest model from 78% to 80%)

In [5]:
# Splitting the 'Cabin' into  'Deck', 'CabinNumber', and 'Side'
spaceship_data[['Deck', 'CabinNumber', 'Side']] = spaceship_data['Cabin'].astype(str).str.split('/', expand=True)
spaceship_data['CabinNumber'] = pd.to_numeric(spaceship_data['CabinNumber'], errors='coerce')
spaceship_data = spaceship_data.drop(columns=['Cabin'])

# 'TotalSpend' feature  (RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck)
spaceship_data['TotalSpend'] = (
    spaceship_data['RoomService'] +
    spaceship_data['FoodCourt'] +
    spaceship_data['ShoppingMall'] +
    spaceship_data['Spa'] +
    spaceship_data['VRDeck']
)

# Create age groups
bins = [0, 12, 18, 35, 60, 100]
labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
spaceship_data['AgeGroup'] = pd.cut(spaceship_data['Age'], bins=bins, labels=labels)


4. Check target variable balance

In [10]:
# Check the balance of the target variable 'Transported'
target_balance = spaceship_data['Transported'].value_counts(normalize=True)
target_balance


Transported
True     0.503914
False    0.496086
Name: proportion, dtype: float64

3. Encode categorical variables

In [7]:
label_encoder = LabelEncoder()
for column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name', 'Deck', 'Side', 'AgeGroup']:
    spaceship_data[column] = label_encoder.fit_transform(spaceship_data[column])
spaceship_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNumber,Side,TotalSpend,AgeGroup
0,0001_01,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,4794,False,1,0,0,0.0,0
1,0002_01,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,4109,True,5,0,1,736.0,4
2,0003_01,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,412,False,0,0,1,10383.0,0
3,0003_02,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,6503,False,0,0,1,5176.0,4
4,0004_01,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,7583,True,5,1,1,1091.0,3


4. Split train and test data

In [8]:
# Split the dataset into features and target
X = spaceship_data.drop(columns=['PassengerId', 'Transported'])
y = spaceship_data['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Models 
1. Random forrest 

In [15]:
# Define the parameter grid for RandomForest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)
best_params, accuracy_best, report_best

Fitting 3 folds for each of 324 candidates, totalling 972 fits


324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
255 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\shami\AppData\Local\Programs\Python\Python312\L

({'max_depth': 20,
  'max_features': 'sqrt',
  'min_samples_leaf': 4,
  'min_samples_split': 10,
  'n_estimators': 200},
 0.8005050505050505,
 '              precision    recall  f1-score   support\n\n       False       0.80      0.80      0.80       794\n        True       0.80      0.80      0.80       790\n\n    accuracy                           0.80      1584\n   macro avg       0.80      0.80      0.80      1584\nweighted avg       0.80      0.80      0.80      1584\n')

2. Ensemble methods (Stacking), improved from 80% to 80.36%

In [20]:
# 1. Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
report_gb = classification_report(y_test, y_pred_gb)

# 2. XGBoost
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

# 3. Stacking Classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=3)
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
report_stack = classification_report(y_test, y_pred_stack)

# Results
print("Gradient Boosting Accuracy:", accuracy_gb)
print(report_gb)
print("XGBoost Accuracy:", accuracy_xgb)
print(report_xgb)
print("Stacking Classifier Accuracy:", accuracy_stack)
print(report_stack)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Gradient Boosting Accuracy: 0.797979797979798
              precision    recall  f1-score   support

       False       0.81      0.78      0.79       794
        True       0.79      0.82      0.80       790

    accuracy                           0.80      1584
   macro avg       0.80      0.80      0.80      1584
weighted avg       0.80      0.80      0.80      1584

XGBoost Accuracy: 0.7954545454545454
              precision    recall  f1-score   support

       False       0.81      0.78      0.79       794
        True       0.78      0.81      0.80       790

    accuracy                           0.80      1584
   macro avg       0.80      0.80      0.80      1584
weighted avg       0.80      0.80      0.80      1584

Stacking Classifier Accuracy: 0.8036616161616161
              precision    recall  f1-score   support

       False       0.81      0.79      0.80       794
        True       0.80      0.81      0.81       790

    accuracy                           0.80      1