In [49]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.preprocessing import (
    PowerTransformer,
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    VotingClassifier,
)
from xgboost import XGBClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.neural_network import MLPClassifier

# from hmmlearn import hmm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle

warnings.filterwarnings("ignore")

In [50]:
# highest accuracy model
# model = LGBMClassifier(verbose=-1)
# model = HistGradientBoostingClassifier()
# model = RandomForestClassifier()
# model = GradientBoostingClassifier()
# model = AdaBoostClassifier()
# model = MLPClassifier()

In [51]:
voting_clf = VotingClassifier(
    estimators=[
        ("ab", AdaBoostClassifier()),
        ("gb", GradientBoostingClassifier()),
        ("lgbm", LGBMClassifier(verbose=-1)),
    ],
    voting="hard",  # 'hard' for majority voting, 'soft' for weighted average probabilities
)

In [52]:
from sklearn.model_selection import GridSearchCV


model = GradientBoostingClassifier()

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'subsample': [0.8, 0.9, 1.0]
# }

# # Initialize the model
# model = GradientBoostingClassifier(random_state=42)

# # Initialize GridSearchCV
# model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

In [53]:
# Load data
excel_file_path = "./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")

In [54]:
def extract_first_last(df):
    df[["deck", "num", "side"]] = df["Cabin"].str.split("/", expand=True)
    df["group"] = df["PassengerId"].str[:4]
    df["family_size"] = [list(df["group"]).count(x) for x in list(df["group"])]
    df["Age_Cat"] = pd.cut(
        df["Age"],
        bins=[0, 18, 30, 50, 80],
        labels=["Child", "Young Adult", "Adult", "Senior"],
    )
    return df


df = extract_first_last(df)
df.columns
df = df.drop_duplicates()
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,num,side,group,family_size,Age_Cat
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1,Adult
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1,Young Adult
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,2,Senior
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2,Adult
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1,Child


In [55]:
# Columns to check
columns_to_check = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
# Filter rows where all specified columns are zero
rows_to_remove = df[df[columns_to_check].eq(0).all(axis=1)]
rows_to_remove = pd.DataFrame(rows_to_remove.iloc[:2000])
print(len(df), len(rows_to_remove))
df = df[~df.index.isin(rows_to_remove.index)]
print(len(df))

8693 2000
6693


In [56]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,family_size
count,6558.0,6512.0,6510.0,6485.0,6510.0,6505.0,6693.0
mean,29.948155,293.694871,598.807527,227.307941,406.726728,398.584166,1.935455
std,13.746648,748.855287,1819.49301,682.835832,1284.614602,1295.743993,1.537347
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,21.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,28.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,38.0,212.25,306.75,123.0,207.0,188.0,2.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,8.0


In [57]:
# Define features and target
def get_X_Y(df):
    X = df.drop(
        columns=["PassengerId", "Name", "Transported", "Cabin", "group", "num"]
    )  # , "num", "side", "family_size"
    Y = df["Transported"]
    return X, Y


X, Y = get_X_Y(df)

In [58]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)
# Check columns
# X_train, X_test = X,X
# Y_train, Y_test = Y,Y
print(X_train.columns, X_train.shape)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'deck', 'side',
       'family_size', 'Age_Cat'],
      dtype='object') (5354, 14)


In [59]:
# Get the list of numerical column names
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Get the list of categorical column names
categorical_features = X_train.select_dtypes(
    include=["object", "category"]
).columns.tolist()

In [60]:
# separte one hot and ordinal
categorical_features_ordinal = ["VIP", "Age_Cat"]
categorical_features_onehot = list(
    set(categorical_features) - set(categorical_features_ordinal)
)
print(categorical_features_ordinal, categorical_features_onehot)

['VIP', 'Age_Cat'] ['side', 'Destination', 'CryoSleep', 'deck', 'HomePlanet']


In [61]:
X_train.isnull().sum()

HomePlanet      128
CryoSleep       133
Destination     120
Age             113
VIP             122
RoomService     147
FoodCourt       152
ShoppingMall    157
Spa             138
VRDeck          160
deck            110
side            110
family_size       0
Age_Cat         175
dtype: int64

In [62]:
# Get unique elements for each column
for x in categorical_features:
    print(x, X_train[x].unique(), len(X_train[x].unique()))

HomePlanet ['Mars' 'Earth' 'Europa' nan] 4
CryoSleep [False True nan] 3
Destination ['TRAPPIST-1e' '55 Cancri e' nan 'PSO J318.5-22'] 4
VIP [False True nan] 3
deck ['F' 'G' 'B' 'C' 'A' 'D' nan 'E' 'T'] 9
side ['P' 'S' nan] 3
Age_Cat ['Young Adult', 'Adult', 'Child', 'Senior', NaN]
Categories (4, object): ['Child' < 'Young Adult' < 'Adult' < 'Senior'] 5


In [63]:
import pandas as pd
from pandas_profiling import ProfileReport


def gen_eda():
    profile = ProfileReport(
        pd.concat([X_train, Y_train], axis=1),
        title="Pandas Profiling Report",
        explorative=True,
    )
    profile.to_file("pandas_profiling_report.html")


# gen_eda()

In [64]:
# Separate transformers for categorical and numerical features

from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

trf = PowerTransformer()

# def square(x):
#     return x ** 2
# trf = FunctionTransformer(func=square, validate=True)


categorical_transformer_onehot = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_transformer_ordinal = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OrdinalEncoder()),
    ]
)
numerical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            KNNImputer(n_neighbors=5),
        ),  # KNNImputer(n_neighbors=5) SimpleImputer(strategy='mean')
        ("log", trf),
        ("scaler", StandardScaler()),  # StandardScaler MinMaxScaler
    ]
)

In [65]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer_onehot, categorical_features_onehot),
        ("cat_1", categorical_transformer_ordinal, categorical_features_ordinal),
        ("num", numerical_transformer, numerical_features),
    ]
)

# Define the pipeline
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

In [66]:
# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

In [67]:
# # Combine X_train and Y_train into a single DataFrame
# X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)
# combined_df = pd.DataFrame(X_train_processed.copy())  # Create a copy of X_train
# combined_df['Transported'] = list(Y_train.copy())  # Add the target column

In [68]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [69]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.806572068707991


In [70]:
print(classification_report(Y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(Y_test, y_pred)}")

              precision    recall  f1-score   support

       False       0.82      0.84      0.83       765
        True       0.78      0.76      0.77       574

    accuracy                           0.81      1339
   macro avg       0.80      0.80      0.80      1339
weighted avg       0.81      0.81      0.81      1339

ROC-AUC Score: 0.8004884880781582


In [71]:
cross_val_score(pipeline, X_test, Y_test, cv=3, scoring="accuracy").mean()

0.8035750711436148

In [72]:
import pandas as pd
import numpy as np
import pickle

# Load the trained model
loaded_model = pickle.load(open("model.pkl", "rb"))

# Define the columns expected by the model
column_names = X_train.columns


def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace("", np.nan, inplace=True)
    df = extract_first_last(df)
    # Select the relevant columns
    filtered_df = df[column_names]
    predictions = pipeline.predict(filtered_df)
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df["Transported"] = predictions
    # Save the results to a new CSV file
    submission_df = original_df[["PassengerId", "Transported"]]
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission
test_file = "test.csv"
generate_submission(test_file)

Submission file saved as 'submission.csv'
