In [446]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle

warnings.filterwarnings("ignore")

In [447]:
# highest accuracy model
# model = LGBMClassifier(verbose=-1)
model = GradientBoostingClassifier()

In [448]:
# Load data
excel_file_path = "./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df = pd.DataFrame(df)

In [449]:
def extract_first_last(df):
    df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
    df['group'] = df['PassengerId'].str[:4]
    df['family_size'] = [list(df['group']).count(x) for x in list(df['group'])]
    df['Age_Cat'] = pd.cut(df['Age'], bins=[0, 18, 30, 50, 80], labels=['Child', 'Young Adult', 'Adult', 'Senior'])
    return df

df = extract_first_last(df)
df.columns
df = df.drop_duplicates()
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,num,side,group,family_size,Age_Cat
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1,Adult
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1,Young Adult
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,2,Senior
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2,Adult
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1,Child


In [450]:
# Define features and target
def get_X_Y(df):
    X = df.drop(columns=["PassengerId", "Name", "Transported", "Cabin", "group", "num"]) # , "num", "side", "family_size"
    Y = df["Transported"]
    return X, Y

X, Y = get_X_Y(df)

In [451]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)
# Check columns
print(X_train.columns, X_train.shape)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'deck', 'side',
       'family_size', 'Age_Cat'],
      dtype='object') (6954, 14)


In [452]:
# Get the list of numerical column names
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Get the list of categorical column names
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [453]:
# separte one hot and ordinal
categorical_features_ordinal = ['VIP', 'Age_Cat']
categorical_features_onehot = list(set(categorical_features)-set(categorical_features_ordinal))
print(categorical_features_ordinal, categorical_features_onehot)

['VIP', 'Age_Cat'] ['Destination', 'side', 'CryoSleep', 'deck', 'HomePlanet']


In [454]:
X_train.isnull().sum()

HomePlanet      156
CryoSleep       170
Destination     145
Age             132
VIP             172
RoomService     152
FoodCourt       148
ShoppingMall    175
Spa             152
VRDeck          149
deck            157
side            157
family_size       0
Age_Cat         278
dtype: int64

In [455]:
def show_graph(col_list):
    for feature in col_list:
        plt.figure(figsize=(14, 4))
        plt.subplot(121)
        sns.histplot(X_train[feature], kde=True, stat="density", log_scale=False)
        plt.title(f"{feature} after bining")
        plt.show()

# show_graph(numerical_features)

In [456]:
# Get unique elements for each column
for x in categorical_features:
    print(x, X_train[x].unique(), len(X_train[x].unique()))

HomePlanet ['Europa' 'Earth' 'Mars' nan] 4
CryoSleep [False True nan] 3
Destination ['55 Cancri e' 'TRAPPIST-1e' nan 'PSO J318.5-22'] 4
VIP [False nan True] 3
deck ['C' 'F' 'D' 'E' 'G' 'B' nan 'A' 'T'] 9
side ['P' 'S' nan] 3
Age_Cat ['Young Adult', 'Adult', 'Child', 'Senior', NaN]
Categories (4, object): ['Child' < 'Young Adult' < 'Adult' < 'Senior'] 5


In [457]:
import pandas as pd
from pandas_profiling import ProfileReport
def gen_eda():
    profile = ProfileReport(pd.concat([X_train, Y_train], axis=1), title='Pandas Profiling Report', explorative=True)
    profile.to_file("pandas_profiling_report.html") 
# gen_eda()

In [458]:
# Separate transformers for categorical and numerical features

categorical_transformer_onehot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
categorical_transformer_ordinal = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OrdinalEncoder())
])
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(estimator= LinearRegression(), max_iter=10000, tol=1e-10, imputation_order='random')),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer())
])

In [459]:
# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer_onehot, categorical_features_onehot),
        ('cat_1', categorical_transformer_ordinal, categorical_features_ordinal),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('select k best', SelectKBest(score_func=chi2, k=8)),
    ('model', model)
])

In [460]:
# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

ValueError: Input X must be non-negative.

In [None]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [None]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7901092581943646


In [None]:
from sklearn.metrics import classification_report, roc_auc_score


print(classification_report(Y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(Y_test, y_pred)}")

              precision    recall  f1-score   support

       False       0.82      0.75      0.78       875
        True       0.77      0.83      0.80       864

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739

ROC-AUC Score: 0.7903736772486772


In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipeline, X_test, Y_test, cv=3, scoring="accuracy").mean()

0.7809193417108372

In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the trained model
loaded_model = pickle.load(open('model.pkl', "rb"))

# Define the columns expected by the model
column_names = X_train.columns

def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    df = extract_first_last(df)
    # Select the relevant columns
    filtered_df = df[column_names]
    predictions = pipeline.predict(filtered_df)
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df['Transported'] = predictions
    # Save the results to a new CSV file
    submission_df = original_df[['PassengerId', 'Transported']]
    submission_df.to_csv('submission.csv', index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission
test_file = 'test.csv'
generate_submission(test_file)

Submission file saved as 'submission.csv'
