In [128]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle

warnings.filterwarnings("ignore")

In [129]:
# highest accuracy model
# model = LGBMClassifier(verbose=-1)
model = GradientBoostingClassifier()

In [130]:
# Load data
excel_file_path = "./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df = pd.DataFrame(df)

In [131]:
def extract_first_last(df):
    df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
    df['group'] = df['PassengerId'].str[:4]
    df['family_size'] = [list(df['group']).count(x) for x in list(df['group'])]
    df['Age_Cat'] = pd.cut(df['Age'], bins=[0, 18, 30, 50, 80], labels=['Child', 'Young Adult', 'Adult', 'Senior'])
    df['room X Age'] = df['RoomService']*df['Age']
    return df

df = extract_first_last(df)
df.columns
df = df.drop_duplicates()
df.replace('', np.nan, inplace=True)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,deck,num,side,group,family_size,Age_Cat,room X Age
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,False,B,0,P,1,1,Adult,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,True,F,0,S,2,1,Young Adult,2616.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,False,A,0,S,3,2,Senior,2494.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,False,A,0,S,3,2,Adult,0.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,True,F,1,S,4,1,Child,4848.0


In [132]:
# Define features and target
def get_X_Y(df):
    X = df.drop(columns=["PassengerId", "Name", "Transported", "Cabin", "group", "num"]) # , "num", "side", "family_size"
    Y = df["Transported"]
    return X, Y

X, Y = get_X_Y(df)

In [136]:
plt.figure(figsize=(14, 6))
cat_feat = ['Age_Cat']
num_feat = ['room X Age']
# Assuming df is your DataFrame
combined_df = df.copy()

# Fill categorical features with their mode
for cat in cat_feat:
    mode_value = combined_df[cat].mode()[0]
    combined_df[cat] = combined_df[cat].fillna(mode_value)

# Fill numerical features with their mode
for num in num_feat:
    mode_value = combined_df[num].mode()[0]
    combined_df[num] = combined_df[num].fillna(mode_value)

# Plot
plt.figure(figsize=(14, 6))

# Box plot
plt.subplot(1, 2, 1)
sns.boxplot(x=cat_feat[0], y=num_feat[0], data=combined_df, palette="Set3")
plt.title(f'Box plot of {num_feat[0]} by {cat_feat[0]}')

# Swarm plot
plt.subplot(1, 2, 2)
sns.swarmplot(x=cat_feat[0], y=num_feat[0], data=combined_df, palette="Set3", alpha=0.5)
plt.title(f'Swarm plot of {num_feat[0]} by {cat_feat[0]}')

plt.tight_layout()
plt.show()

<Figure size 1400x600 with 0 Axes>

KeyboardInterrupt: 

In [None]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)
# Check columns
print(X_train.columns, X_train.shape)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'deck', 'side',
       'family_size', 'Age_Cat', 'room X Age'],
      dtype='object') (6954, 15)


In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6954 entries, 5293 to 2915
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   HomePlanet    6798 non-null   object  
 1   CryoSleep     6784 non-null   object  
 2   Destination   6809 non-null   object  
 3   Age           6822 non-null   float64 
 4   VIP           6782 non-null   object  
 5   RoomService   6802 non-null   float64 
 6   FoodCourt     6806 non-null   float64 
 7   ShoppingMall  6779 non-null   float64 
 8   Spa           6802 non-null   float64 
 9   VRDeck        6805 non-null   float64 
 10  deck          6797 non-null   object  
 11  side          6797 non-null   object  
 12  family_size   6954 non-null   int64   
 13  Age_Cat       6676 non-null   category
 14  room X Age    6671 non-null   float64 
dtypes: category(1), float64(7), int64(1), object(6)
memory usage: 821.9+ KB


In [None]:
# Get the list of numerical column names
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Get the list of categorical column names
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
# separte one hot and ordinal
categorical_features_ordinal = ['VIP', 'Age_Cat']
categorical_features_onehot = list(set(categorical_features)-set(categorical_features_ordinal))
print(categorical_features_ordinal, categorical_features_onehot)

['VIP', 'Age_Cat'] ['side', 'CryoSleep', 'Destination', 'deck', 'HomePlanet']


In [None]:
X_train.isnull().sum()

HomePlanet      156
CryoSleep       170
Destination     145
Age             132
VIP             172
RoomService     152
FoodCourt       148
ShoppingMall    175
Spa             152
VRDeck          149
deck            157
side            157
family_size       0
Age_Cat         278
room X Age      283
dtype: int64

In [None]:
# Get unique elements for each column
for x in categorical_features:
    print(x, X_train[x].unique(), len(X_train[x].unique()))

HomePlanet ['Europa' 'Earth' 'Mars' nan] 4
CryoSleep [False True nan] 3
Destination ['55 Cancri e' 'TRAPPIST-1e' nan 'PSO J318.5-22'] 4
VIP [False nan True] 3
deck ['C' 'F' 'D' 'E' 'G' 'B' nan 'A' 'T'] 9
side ['P' 'S' nan] 3
Age_Cat ['Young Adult', 'Adult', 'Child', 'Senior', NaN]
Categories (4, object): ['Child' < 'Young Adult' < 'Adult' < 'Senior'] 5


In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
def gen_eda():
    profile = ProfileReport(pd.concat([X_train, Y_train], axis=1), title='Pandas Profiling Report', explorative=True)
    profile.to_file("pandas_profiling_report.html") 
# gen_eda()

In [None]:
# Separate transformers for categorical and numerical features

categorical_transformer_onehot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
categorical_transformer_ordinal = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OrdinalEncoder())
])
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer())
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer_onehot, categorical_features_onehot),
        ('cat_1', categorical_transformer_ordinal, categorical_features_ordinal),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [None]:
# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

In [None]:
# # Combine X_train and Y_train into a single DataFrame
# X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)
# combined_df = pd.DataFrame(X_train_processed.copy())  # Create a copy of X_train
# combined_df['Transported'] = list(Y_train.copy())  # Add the target column

In [None]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [None]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7935595169637722


In [None]:
print(classification_report(Y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(Y_test, y_pred)}")

              precision    recall  f1-score   support

       False       0.82      0.75      0.79       875
        True       0.77      0.83      0.80       864

    accuracy                           0.79      1739
   macro avg       0.80      0.79      0.79      1739
weighted avg       0.80      0.79      0.79      1739

ROC-AUC Score: 0.7938167989417989


In [None]:
cross_val_score(pipeline, X_test, Y_test, cv=3, scoring="accuracy").mean()

0.7803396660909615

In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the trained model
loaded_model = pickle.load(open('model.pkl', "rb"))

# Define the columns expected by the model
column_names = X_train.columns

def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    df = extract_first_last(df)
    # Select the relevant columns
    filtered_df = df[column_names]
    predictions = pipeline.predict(filtered_df)
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df['Transported'] = predictions
    # Save the results to a new CSV file
    submission_df = original_df[['PassengerId', 'Transported']]
    submission_df.to_csv('submission.csv', index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission
test_file = 'test.csv'
generate_submission(test_file)

KeyboardInterrupt: 