In [121]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.preprocessing import PowerTransformer
import warnings

warnings.filterwarnings("ignore")

# Load data
# watanbe_zakaria
excel_file_path = f"./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")

In [122]:
df = pd.DataFrame(df)

# Get unique elements for each column
unique_elements = df.apply(pd.unique)

# Convert to dictionary for better readability
unique_elements_dict = unique_elements.to_dict()

print(unique_elements_dict)

{'PassengerId': array(['0001_01', '0002_01', '0003_01', ..., '9279_01', '9280_01',
       '9280_02'], dtype=object), 'HomePlanet': array(['Europa', 'Earth', 'Mars', nan], dtype=object), 'CryoSleep': array([False, True, nan], dtype=object), 'Cabin': array(['B/0/P', 'F/0/S', 'A/0/S', ..., 'G/1499/S', 'G/1500/S', 'E/608/S'],
      dtype=object), 'Destination': array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object), 'Age': array([39., 24., 58., 33., 16., 44., 26., 28., 35., 14., 34., 45., 32.,
       48., 31., 27.,  0.,  1., 49., 29., 10.,  7., 21., 62., 15., 43.,
       47.,  2., 20., 23., 30., 17., 55.,  4., 19., 56., nan, 25., 38.,
       36., 22., 18., 42., 37., 13.,  8., 40.,  3., 54.,  9.,  6., 64.,
       67., 61., 50., 41., 57., 11., 52., 51., 46., 60., 63., 59.,  5.,
       79., 68., 74., 12., 53., 65., 71., 75., 70., 76., 78., 73., 66.,
       69., 72., 77.]), 'VIP': array([False, True, nan], dtype=object), 'RoomService': array([   0.,  109.,   43., ..., 1569.,

In [123]:
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [124]:
# Define features and target


def get_X_Y(df):
    X = df.drop(
        columns=[
            "PassengerId",	"VIP",	"Name"
        ]
    )
    Y = df["Transported"]
    return X, Y

In [125]:
X, Y = get_X_Y(df)

In [126]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

In [127]:
X_train.shape, X_test.shape

((6954, 11), (1739, 11))

In [128]:
X_train.isnull().sum()

HomePlanet      156
CryoSleep       170
Cabin           157
Destination     145
Age             132
RoomService     152
FoodCourt       148
ShoppingMall    175
Spa             152
VRDeck          149
Transported       0
dtype: int64

In [129]:
X_train = X_train.dropna()
print(X_train.shape)

(5541, 11)


In [130]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder

# Define preprocessing pipeline with both ordinal and one-hot encoding

preprocessor = ColumnTransformer(
    [
        (
            "ordinal",
            OrdinalEncoder(),
            [1],
        ),
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore"),
            [0,3],
        ),
        ("numeric", StandardScaler(), [4,5,6,7,8,9]),
        ("power", PowerTransformer(), [4,5,6,7,8,9]),
        # ("imputer", KNNImputer(), [0,1,2,3,4,5,6,7,8,9]),
    ]
)

In [131]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from lightgbm import LGBMClassifier


# Define model
# model = LogisticRegression()
# model = tree.DecisionTreeClassifier()
model = RandomForestClassifier()
# model = ExtraTreesClassifier()
# model = GradientBoostingClassifier()
# model = HistGradientBoostingClassifier()
# model = LGBMClassifier(objective='multiclass', random_state=5)


In [132]:
import pickle
import numpy as np

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

# Save the fitted pipeline as a .pkl file
filename_pkl = f"model.pkl"

pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

ValueError: Found input variables with inconsistent numbers of samples: [5541, 6954]

In [None]:
from sklearn.metrics import accuracy_score

y_pred = pipeline.predict(X_test)
accuracy_score(Y_test, y_pred)