## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, SVC
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier

In [2]:
dataset = pd.read_csv('train.csv')
testset = pd.read_csv('test.csv')

In [3]:
X = dataset.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
y = dataset.iloc[:, -1]

test_set = testset.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
label = testset.iloc[:, 0]

In [4]:
label

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [5]:
def parse_cabin(cabin):
    if pd.isnull(cabin):
        return np.nan, np.nan, np.nan  
    
    parts = cabin.split('/')
    if len(parts) == 3:
        deck, room, side = parts
    else:
        deck, room, side = np.nan, np.nan, np.nan  
    
    return deck, room, side

In [6]:
X['Deck'], X['Room'], X['Side'] = zip(*X['Cabin'].apply(parse_cabin))
X = X.drop('Cabin', axis=1)
feature_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 
                   'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 
                   'Deck', 'Room', 'Side'] 

features_array = X[feature_columns].to_numpy()
features_array

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Deck'], X['Room'], X['Side'] = zip(*X['Cabin'].apply(parse_cabin))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Deck'], X['Room'], X['Side'] = zip(*X['Cabin'].apply(parse_cabin))


array([['Europa', False, 'TRAPPIST-1e', ..., 'B', '0', 'P'],
       ['Earth', False, 'TRAPPIST-1e', ..., 'F', '0', 'S'],
       ['Europa', False, 'TRAPPIST-1e', ..., 'A', '0', 'S'],
       ...,
       ['Earth', False, 'TRAPPIST-1e', ..., 'G', '1500', 'S'],
       ['Europa', False, '55 Cancri e', ..., 'E', '608', 'S'],
       ['Europa', False, 'TRAPPIST-1e', ..., 'E', '608', 'S']],
      dtype=object)

In [7]:
features_array[1]

array(['Earth', False, 'TRAPPIST-1e', 24.0, False, 109.0, 9.0, 25.0,
       549.0, 44.0, 'F', '0', 'S'], dtype=object)

In [8]:
categorical_cols = [0, 1, 2, 4, 10, 12]  

numerical_cols = [3, 5, 6, 7, 8, 9, 11]  

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

X_transformed = preprocessor.fit_transform(X)

In [9]:
X_transformed[1]

array([-0.33671733, -0.1753636 , -0.28166908, -0.24896783,  0.2115053 ,
       -0.23019432, -1.18662707,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ])

In [10]:
test_set['Deck'], test_set['Room'], test_set['Side'] = zip(*test_set['Cabin'].apply(parse_cabin))
test_set = test_set.drop('Cabin', axis=1)

feature_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 
                   'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 
                   'Deck', 'Room', 'Side'] 

test_set_transformed = preprocessor.transform(test_set[feature_columns])

test_features_array = test_set_transformed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['Deck'], test_set['Room'], test_set['Side'] = zip(*test_set['Cabin'].apply(parse_cabin))


In [11]:
le = LabelEncoder()
y = np.array(le.fit_transform(y))
y

array([0, 1, 0, ..., 1, 0, 1])

In [12]:
clf = CatBoostClassifier(
    iterations=5, 
    learning_rate=0.1, 
    loss_function='CrossEntropy'
)
clf.fit(X_transformed, y)

0:	learn: 0.6521265	total: 61.7ms	remaining: 247ms
1:	learn: 0.6111106	total: 63.8ms	remaining: 95.7ms
2:	learn: 0.5834594	total: 65.9ms	remaining: 43.9ms
3:	learn: 0.5615645	total: 67.9ms	remaining: 17ms
4:	learn: 0.5421448	total: 69.9ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1339130d0>

In [13]:
y_pred = clf.predict(test_set_transformed)

In [14]:
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)

y_pred_binary

array([1, 0, 1, ..., 1, 1, 1])

In [15]:
result_df = pd.DataFrame({
    'PassengerID': label,
    'Transported': y_pred_binary
})
result_df['Transported'] = result_df['Transported'].astype(bool)
result_df.to_csv('predictions1.csv', index=False)