## IMPORTS

In [None]:
!pip install scikit-learn-intelex

In [None]:
# Apply the scikit-learn-intelex patch. It is called here, ahead of the
# scikit-learn imports in the next cell -- presumably so the patched estimators
# are the ones that get imported; confirm against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#charts
import matplotlib.pyplot as plt
import seaborn as sns

#Models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#others
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings( 'ignore' )

plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (15,7)

## File and Data Field Descriptions

**train.csv** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

**Destination** - The planet the passenger will be debarking to.

**Age** - The age of the passenger.

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

**Name** - The first and last names of the passenger.

**Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## READ FILE

In [None]:
sub = "/kaggle/input/spaceship-titanic/sample_submission.csv"
train = "/kaggle/input/spaceship-titanic/train.csv"
test = "/kaggle/input/spaceship-titanic/test.csv"

In [None]:
train = pd.read_csv(train)
test = pd.read_csv(test)
sub = pd.read_csv(sub)

train['Transported'] = train['Transported'].astype(int)

## DATA DESCRIPTION

In [None]:
# Peek at the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Number of missing values in each training column.
train.isna().sum()

# **Feature engineering**

In [None]:
# Keep the target as a one-column DataFrame and do all feature work on copies,
# so the `train` / `test` frames stay as they were loaded.
TARGET = train[['Transported']]
train1 = train.copy()
test1 = test.copy()

In [None]:
def fill_cat(df: pd.DataFrame):
    """Fill missing values in the non-numeric columns of ``df`` with the column mode.

    The columns are chosen from ``df`` itself (every column whose dtype is not
    ``int64``/``float64``, except ``PassengerId``), so the result does not depend
    on the state of any other frame in the notebook.

    Args:
        df: Frame to impute. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``DataFrame.pipe`` chaining.
    """
    cat_cols = [
        col for col in df.select_dtypes(exclude=['int64', 'float64']).columns
        if col != 'PassengerId'
    ]
    for col in cat_cols:
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which is not guaranteed to write through to ``df``.
        df[col] = df[col].fillna(modes.iloc[0])
    return df


def fill_num(df: pd.DataFrame):
    """Fill missing values in the numeric columns of ``df`` with the column median.

    The columns are chosen from ``df`` itself (``int64``/``float64`` dtypes); the
    ``Transported`` target is skipped when present.

    Args:
        df: Frame to impute. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``DataFrame.pipe`` chaining.
    """
    num_cols = [
        col for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col != 'Transported'
    ]
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    return df
        
def convert_binary(df: pd.DataFrame):
    """Cast the ``VIP`` and ``CryoSleep`` columns of ``df`` to ``int``.

    The frame is modified in place and the same object is returned so the
    function can be used with ``DataFrame.pipe``.
    """
    for flag in ('VIP', 'CryoSleep'):
        df[flag] = df[flag].astype(int)
    return df

#off
def log_transform(df: pd.DataFrame):
    """Apply ``log(1 + x)`` to the age and spending columns of ``df``.

    ``np.log1p`` is used instead of ``np.log`` because a value of 0 would
    otherwise become ``-inf``.

    Args:
        df: Frame holding ``Age``, ``RoomService``, ``FoodCourt``,
            ``ShoppingMall``, ``Spa`` and ``VRDeck``. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``DataFrame.pipe`` chaining.
    """
    col_feat = ['Age','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
    for col in col_feat:
        df[col] = np.log1p(df[col])
    return df

In [None]:
def fe(df: pd.DataFrame):
    """Add the engineered feature columns to ``df``.

    New columns:
        deck, num, side: the three ``/``-separated parts of ``Cabin``
            (missing where ``Cabin`` is missing or has fewer parts).
        LastName: second space-separated token of ``Name``.
        group: first four characters of ``PassengerId``.
        vr_spend: sum of the five amenity spending columns.
        no_spend: 1 when ``vr_spend`` equals 0, else 0.
        relative_group: number of non-null ``LastName`` values in the
            passenger's ``group``.
        isChild: 1 when ``Age`` < 10, else 0.
        isOld: 1 when ``Age`` > 60, else 0.

    Args:
        df: Frame with ``Cabin``, ``Name``, ``PassengerId``, ``Age`` and the
            spending columns. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``DataFrame.pipe`` chaining.
    """
    # Split once; the .str accessor passes missing cabins through as NaN
    # instead of raising the way a per-row ``x.split('/')`` does.
    cabin_parts = df['Cabin'].str.split('/')
    df['deck'] = cabin_parts.str[0]
    df['num'] = cabin_parts.str[1]
    df['side'] = cabin_parts.str[2]
    df['LastName'] = df['Name'].str.split(' ').str[1]
    df['group'] = df['PassengerId'].str[:4]
    df['vr_spend'] = df['RoomService']+df['FoodCourt']+df['ShoppingMall']+df['Spa']+df['VRDeck']
    df['no_spend'] = (df['vr_spend'] == 0).astype(int)
    df['relative_group'] = df.groupby(['group'])['LastName'].transform('count')
    # Disabled experiment: a 'familyCounts' column (size of each LastName group,
    # reset to 0 above 200) is intentionally not created.
    df['isChild'] = (df['Age'] < 10).astype(int)
    df['isOld'] = (df['Age'] > 60).astype(int)
    return df

def drop_cols(df: pd.DataFrame):
    """Remove the ``Name``, ``LastName`` and ``group`` columns from ``df``.

    The frame is modified in place and the same object is returned so the
    function can be used with ``DataFrame.pipe``.
    """
    helper_cols = ['Name', 'LastName', 'group']
    df.drop(columns=helper_cols, inplace=True)
    return df

#OFF
def reduce_cardinality(df: pd.DataFrame, cols=None, min_count: int = 2, other: str = 'other'):
    """Replace rare categories with a single placeholder value.

    For each listed column, every value that occurs fewer than ``min_count``
    times is replaced by ``other``. Missing values are left untouched.

    Args:
        df: Frame to modify in place.
        cols: Columns to process; defaults to ``['Cabin', 'deck', 'num', 'side']``.
        min_count: Minimum number of occurrences for a value to be kept.
        other: Replacement for values below ``min_count``.

    Returns:
        The same ``df`` object, to allow ``DataFrame.pipe`` chaining.
    """
    if cols is None:
        cols = ['Cabin', 'deck', 'num', 'side']
    for col in cols:
        # Count once, then replace with a vectorized membership test rather
        # than a per-row scan of a Python list.
        counts = df[col].value_counts()
        rare = counts[counts < min_count].index
        df[col] = df[col].mask(df[col].isin(rare), other)
    return df

In [None]:
%%time
# Stage 1: impute missing values and cast the VIP / CryoSleep flags to int,
# first for the training frame and then for the test frame.
# Keep stage 1 for both frames ahead of stage 2, so the fill helpers run before
# any engineered columns are added or helper columns are dropped.
train1 = (train1.pipe(fill_cat).pipe(fill_num).pipe(convert_binary))
test1 = (test1.pipe(fill_cat).pipe(fill_num).pipe(convert_binary))

# Stage 2: add the engineered features, then drop Name / LastName / group.
train1 = (train1.pipe(fe).pipe(drop_cols))
test1 = (test1.pipe(fe).pipe(drop_cols))

In [None]:
# Inspect the training frame after imputation and feature engineering.
train1.head()

# **EDA**

## **LABEL ENCODER**

In [None]:
# Label-encode every remaining non-numeric column except the passenger id.
# Each encoder is fitted on the train and test values together, so both frames
# share one mapping per column.
cats = train1.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
cats.remove('PassengerId')
for col in cats:
    print(col)
    le = LabelEncoder().fit(np.concatenate((train1[col], test1[col])).astype(str))
    train1[col] = le.transform(train1[col].astype(str))
    test1[col] = le.transform(test1[col].astype(str))

## **TARGET**

In [None]:
# One colour per class: red for 0 (not transported), cyan for 1 (transported).
# The palette lists one entry per category; it used to be built with one entry
# per row of the frame.
g = sns.countplot(x='Transported', data=train, palette=['r', 'c'])
# Write each bar's count in the middle of the bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2.,
           height / 2,
           '{}'.format(round(height)),
           ha="center", color='white')
plt.title('TARGET');

**The two target classes are balanced.**

In [None]:
# All non-object columns of the engineered training frame, without the target.
num_feat = train1.select_dtypes(exclude='object').drop(columns='Transported')

num_feat.describe()

 ## **Distribution**

In [None]:
# One density histogram (with KDE curve) per feature, three panels per row.
# The grid is sized from the number of features instead of a fixed 8x3, and
# histplot replaces the deprecated distplot.
n_cols = 3
n_rows = -(-num_feat.shape[1] // n_cols)  # ceiling division
plt.figure(figsize=(15,10));
for i, feat in enumerate(num_feat):
    plt.subplot(n_rows, n_cols, i + 1);
    sns.histplot(x=num_feat[feat], kde=True, stat='density', color='c')
    plt.xlabel(feat);
# Lay the panels out once, after all of them exist.
plt.tight_layout();

**As we can see, some columns don't follow a normal distribution.**

## **MULTIVARIATE ANALYSIS**

In [None]:
# Pairwise correlation of the features, drawn as an annotated heatmap.
num_corr = num_feat.corr()
fig, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(num_corr, annot=True, ax=ax);

## **CHECK OUTLIERS**

In [None]:
# One boxplot per feature, three panels per row. The grid is sized from the
# number of features instead of a fixed 8x3.
n_cols = 3
n_rows = -(-num_feat.shape[1] // n_cols)  # ceiling division
plt.figure(figsize=(15,10));
for i, feat_num in enumerate(num_feat):
    plt.subplot(n_rows, n_cols, i + 1);
    sns.boxplot(x=num_feat[feat_num], color = 'c')
    plt.xlabel(feat_num);
# Lay the panels out once, after all of them exist.
plt.tight_layout()

**I tried a log transform, but it didn't work very well, so it is left disabled.**

## **STANDARD SCALER**

In [None]:
X = train1.drop(['PassengerId', 'Transported'],axis=1)
X_test1 = test1.drop('PassengerId', axis=1)
# Use the target as a 1-D Series rather than a one-column DataFrame.
y = TARGET['Transported']

cols = list(X.columns)

# Feature Scaling
# The scaler is fitted on the training features only and then applied to the
# test features. Both frames are rebuilt with the same column order (`cols`).
# NOTE: the scaler sees the whole training set, i.e. it is fitted before the
# cross-validation split made further down.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X[cols]), columns=cols, index=X.index)
X_test1 = pd.DataFrame(sc.transform(X_test1[cols]), columns=cols, index=X_test1.index)

## **VOTING CLASSIFIER**

In [None]:
# Hyper-parameters for each base model of the ensemble.

# k-nearest neighbours
KNN_param = dict(metric='manhattan', n_neighbors=25, weights='distance')

# XGBoost
XGB_param = dict(
    gamma=0.8151728866167003,
    learning_rate=0.031628174313413464,
    max_depth=7,
    n_estimators=207,
    subsample=0.7781385659303335,
)

# LightGBM ('num_leavesj': 2201 is disabled)
LGBM_param = dict(
    learning_rate=0.04183147620569966,
    max_depth=25,
    min_child_samples=117,
    n_estimators=240,
)

# Ridge classifier
RIDGE_param = dict(
    alpha=5.4827342413378775,
    fit_intercept=True,
    solver='lsqr',
    tol=0.002738610271304144,
)

# Multi-layer perceptron
MLP_params = dict(
    activation='relu',
    alpha=0.0001,
    hidden_layer_sizes=(10, 30, 10),
    learning_rate='constant',
    solver='adam',
)

# CatBoost
cat = dict(
    objective='CrossEntropy',
    colsample_bylevel=0.07587945476302646,
    depth=8,
    boosting_type='Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.737265320016441,
)

**I used Optuna to tune the parameters of the models.**

In [None]:
# Fix the seed of the stochastic learners so a fresh run of the notebook
# reproduces the same models.
SEED = 42

CB = CatBoostClassifier(**cat, iterations=1000, verbose=0, random_seed=SEED)
LGBM = LGBMClassifier(**LGBM_param, random_state=SEED)
RD = RidgeClassifier(**RIDGE_param)
XGB = XGBClassifier(**XGB_param, random_state=SEED)
MLP = MLPClassifier(**MLP_params, random_state=SEED)
KNN = KNeighborsClassifier(**KNN_param)

In [None]:
# (name, estimator) pairs for the six base models, in the order they are combined.
voting_classifiers = list(zip(
    ("CB", "LGBM", "RD", "XGB", "MLP", "KNN"),
    (CB, LGBM, RD, XGB, MLP, KNN),
))

# Ensemble of the base models using voting="hard".
voting = VotingClassifier(estimators=voting_classifiers, voting="hard")

In [None]:
%%time
cv = StratifiedKFold(n_splits=5)
for train_index, test_index in cv.split(X, y):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    voting.fit(x_train,y_train)
    print(voting.score(x_test, y_test))  

In [None]:
# Predict a label for every row of the scaled test features.
y_preds = voting.predict(X_test1)

In [None]:
# Display the predicted labels.
y_preds

In [None]:
# Take the ids from the same frame the predictions were computed from, so each
# id is paired with its own row's prediction regardless of the row order of the
# sample submission file.
submission = pd.DataFrame({
        "PassengerId": test1["PassengerId"].to_numpy(),
        "Transported":  y_preds})

# Cast the predicted labels back to booleans before writing the file.
submission['Transported'] = submission['Transported'].astype('bool')
submission.to_csv('submission.csv', index=False)
submission.head()