In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Number of missing values per column in the training frame.
train.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isna().sum()

# Impute missing data

# Age

In [None]:
# Per-class mean imputation of Age on the training frame.
# NOTE(review): the fill value depends on the Transported label, so the same rule
# cannot be applied to rows whose label is unknown.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
age_mean_1 = train.loc[transported_mask, 'Age'].mean()
age_mean_0 = train.loc[stayed_mask, 'Age'].mean()
print(age_mean_0, age_mean_1)
for class_mask, class_mean in ((transported_mask, age_mean_1), (stayed_mask, age_mean_0)):
    train.loc[class_mask, 'Age'] = train.loc[class_mask, 'Age'].fillna(class_mean)

In [None]:
# Fill missing test ages with the test-set mean.
# The result is assigned back to the column rather than calling
# `fillna(inplace=True)` on `test.Age`: that call operates on an intermediate
# Series and does not write through to `test` under pandas Copy-on-Write.
test_age = test['Age'].mean()
test['Age'] = test['Age'].fillna(test_age)

# RoomService

In [None]:
# Per-class mean imputation of RoomService on the training frame.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
service_mean_1 = train.loc[transported_mask, 'RoomService'].mean()
service_mean_0 = train.loc[stayed_mask, 'RoomService'].mean()
print(service_mean_0, service_mean_1)
for class_mask, class_mean in ((transported_mask, service_mean_1), (stayed_mask, service_mean_0)):
    train.loc[class_mask, 'RoomService'] = train.loc[class_mask, 'RoomService'].fillna(class_mean)

In [None]:
# Fill missing RoomService values in the test frame with the test-set mean.
# Assigned back to the column: an inplace fillna on `test.RoomService` acts on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test_service = test['RoomService'].mean()
test['RoomService'] = test['RoomService'].fillna(test_service)

# FoodCourt

In [None]:
# Per-class mean imputation of FoodCourt on the training frame.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
foodcourt_mean_1 = train.loc[transported_mask, 'FoodCourt'].mean()
foodcourt_mean_0 = train.loc[stayed_mask, 'FoodCourt'].mean()
print(foodcourt_mean_0, foodcourt_mean_1)
for class_mask, class_mean in ((transported_mask, foodcourt_mean_1), (stayed_mask, foodcourt_mean_0)):
    train.loc[class_mask, 'FoodCourt'] = train.loc[class_mask, 'FoodCourt'].fillna(class_mean)

In [None]:
# Fill missing FoodCourt values in the test frame with the test-set mean.
# Assigned back to the column: an inplace fillna on `test.FoodCourt` acts on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test_foodcourt = test['FoodCourt'].mean()
test['FoodCourt'] = test['FoodCourt'].fillna(test_foodcourt)

# Shopping Mall

In [None]:
# Per-class mean imputation of ShoppingMall on the training frame.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
shopping_mean_1 = train.loc[transported_mask, 'ShoppingMall'].mean()
shopping_mean_0 = train.loc[stayed_mask, 'ShoppingMall'].mean()
print(shopping_mean_0, shopping_mean_1)
for class_mask, class_mean in ((transported_mask, shopping_mean_1), (stayed_mask, shopping_mean_0)):
    train.loc[class_mask, 'ShoppingMall'] = train.loc[class_mask, 'ShoppingMall'].fillna(class_mean)

In [None]:
# Fill missing ShoppingMall values in the test frame with the test-set mean.
# Assigned back to the column: an inplace fillna on `test.ShoppingMall` acts on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test_shopping = test['ShoppingMall'].mean()
test['ShoppingMall'] = test['ShoppingMall'].fillna(test_shopping)

# Spa

In [None]:
# Per-class mean imputation of Spa on the training frame.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
spa_mean_1 = train.loc[transported_mask, 'Spa'].mean()
spa_mean_0 = train.loc[stayed_mask, 'Spa'].mean()
print(spa_mean_0, spa_mean_1)
for class_mask, class_mean in ((transported_mask, spa_mean_1), (stayed_mask, spa_mean_0)):
    train.loc[class_mask, 'Spa'] = train.loc[class_mask, 'Spa'].fillna(class_mean)

In [None]:
# Fill missing Spa values in the test frame with the test-set mean.
# Assigned back to the column: an inplace fillna on `test.Spa` acts on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test_spa = test['Spa'].mean()
test['Spa'] = test['Spa'].fillna(test_spa)

# VRDeck

In [None]:
# Per-class mean imputation of VRDeck on the training frame.
transported_mask = train['Transported'] == 1
stayed_mask = train['Transported'] == 0
vrdeck_mean_1 = train.loc[transported_mask, 'VRDeck'].mean()
vrdeck_mean_0 = train.loc[stayed_mask, 'VRDeck'].mean()
print(vrdeck_mean_0, vrdeck_mean_1)
for class_mask, class_mean in ((transported_mask, vrdeck_mean_1), (stayed_mask, vrdeck_mean_0)):
    train.loc[class_mask, 'VRDeck'] = train.loc[class_mask, 'VRDeck'].fillna(class_mean)

In [None]:
# Fill missing VRDeck values in the test frame with the test-set mean.
# Assigned back to the column: an inplace fillna on `test.VRDeck` acts on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test_vrdeck = test['VRDeck'].mean()
test['VRDeck'] = test['VRDeck'].fillna(test_vrdeck)

In [None]:
# Remove the Name and PassengerId columns from the training frame in place.
train.drop(columns=['Name', 'PassengerId'], inplace=True)

In [None]:
# Remove the Name and PassengerId columns from the test frame in place.
test.drop(columns=['Name', 'PassengerId'], inplace=True)

In [None]:
# Re-check missing values per column after the numeric imputation above.
train.isna().sum()

In [None]:
# Discard, in place, every training row that still has at least one missing value.
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Summary statistics for the training frame.
train.describe()

In [None]:
# Box plots of the numeric columns, one box per column.
# The tick labels come from the same list as the plotted series so the two cannot
# drift apart. The former `plt.legend()` call was removed: no artist in this figure
# carries a label, so there was nothing for a legend to show.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
f,ax = plt.subplots(figsize=(15,10))
ax.boxplot([train[col] for col in numeric_cols])
plt.xticks(list(range(1, len(numeric_cols) + 1)), numeric_cols)
ax.set_title('Distribution of numeric features')
plt.show()

In [None]:
# Age histogram split by Transported, drawn on the axes created here.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 10))
sns.histplot(data=train, x='Age', hue='Transported', ax=ax)

In [None]:
# Class balance check: bar chart of the share of each Transported value,
# followed by the raw counts.
transported = train['Transported']
transported.value_counts(normalize=True).plot.bar()
print(transported.value_counts())

In [None]:
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
# Module-level preprocessing objects; the transformer classes defined later in
# this notebook call fit_transform on these shared instances.
encoder = LabelEncoder()
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Column dtypes and non-null counts after the cleaning steps above.
train.info()

In [None]:
def fix_miss(data):
    """Fill missing values in ``data`` in place with fixed per-column defaults.

    Categorical columns get a sentinel ('None', False, or the cabin string
    'A/-1/A'), the spending columns get 0, and Age gets the most frequent Age of
    the module-level ``train`` frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``fill_values`` below. It is
        modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``data`` lacks one of the expected columns.
    """
    # mode() returns a Series (possibly with several tied values); take the first
    # entry explicitly instead of calling int() on the Series itself.
    age_fill = int(train['Age'].mode().iloc[0])

    fill_values = {
        'HomePlanet': 'None',
        'CryoSleep': False,
        'Cabin': 'A/-1/A',
        'Destination': 'None',
        'Age': age_fill,
        'VIP': False,
        'RoomService': 0,
        'FoodCourt': 0,
        'ShoppingMall': 0,
        'Spa': 0,
        'VRDeck': 0,
    }
    # Assign each filled column back to the frame. An inplace fillna on
    # `data[col]` acts on an intermediate Series and is not guaranteed to reach
    # `data` (it does not under pandas Copy-on-Write).
    for column, value in fill_values.items():
        data[column] = data[column].fillna(value)

In [None]:
# Apply the fill rules to the training frame in place, then show the remaining
# missing-value count per column.
fix_miss(train)
train.isna().sum()

In [None]:
# Apply the same fill rules to the test frame in place, then show the remaining
# missing-value count per column.
fix_miss(test)
test.isna().sum()

In [None]:
def extract_deck(s):
    """Return the text before the first '/' in ``s`` (the whole string if there is none)."""
    deck, _sep, _rest = s.partition('/')
    return deck

def extract_num(s):
    """Return the second '/'-separated field of ``s`` as a string.

    Raises IndexError when ``s`` contains no '/'.
    """
    # Splitting at most twice is enough to isolate the second field.
    fields = s.split('/', 2)
    return fields[1]

def extract_side(s):
    """Return the third '/'-separated field of ``s``.

    Raises IndexError when ``s`` has fewer than three fields.
    """
    # Splitting at most three times keeps the third field free of later fields.
    fields = s.split('/', 3)
    return fields[2]

# Split Cabin ("deck/num/side") into three feature columns on both frames.
# Num is cast to int so it becomes a numeric feature; previously it stayed an
# object column of digit strings, which downstream numeric code had to coerce
# implicitly (or skip).
for frame in (train, test):
    frame['Deck'] = frame['Cabin'].apply(extract_deck)
    frame['Num'] = frame['Cabin'].apply(extract_num).astype(int)
    frame['Side'] = frame['Cabin'].apply(extract_side)

In [None]:
# Display the training frame with the new Deck / Num / Side columns.
train

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
class data_prprocessing(BaseEstimator,TransformerMixin):
    """Label-encode the categorical columns and min-max scale the numeric ones.

    Uses the module-level ``encoder`` and ``scaler`` objects, re-fitting them on
    every column of the frame passed to ``transform``. The codes and scales
    therefore describe that frame only.

    NOTE(review): because nothing is learned in ``fit``, encodings produced for
    one frame are not guaranteed to match those produced for another frame with
    a different set of category values.
    """

    # Columns passed through the shared LabelEncoder, in processing order.
    _CATEGORICAL_COLS = ('HomePlanet', 'Cabin', 'Deck', 'Side',
                         'Destination', 'VIP', 'CryoSleep')
    # Columns passed through the shared MinMaxScaler, in processing order.
    _NUMERIC_COLS = ('Age', 'RoomService', 'FoodCourt',
                     'ShoppingMall', 'Spa', 'VRDeck')

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """No-op; present for the scikit-learn transformer API. Returns ``self``."""
        return self

    def transform(self, x):
        """Return an encoded and scaled copy of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame holding the columns listed in ``_CATEGORICAL_COLS`` and
            ``_NUMERIC_COLS`` plus ``Transported``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is left untouched.
        """
        # Work on a copy so the caller's data is not silently overwritten.
        x = x.copy()
        for col in self._CATEGORICAL_COLS:
            x[col] = encoder.fit_transform(x[col])
        for col in self._NUMERIC_COLS:
            # The scaler expects a 2-D input; flatten its output back to 1-D.
            scaled = scaler.fit_transform(np.array(x[col]).reshape(-1, 1))
            x[col] = scaled.ravel()
        # The target is encoded last, as in the original processing order.
        x['Transported'] = encoder.fit_transform(x['Transported'])
        return x

In [None]:
# One-step pipeline around the training preprocessor; the trailing name displays
# the pipeline in the notebook.
train_step = ('data preprocessing', data_prprocessing())
data_pipeline = Pipeline(steps=[train_step])
data_pipeline

In [None]:
# Run the preprocessing pipeline on the training frame and display the result.
# transform is called without a prior fit; the step's fit method is a no-op.
df = data_pipeline.transform(train)
df

In [None]:
# Correlation heatmap with features ordered by their correlation with the target.
# Only numeric columns are used, selected explicitly, so the result does not
# depend on how corr() treats non-numeric columns in the installed pandas version.
corr_data = df.select_dtypes(include='number')
cmap = plt.cm.PuBu
corr = corr_data.corr()
cols = corr.nlargest(len(corr), 'Transported')['Transported'].index
# Reorder the matrix already computed instead of recomputing it from raw values.
cm = corr.loc[cols, cols].values
f,ax = plt.subplots(figsize=(15,10))
sns.heatmap(cm,annot=True,square=True,vmax=1,linecolor='white',cmap=cmap,
            xticklabels=cols.values,yticklabels=cols.values)

In [None]:
# Feature names recorded by the author as weakly correlated with the target.
low_corr_feature = [
    'Destination',
    'VRDeck',
    'Spa',
    'RoomService',
]

In [None]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score,accuracy_score,log_loss,precision_score,f1_score
from sklearn.model_selection import StratifiedKFold,KFold
from lightgbm import plot_importance,plot_metric

In [None]:
# Separate the label from the features. `pop` removes the column from `df`
# itself, so the copy taken next holds features only.
target = df.pop('Transported')
train = df.copy(deep=True)
print(*(part.shape for part in (train, target)))

In [None]:
def print_score(y_true,pred):
    """Print accuracy, F1, precision and recall for ``pred`` against ``y_true``.

    The four values are written on one tab-separated line with three decimals.
    Returns None (the result of ``print``).
    """
    metric_fns = (accuracy_score, f1_score, precision_score, recall_score)
    values = [metric(y_true, pred) for metric in metric_fns]
    template = 'Accuracy Score:{:.3f}\tF1 Score:{:.3f}\tPrecision Score:{:.3f}\tRecall Score:{:.3f}'
    return print(template.format(*values))

In [None]:
# 5-fold stratified cross-validation: train one LightGBM classifier per fold,
# print its validation metrics, and keep every fitted model in `lgb_models`.
# Early stopping and periodic logging are passed as callbacks; the
# `early_stopping_rounds=` / `verbose=` arguments of `fit` are not accepted by
# LightGBM 4 and later.
from lightgbm import early_stopping, log_evaluation

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=0)
lgb_models = []
# Convert once; the arrays are the same for every fold.
feature_values = train.values
target_values = target.values
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, target), start=1):
    print(f'Fold:{fold}')
    x_train, x_valid = feature_values[train_idx], feature_values[valid_idx]
    y_train, y_valid = target_values[train_idx], target_values[valid_idx]
    lgb_model = LGBMClassifier(n_estimators=2000,n_jobs=-1,device='gpu',
                              learning_rate=0.05,reg_lambda=0.9)
    lgb_model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=100)],
    )
    pred = lgb_model.predict(x_valid)
    print_score(y_valid,pred)
    lgb_models.append(lgb_model)

In [None]:
# Best Validation Score
# NOTE(review): the fold index is hard-coded; presumably the third fold scored
# best in the author's run. Confirm against the per-fold scores printed by the
# training loop before reusing this choice.
lgb_model = lgb_models[2]
lgb_model

In [None]:
# Plot the evaluation metric recorded while fitting the selected model.
plot_metric(lgb_model)

In [None]:
# Plot the feature importances of the selected model.
plot_importance(lgb_model,figsize=(15,10))
plt.show()

In [None]:
# Display the test frame before it is encoded and scaled.
test

In [None]:
class test_prprocessing(BaseEstimator,TransformerMixin):
    """Label-encode the categorical columns and min-max scale the numeric ones.

    Uses the module-level ``encoder`` and ``scaler`` objects, re-fitting them on
    every column of the frame passed to ``transform``.

    NOTE(review): the encoders and scaler are fitted on the frame given here, so
    the integer codes and scales are derived from that frame alone. They are not
    guaranteed to match the ones produced for any other frame. Confirm this is
    acceptable for the model that consumes the output.
    """

    # Columns passed through the shared LabelEncoder, in processing order.
    _CATEGORICAL_COLS = ('HomePlanet', 'Cabin', 'Deck', 'Side',
                         'Destination', 'VIP', 'CryoSleep')
    # Columns passed through the shared MinMaxScaler, in processing order.
    _NUMERIC_COLS = ('Age', 'RoomService', 'FoodCourt',
                     'ShoppingMall', 'Spa', 'VRDeck')

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """No-op; present for the scikit-learn transformer API. Returns ``self``."""
        return self

    def transform(self, x):
        """Return an encoded and scaled copy of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame holding the columns listed in ``_CATEGORICAL_COLS`` and
            ``_NUMERIC_COLS``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is left untouched.
        """
        # Work on a copy so the caller's data is not silently overwritten.
        x = x.copy()
        for col in self._CATEGORICAL_COLS:
            x[col] = encoder.fit_transform(x[col])
        for col in self._NUMERIC_COLS:
            # The scaler expects a 2-D input; flatten its output back to 1-D.
            scaled = scaler.fit_transform(np.array(x[col]).reshape(-1, 1))
            x[col] = scaled.ravel()
        return x
# One-step pipeline around the test preprocessor; the trailing name displays the
# pipeline in the notebook.
test_step = ('data preprocessing', test_prprocessing())
test_pipeline = Pipeline(steps=[test_step])
test_pipeline

In [None]:
# Encode and scale the test frame, rebinding `test` to the result, then display it.
test = test_pipeline.transform(test)
test

In [None]:
# Predict with the selected model on the preprocessed test features and show
# the raw predictions.
pred = lgb_model.predict(test.values)
pred

In [None]:
# Map each predicted label to a Python bool: 0 becomes False, anything else True.
prediction = [bool(label != 0) for label in pred]
len(prediction)

In [None]:
# Load the sample submission and replace its Transported column with the
# boolean predictions (assumes the sample file already has that column).
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
submission['Transported'] = prediction
submission

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv',index=False)