# Spaceship Titanic LGBM 0224

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Walk the Kaggle input directory and record every file found.
# `paths` holds the full path of each file; `names` holds the last four
# characters of each file name (it is reused later as a container for the
# loaded frames).
paths = []
names = []
for root, _subdirs, files in os.walk('/kaggle/input'):
    paths.extend(os.path.join(root, fname) for fname in files)
    names.extend(fname[-4:] for fname in files)

In [None]:
# Load every discovered file as a DataFrame, replacing the placeholder entry
# in `names` at the same position, and show each frame with its index and
# file stem (file name without the last four characters).
for idx, csv_path in enumerate(paths):
    names[idx] = pd.read_csv(csv_path, encoding='cp932')
    stem = csv_path.split('/')[-1][0:-4]
    print(f'{idx}. {stem}')
    display(names[idx])
    print()

In [None]:
# Pick the working frames by file name instead of by position: the order in
# which os.walk lists files is not guaranteed, so positional indexing can
# silently swap train/test/submission.
# NOTE(review): assumes the competition's usual file names (train.csv,
# test.csv, sample_submission.csv) - confirm against the attached dataset.
# When a name is not found, the original positional choice is used as fallback.
frames_by_file = {os.path.basename(p): frame for p, frame in zip(paths, names)}
train0 = frames_by_file.get('train.csv', names[1]).copy()
test0 = frames_by_file.get('test.csv', names[2]).copy()
subm = frames_by_file.get('sample_submission.csv', names[0]).copy()

In [None]:
# Collect the distinct target labels and keep them in ascending order.
Name0 = train0['Transported'].unique()
Name = list(Name0)
Name.sort()
print(Name)

In [None]:
# Integer codes 0..len(Name)-1, plus lookup tables in both directions:
# label -> code (normal_mapping) and code -> label (reverse_mapping).
N = list(range(len(Name)))
normal_mapping = {label: code for code, label in enumerate(Name)}
reverse_mapping = dict(enumerate(Name))

In [None]:
# Replace the label values with their integer codes: the target in the
# training frame, then the 'CryoSleep' and 'VIP' columns in both frames.
# Values absent from normal_mapping become NaN (pandas Series.map semantics).
train0['Transported'] = train0['Transported'].map(normal_mapping)
for flag_col in ('CryoSleep', 'VIP'):
    for frame in (train0, test0):
        frame[flag_col] = frame[flag_col].map(normal_mapping)
train0.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
def labelencoder(df):
    """Integer-encode every object-dtype column of ``df`` in place.

    Missing values in those columns are first replaced by the string 'N';
    a fresh ``LabelEncoder`` is then fitted on each column independently and
    the column is overwritten with the resulting codes. Columns of any other
    dtype are left untouched.

    Returns the same DataFrame object that was passed in.
    """
    object_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in object_columns:
        df[col] = df[col].fillna('N')
        raw_values = df[col].values
        encoder = LabelEncoder()
        df[col] = encoder.fit(list(raw_values)).transform(raw_values)
    return df

In [None]:
# Encode train and test in a single pass so that a given string receives the
# same integer code in both frames. Calling labelencoder on each frame
# separately fits independent encoders, whose codes disagree whenever the two
# frames do not contain exactly the same set of values in a column.
n_train = len(train0)
combined = pd.concat([train0, test0], axis=0, ignore_index=True, sort=False)
combined = labelencoder(combined)

train = combined.iloc[:n_train].copy()
# The test rows have no label, so concatenation turned the target into a
# float column padded with NaN; restore its original dtype for the train rows.
train['Transported'] = train['Transported'].astype(train0['Transported'].dtype)

# Drop the all-NaN target column that concatenation added to the test rows.
test = combined.iloc[n_train:].drop(columns=['Transported']).reset_index(drop=True)
train.info()

In [None]:
import lightgbm as lgb
import random
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [None]:
# Separate the label column from the feature columns.
target = train['Transported']
data = train.drop(columns=['Transported'])

In [None]:
# Feature column names, in frame order; used later to select model inputs.
columns = list(data.columns)
print(columns)

In [None]:
def objective(trial,data=data,target=target):
    """Optuna objective: fit one LightGBM model and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``num_leaves``, ``objective``, ``lambda_l1``,
        ``lambda_l2``, ``bagging_fraction`` and ``feature_fraction``.
    data : DataFrame, optional
        Feature matrix. Defaults to the module-level ``data`` as bound when
        this function was defined.
    target : Series, optional
        Labels aligned with ``data``. Defaults to the module-level ``target``
        as bound when this function was defined.

    Returns
    -------
    float
        Root mean squared error of the predictions on a fixed 20% hold-out
        split (``random_state=42``); the study minimises this value.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'objective': trial.suggest_categorical('objective', ['rmse', 'binary']),
        'max_depth': -1,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; the sampled distributions are unchanged.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'bagging_freq': 5,
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'verbosity': -1,
    }
    model = lgb.LGBMRegressor(**param)
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.
    model.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
    )
    preds = model.predict(valid_x)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is no longer available in recent scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(valid_y, preds)))

    return rmse

In [None]:
# Run the hyperparameter search: 100 trials minimising the value returned by
# `objective`, then report how many trials ran and the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
finished_trial_count = len(study.trials)
best_trial_params = study.best_trial.params
print('Number of finished trials:', finished_trial_count)
print('Best trial:', best_trial_params)

In [None]:
# Plot the objective value (the hold-out RMSE returned by `objective`)
# recorded for each trial of `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Interactive parallel-coordinate plot relating each trial's sampled
# hyperparameters to its score.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot of the search: objective value against the sampled values of
# each hyperparameter (see Optuna's `plot_slice` documentation for details).
optuna.visualization.plot_slice(study)

In [None]:
# Visualize the estimated importance of each tuned hyperparameter for the
# objective value of `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Visualize the empirical distribution function of the objective values
# obtained across the trials of `study`.
optuna.visualization.plot_edf(study)

In [None]:
# Keep the sampled hyperparameters of the best trial for the final models.
best_trial_record = study.best_trial
Best_trial = best_trial_record.params
print(Best_trial)

In [None]:
# Show the (rows, columns) shape of the encoded test frame.
print(test.shape)

In [None]:
# Train one model per fold of a shuffled 5-fold split and average their
# predictions on the test frame; print each fold's validation RMSE.
#
# The study only records the *sampled* hyperparameters, so the settings that
# were fixed inside `objective` are restored here. Without `bagging_freq`
# the tuned `bagging_fraction` would have no effect, and the final models
# would not match the configuration that was actually tuned.
final_params = {
    'max_depth': -1,
    'learning_rate': 0.1,
    'boosting': 'gbdt',
    'bagging_freq': 5,
    'verbosity': -1,
    **Best_trial,
}

preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)
for trn_idx, test_idx in kf.split(train[columns], target):
    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[test_idx]
    model = lgb.LGBMRegressor(**final_params)
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
    )
    # Each fold contributes an equal share of the averaged test prediction.
    preds += model.predict(test[columns]) / kf.n_splits
    # Explicit square root: `squared=False` is no longer available in recent
    # scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(rmse)

In [None]:
# Threshold the averaged predictions at 0.5 (below -> code 0, otherwise
# code 1) and translate each code back to its original label.
binary_codes = np.where(np.asarray(preds) < 0.5, 0, 1)
preds2 = [reverse_mapping[int(code)] for code in binary_codes]

In [None]:
# Put the predicted labels into the submission frame, write it to
# 'submission.csv' without the index column, and display the result.
subm = subm.assign(Transported=preds2)
subm.to_csv('submission.csv', index=False)
subm