In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the training split and the test split from the Kaggle input directory,
# then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)
train.head()

In [None]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

In [None]:
# Summarise column dtypes and non-null counts of the raw training frame
# (shows which columns contain missing values).
train.info()

In [None]:
# Number of distinct values per column of the training frame.
train.nunique()

### EDA and feature engineering

In [None]:
# Tally how many passengers share each group id, i.e. the part of
# PassengerId that precedes the first underscore. One tally per frame.
groups_train = {}
for group_id in train['PassengerId'].str.split('_').str[0]:
    if group_id not in groups_train:
        groups_train[group_id] = 0
    groups_train[group_id] += 1

groups_test = {}
for group_id in test['PassengerId'].str.split('_').str[0]:
    if group_id not in groups_test:
        groups_test[group_id] = 0
    groups_test[group_id] += 1

In [None]:
# Add a GroupSize feature: for every passenger, the number of passengers in
# the same frame that share their group id (the PassengerId prefix before '_').
# groups_train / groups_test map that prefix to its count, so Series.map does
# the lookup for the whole column at once instead of looping row by row in
# Python and collecting the results in intermediate lists.
train['GroupSize'] = train['PassengerId'].str.split('_').str[0].map(groups_train)
test['GroupSize'] = test['PassengerId'].str.split('_').str[0].map(groups_test)

    

In [None]:
# Break the 'Cabin' string into its three '/'-separated parts and store them
# as separate Deck / Num / Side columns on both frames.
for frame in (train, test):
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame[['Deck', 'Num', 'Side']] = cabin_parts

In [None]:
# Remove the raw identifier / text columns from both frames in place;
# GroupSize and Deck / Num / Side were derived from PassengerId and Cabin above.
for frame in (train, test):
    frame.drop(columns=['PassengerId', 'Cabin', 'Name'], inplace=True)

In [None]:
# Columns imputed with their most frequent value.
cat_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]
# Columns imputed with their mean.
num_cols = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [None]:
# Impute missing values: column mean for the numeric features, most frequent
# value for the categorical ones. Each frame is filled with its own statistics.
#
# The result is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), is deprecated in recent pandas and leaves the frame
# unchanged once Copy-on-Write is enabled.
for frame in (train, test):
    for col in num_cols:
        frame[col] = frame[col].fillna(frame[col].mean())
    for col in cat_cols:
        # infer_objects() lets a fully filled object column holding only
        # True/False become a real bool column regardless of pandas version;
        # string columns are left as object.
        frame[col] = frame[col].fillna(frame[col].mode()[0]).infer_objects()

In [None]:
# Ordinal code for each cabin deck letter.
d = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}

# Assign the encoded column back instead of `replace(..., inplace=True)` on a
# column selection, which is chained assignment and stops modifying the frame
# under pandas Copy-on-Write. Series.map yields a numeric column directly;
# missing cabins (and any letter not listed in `d`) become NaN.
train['Deck'] = train['Deck'].map(d)
test['Deck'] = test['Deck'].map(d)

In [None]:
# The cabin number comes out of the string split as text; convert it to a
# float column on both frames (missing cabins stay NaN).
for frame in (train, test):
    frame['Num'] = frame['Num'].astype('float64')

In [None]:
# One-hot encode the remaining non-numeric columns; dummy columns are named
# '<column>__<value>'.
train = pd.get_dummies(train, prefix_sep='__')
test = pd.get_dummies(test, prefix_sep='__')

# The two frames are encoded independently, so a category present in only one
# of them would give the frames different columns. Align test to train's
# feature columns (same set, same order): dummies absent from test are added
# as all-zero columns, columns unknown to train are dropped.
test = test.reindex(
    columns=train.columns.drop('Transported', errors='ignore'),
    fill_value=0,
)
train.head()

In [None]:
# Convert the boolean flag columns (and the target in train) to 0/1 integers.
train[['CryoSleep', 'VIP', 'Transported']] = train[['CryoSleep', 'VIP', 'Transported']].astype(int)
# Cast test's own columns. Assigning from `train` here would overwrite the test
# flags with index-aligned training values, so the test frame would no longer
# describe the test passengers.
test[['CryoSleep', 'VIP']] = test[['CryoSleep', 'VIP']].astype(int)

In [None]:
# Re-inspect dtypes and non-null counts of the training frame after the
# encoding and imputation steps above.
train.info()

### Modeling

In [None]:
# Separate the feature matrix from the 'Transported' target column.
target_col = 'Transported'
X = train.drop(columns=[target_col])
y = train[target_col]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the fitting fold followed by the held-out fold.
tuple(fold.shape for fold in (X_train, X_test))

In [None]:
from lightgbm import LGBMClassifier
from lightgbm import log_evaluation, early_stopping

# Gradient-boosted binary classifier: 700 boosting rounds with a small
# learning rate and trees limited to depth 7.
lgbm_params = dict(
    objective='binary',
    learning_rate=0.01,
    num_iterations=700,
    max_depth=7,
)
clf = LGBMClassifier(**lgbm_params)

# Print the evaluation metric every 30 iterations.
callbacks = [log_evaluation(period=30)]

# The evaluation set is the training data itself, so the logged AUC is a
# training score, not a validation score.
fit_options = dict(
    eval_set=(X_train, y_train),
    eval_names='train',
    eval_metric='auc',
    callbacks=callbacks,
)
clf.fit(X_train, y_train, **fit_options)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

# Score the LightGBM model on the held-out fold. Column 1 of predict_proba is
# used as the probability of the positive class; accuracy thresholds it by
# rounding.
clf_pred = clf.predict_proba(X_test)
clf_positive = clf_pred[:, 1]
for label, score in (
    ('roc_auc:', roc_auc_score(y_test, clf_positive)),
    ('log_loss:', log_loss(y_test, clf_positive)),
    ('accuracy:', accuracy_score(y_test, clf_positive.round())),
):
    print(label, score)

In [None]:
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pair every feature with its LightGBM importance and plot them as a
# horizontal bar chart, most important feature first.
feature_imp = pd.DataFrame(
    sorted(zip(clf.feature_importances_, X.columns)),
    columns=['Value', 'Feature'],
)
ranked_imp = feature_imp.sort_values(by="Value", ascending=False)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=ranked_imp, ax=ax)
ax.set_title('LightGBM Features')
fig.tight_layout()

In [None]:
# Fill the remaining NaNs in Deck (most frequent value) and Num (median) of
# both folds before fitting the random forest. Each fold is filled with its
# own statistics.
#
# The folds are rebuilt with DataFrame.assign rather than calling
# `fold[col].fillna(..., inplace=True)`: that form is chained assignment on a
# frame that is itself a slice of X, which is deprecated in recent pandas and
# leaves the fold unchanged once Copy-on-Write is enabled.
X_train = X_train.assign(
    Deck=X_train['Deck'].fillna(X_train['Deck'].mode()[0]),
    Num=X_train['Num'].fillna(X_train['Num'].median()),
)
X_test = X_test.assign(
    Deck=X_test['Deck'].fillna(X_test['Deck'].mode()[0]),
    Num=X_test['Num'].fillna(X_test['Num'].median()),
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees capped at depth 8; the fixed seed makes the fit
# reproducible.
rf_params = dict(n_estimators=100, max_depth=8, random_state=0)
rfc = RandomForestClassifier(**rf_params)
rfc.fit(X_train, y_train)

In [None]:
# Score the random forest on the held-out fold. Column 1 of predict_proba is
# used as the probability of the positive class; accuracy thresholds it by
# rounding.
rfc_pred = rfc.predict_proba(X_test)
rfc_positive = rfc_pred[:, 1]
for label, score in (
    ('roc_auc:', roc_auc_score(y_test, rfc_positive)),
    ('log_loss:', log_loss(y_test, rfc_positive)),
    ('accuracy:', accuracy_score(y_test, rfc_positive.round())),
):
    print(label, score)

In [None]:
# Pair every feature with its random-forest importance and plot them as a
# horizontal bar chart, most important feature first.
feature_imp = pd.DataFrame(
    sorted(zip(rfc.feature_importances_, X.columns)),
    columns=['Value', 'Feature'],
)
ranked_imp = feature_imp.sort_values(by="Value", ascending=False)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=ranked_imp, ax=ax)
ax.set_title('RFC Features')
fig.tight_layout()

In [None]:
# Build the submission as an equal-weight blend of the two models' predicted
# probability for the positive class.
sample = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# LightGBM was fitted on folds that still had NaNs in Deck / Num, so it scores
# `test` before those columns are imputed. predict_proba returns one column
# per class; only column 1 (positive class) is kept. Assigning the full
# two-column array to a single DataFrame column fails.
lgbm_proba = clf.predict_proba(test)[:, 1]

# The random forest was fitted on imputed folds, so fill Deck / Num first.
# Assign back instead of a chained `fillna(..., inplace=True)`, which does not
# modify the frame under pandas Copy-on-Write.
test['Num'] = test['Num'].fillna(test['Num'].median())
test['Deck'] = test['Deck'].fillna(test['Deck'].mode()[0])
rfc_proba = rfc.predict_proba(test)[:, 1]

# Average the two probabilities and threshold at 0.5 by rounding.
sample['Transported'] = np.round(lgbm_proba * 0.5 + rfc_proba * 0.5).astype(bool)
sample.to_csv('submission.csv', index=False)
sample.head()