In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score,roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition files, using PassengerId as the row index of each frame.
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv',index_col = 'PassengerId')
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv',index_col = 'PassengerId')
# Stack train on top of test so feature engineering is applied to both at once.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# resulting frame is the same (train rows first, original index labels kept).
temp = pd.concat([train, test], sort=False)

In [None]:
#FillNa
# Gaps in the two numeric columns are replaced by the column mean taken over `temp`.
for numeric_column in ('Fare', 'Age'):
    temp[numeric_column] = temp[numeric_column].fillna(temp[numeric_column].mean())

#FE from this notebook: https://www.kaggle.com/belov38/catboost-lb
# SameFirstName = number of rows sharing the text that precedes ', ' in Name;
# any count above 10 is collapsed to the sentinel value -1.
name_key = temp['Name'].apply(lambda full_name: full_name.split(', ')[0])
rows_per_key = name_key.map(name_key.value_counts())
temp['SameFirstName'] = rows_per_key.where(rows_per_key <= 10, -1)

#FE from this notebook: https://www.kaggle.com/jmargni/tps-apr-2021-lightgbm-cv
def _ticket_prefix(raw_ticket):
    """Return the first whitespace-separated token of a multi-token ticket, else 'X'."""
    tokens = str(raw_ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'

# Name keeps only the text in front of the first comma.
temp['Name'] = temp['Name'].map(lambda full_name: str(full_name).partition(',')[0])
temp['Ticket'] = temp['Ticket'].map(_ticket_prefix)
# Cabin is reduced to its first character; a missing cabin stringifies to 'nan' and so becomes 'n'.
temp['Cabin'] = temp['Cabin'].map(lambda cabin: str(cabin)[0].strip())

#FE
temp['Family_Size'] = temp['SibSp'] + temp['Parch'] + 1
# True when the passenger's family size is exactly one.
temp['Alone?'] = temp['Family_Size'] == 1
# True unless the one-character cabin code is the 'n' produced by a missing value.
temp['Cabin?'] = temp['Cabin'] != 'n'
temp['Age*Fare'] = temp['Age'] * temp['Fare']
# NOTE(review): a Fare of 0 would produce inf (or NaN) in this ratio - confirm the data has none.
temp['Age/Fare'] = temp['Age'] / temp['Fare']

In [None]:
# Columns whose dtype is not float64, int64 or bool are treated as categorical.
categorical_feature_columns = temp.select_dtypes(exclude=['float64','int64','bool']).columns
# Hand LightGBM column *names* rather than integer positions. Positions taken
# from `temp` stop being valid once 'Survived' is dropped to build X: every
# column located after 'Survived' moves one place to the left, so positional
# indices would flag the wrong features as categorical.
categorical_feature = categorical_feature_columns.tolist()

In [None]:
#Lazy FE
# Build string features of the form '<anchor>_<column>' by joining two columns
# with an underscore. An anchor is paired with a categorical column only when
# that column is neither the anchor itself nor one of the anchors listed before
# it, so each pair of anchors is combined once.
pair_anchors = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
for column in categorical_feature_columns:
    for position, anchor in enumerate(pair_anchors, start=1):
        if column in pair_anchors[:position]:
            continue
        temp[anchor + '_' + column] = temp[anchor] + '_' + temp[column]

# Recompute the list so the freshly created combination columns are part of it.
categorical_feature_columns = temp.select_dtypes(exclude=['float64','int64','bool']).columns

In [None]:
# Swap every remaining non-numeric column for integer codes; the single
# encoder instance is re-fitted from scratch on each column.
label = LabelEncoder()
for column in categorical_feature_columns:
    temp[column] = label.fit_transform(temp[column])

# The first len(train) rows of `temp` go to X, the remainder to `test`.
n_train_rows = len(train)
X, test = temp.iloc[:n_train_rows], temp.iloc[n_train_rows:]

In [None]:
# Pull the target out of the training rows, then remove the 'Survived'
# column from both feature frames.
y = X['Survived']
X = X.drop(columns=['Survived'])
test = test.drop(columns=['Survived'])

In [None]:
# Keyword arguments later unpacked into LGBMClassifier(**lgbm_parameters).
lgbm_parameters = dict(
    # regularisation
    reg_alpha=0.00388218567052311,
    reg_lambda=8.972335390951376e-05,
    # column / row sampling
    # NOTE(review): LightGBM documents that row bagging only happens when
    # subsample_freq > 0; none is set here, so `subsample` may be inert - confirm.
    colsample_bytree=0.18375780999902297,
    subsample=0.013352256062576087,
    learning_rate=0.002597839272059483,
    # tree shape
    max_depth=44,
    num_leaves=15,
    min_child_samples=89,
    # categorical-split handling and histogram binning
    cat_smooth=56,
    cat_l2=22.375773634793603,
    max_bin=33,
    min_data_per_group=89,
)

In [None]:
# Add the task definition and the upper bound on boosting rounds to the
# parameter dict (keys are inserted in the order written here).
lgbm_parameters.update(
    metric='binary_logloss',
    objective='binary',
    n_estimators=15000,
)

In [None]:
# K-fold cross-validated LightGBM training.
# Each fold fits a fresh model, adds 1/n_splits of its test-set probabilities
# to `lgbm_test_pred`, and records AUC / accuracy on its held-out rows.
lgbm_test_pred = np.zeros(len(test))
n_splits=10

# A fixed random_state makes the shuffled fold assignment repeatable, so the
# scores and the averaged prediction can be reproduced from one run to the next.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

lgbm_acc=[]
lgbm_auc=[]

# kf.split is a generator without a length, so tqdm is given the total explicitly.
for trn_idx, val_idx in tqdm(kf.split(X,y), total=n_splits):
    x_train_idx = X.iloc[trn_idx]
    y_train_idx = y.iloc[trn_idx]
    x_valid_idx = X.iloc[val_idx]
    y_valid_idx = y.iloc[val_idx]

    lgbm_model = LGBMClassifier(**lgbm_parameters)
    # Early stopping and periodic logging are supplied as callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0.
    lgbm_model.fit(
        x_train_idx,
        y_train_idx,
        eval_set=[(x_valid_idx, y_valid_idx)],
        categorical_feature=categorical_feature,
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=1000),
            lightgbm.log_evaluation(period=1000),
        ],
    )

    lgbm_test_pred += lgbm_model.predict_proba(test)[:,1]/n_splits

    # Score the held-out fold once and reuse the probabilities for both metrics.
    # NOTE(review): the same fold also drives early stopping, so these scores
    # are likely somewhat optimistic.
    valid_proba = lgbm_model.predict_proba(x_valid_idx)[:,1]
    lgbm_auc.append(roc_auc_score(y_valid_idx, valid_proba))
    lgbm_acc.append(accuracy_score(y_valid_idx, (valid_proba > 0.5).astype(int)))

print(f'AUC: {np.mean(lgbm_auc)}')
print(f'ACC: {np.mean(lgbm_acc)}')

In [None]:
# Threshold the fold-averaged probabilities at 0.5 to get 0/1 labels and
# write them next to the test PassengerId values.
hard_labels = (lgbm_test_pred > 0.5).astype(int)
submission = pd.DataFrame({'PassengerId': test.index, 'Survived': hard_labels})
submission.to_csv('submission.csv', index=False)

In [None]:
# Feature-importance chart (top 16 features) for `lgbm_model`, which at this
# point is the estimator fitted in the last cross-validation fold.
plt.rcParams["figure.figsize"] = (6, 5)
lightgbm.plot_importance(booster=lgbm_model, height=0.9, max_num_features=16)