In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Utils

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score, make_scorer, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import StratifiedKFold
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_uniform
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise  import cosine_similarity
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.feature_selection import SelectFromModel

In [None]:
# Remove pandas' row-display cap so long Series/DataFrame outputs are printed in full.
pd.set_option('display.max_rows', None)

## Load Data

In [None]:
# Read the three competition CSVs (comma is pandas' default separator).
train, sub_sample, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
test.info()

In [None]:
print(train.shape, test.shape, sub_sample.shape)

In [None]:
train.info()

In [None]:
train.head()

In [None]:
train.isna().sum()

In [None]:
train.describe()

### Data Preprocessing

In [None]:
# Move PassengerId into the row index so it is not carried along as a feature column.
# NOTE: not idempotent — re-running this cell raises KeyError because the column is gone.
train = train.set_index('PassengerId')

In [None]:
# Combined train + test frame, used below for pooled statistics (e.g. the Age median).
# NOTE(review): `train` is already indexed by PassengerId here while `test` still has its
# default index, so the combined index mixes the two — harmless for medians and group-bys.
dfg = pd.concat([train, test])

Use the median Age (computed over train and test combined) to fill all missing Age values.

In [None]:
dfg['Age'].median()

In [None]:
# Impute missing ages with the pooled (train + test) median.
pooled_age_median = dfg['Age'].median()
train['Age'] = train['Age'].fillna(pooled_age_median)

In [None]:
# Round ages to whole years (values stay float).
train['Age'] = train['Age'].round(decimals=0)

#Create a complex Feature that includes Age_bin, Sex

In [None]:
# Age-band labels; `bins` is reused later when the same binning is applied elsewhere.
bins = ['Y1', 'Y2', 'Y3', 'Y4', 'M1', 'M2', 'E']
age_edges = [0, 5, 10, 15, 20, 30, 50, 1000]

# Left-closed bands: [0,5), [5,10), ... [50,1000).
age_band = pd.cut(train['Age'], bins=age_edges, labels=bins, right=False)

# Combined feature: band label + raw Sex string (e.g. 'M1male').
# This must run while `Sex` still holds the original strings.
train['Age_Bin'] = age_band.astype('str') + train['Sex']
df_Age_bin = pd.get_dummies(train['Age_Bin'], prefix='Age_bin')

Handle missing data in "Fare" field

In [None]:
# Apply the same left-closed age bands to the pooled frame.
dfg['Age_Bin'] = pd.cut(
    dfg['Age'],
    bins=[0, 5, 10, 15, 20, 30, 50, 1000],
    labels=bins,
    right=False,
)
# Median fare per (Sex, Pclass) group, displayed as the cell output.
dfg.groupby(['Sex', 'Pclass'])['Fare'].median()

In [None]:
# Hard-coded fill value for a missing Fare, keyed by (Sex, Pclass).
fare_fill_values = {
    ('female', 1): 85.40,
    ('female', 2): 24.75,
    ('female', 3): 12.54,
    ('male', 1): 64.51,
    ('male', 2): 14.23,
    ('male', 3): 11.02,
}

# The groups are disjoint, so the order in which they are filled does not matter.
for (sex, pclass), fare in fare_fill_values.items():
    missing = train['Fare'].isna() & (train['Pclass'] == pclass) & (train['Sex'] == sex)
    train.loc[missing, 'Fare'] = fare

In [None]:
# Fare-band labels; `bins2` is reused later when the same binning is applied elsewhere.
bins2 = ['L1', 'L2', 'L3', 'L4']
fare_edges = [0, 11, 30, 60, 10000]

# Left-closed bands: [0,11), [11,30), [30,60), [60,10000).
train['Fare_Bin'] = pd.cut(train['Fare'], bins=fare_edges, labels=bins2, right=False)

In [None]:
# Convert the categorical fare bands to plain strings, then one-hot encode them
# (columns are prefixed with 'Fare_bin'); the last line previews the result.
train['Fare_Bin'] = train['Fare_Bin'].astype('str')
df_Fare_bin = pd.get_dummies(train['Fare_Bin'], prefix='Fare_bin')
df_Fare_bin.head()

For the Cabin feature, extract only the first letter, then fill missing values with Z.

In [None]:
# Keep only the first character of each cabin code in the pooled frame (NaN stays NaN).
dfg['Cabin'] = dfg['Cabin'].str.get(0)

In [None]:
dfg['Cabin'].value_counts()

In [None]:
# Keep only the first character of each cabin code (NaN stays NaN).
train['Cabin'] = train['Cabin'].str.get(0)

In [None]:
# Missing cabins get the placeholder letter 'Z'.
train['Cabin'] = train['Cabin'].fillna('Z')

In [None]:
train.groupby(by=['Cabin'])['Survived'].mean()

Remove T because of few instances

In [None]:
# Fold the rare cabin letter 'T' into the placeholder 'Z'.
is_cabin_t = train['Cabin'] == 'T'
train['Cabin'] = train['Cabin'].mask(is_cabin_t, 'Z')

In [None]:
# One-hot encode the cabin letter (columns prefixed with 'Cabin').
df_cabin = pd.get_dummies(train['Cabin'], prefix='Cabin')

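Analysis of possible strategies for filling missing values in the Embarked field.
Since passenger class seems to be correlated with the chance of survival, three different embark classes could be created to fill the missing values (the notebook ultimately fills them with the most frequent value, 'S').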

In [None]:
train.groupby(by=['Embarked'])[['Fare','Survived']].mean()

In [None]:
train[train['Embarked'].isna()].groupby(by=['Pclass'])['Survived'].mean()

In [None]:
# Fill missing Embarked values with 'S' (per the author, the most frequent value).
train['Embarked'] = train['Embarked'].fillna('S')

In [None]:
# One-hot encode the embarkation port (columns prefixed with 'Embark').
df_embarked = pd.get_dummies(train['Embarked'], prefix='Embark')

Keep only the first letter of the ticket; tickets without any letter and missing tickets (NaN) are replaced with ZZ.

In [None]:
# Strip every non-letter character, then keep the first remaining letter.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave digits and punctuation in place.
train['Ticket'] = train['Ticket'].str.replace('[^a-zA-Z]', '', regex=True).str[:1]
# Kept from the original; nothing is left to strip once non-letters are removed.
train['Ticket'] = train['Ticket'].str.strip()

Convert NAN to ZZ

In [None]:
# Missing tickets get the placeholder 'ZZ'.
train['Ticket'] = train['Ticket'].fillna('ZZ')

Replace empty strings ('') with ZZ

In [None]:
# Empty strings are mapped to the same 'ZZ' placeholder as missing values.
train.loc[train['Ticket']=='', 'Ticket']='ZZ'

Removed L because of few instances

In [None]:
# Fold the ticket letter 'L' into the 'ZZ' placeholder.
train.loc[train['Ticket']=='L', 'Ticket']='ZZ'

In [None]:
train.groupby(by=['Ticket'])['Survived'].mean()

In [None]:
train['Ticket'].value_counts()

In [None]:
# One-hot encode the ticket letter (columns prefixed with 'ticket').
df_tiket = pd.get_dummies(train['Ticket'], prefix='ticket')

Handling "Name"

In [None]:
# Pool train and test names into a one-column frame, so the label encoders fitted on it
# know every name that appears in either set.
pooled_names = pd.concat([train['Name'], test['Name']])
df_name = pooled_names.to_frame(name='Name')

In [None]:
# Split each name once on the first ', ': the part before goes to FirstName,
# the remainder to SecondName.
# `n` must be passed by keyword — pandas 2.0 made every argument of `str.split`
# after `pat` keyword-only, so the positional form raises TypeError.
pooled_name_parts = df_name['Name'].str.split(', ', n=1, expand=True)
df_name['FirstName'] = pooled_name_parts[0]
df_name['SecondName'] = pooled_name_parts[1]

In [None]:
# One encoder per name part, fitted on the pooled train + test names; `le` and `le1`
# are reused later to transform the train and test frames.
le = LabelEncoder()
le1 = LabelEncoder()
df_name['FirstName'] = le.fit_transform(df_name['FirstName'])
df_name['SecondName'] = le1.fit_transform(df_name['SecondName'])

In [None]:
# Split each name once on the first ', ': the part before goes to FirstName,
# the remainder to SecondName.
# `n` must be passed by keyword — pandas 2.0 made every argument of `str.split`
# after `pat` keyword-only, so the positional form raises TypeError.
train_name_parts = train['Name'].str.split(', ', n=1, expand=True)
train['FirstName'] = train_name_parts[0]
train['SecondName'] = train_name_parts[1]

In [None]:
# Replace the name strings with the integer codes of the already-fitted encoders.
train['FirstName'] = le.transform(train['FirstName'])
train['SecondName'] = le1.transform(train['SecondName'])

Encoding the sex feature

In [None]:
# Binary encoding: 'female' -> 1, anything else -> 0.
# NOTE: not idempotent — re-running on the already-encoded column yields all zeros.
train['Sex'] = (train['Sex'] == 'female').astype('int64')

In [None]:
train['Sex'].value_counts()

One Hot encoder for Pclass

In [None]:
# Treat passenger class as a categorical value: cast to string, then one-hot encode
# (columns prefixed with 'class').
train['Pclass'] = train['Pclass'].astype('str')
df_pclass = pd.get_dummies(train['Pclass'], prefix='class')

In [None]:
df_pclass.head()

Get the features related to family size. The idea is taken from:
https://medium.datadriveninvestor.com/start-with-kaggle-a-comprehensive-guide-to-solve-the-titanic-challenge-8ac5815b0473

In [None]:
# New feature: family size = parents/children + siblings/spouses + the passenger.
train['FamilySize'] = train['Parch'] + train['SibSp'] + 1

In [None]:
# Three mutually exclusive 0/1 flags derived from the family size:
# travelling alone, family of 2-4, family of 5 or more.
family_size = train['FamilySize']
train['Singleton'] = (family_size == 1).astype('int64')
train['SmallFamily'] = family_size.between(2, 4).astype('int64')
train['LargeFamily'] = (family_size >= 5).astype('int64')

In [None]:
# Assemble the modelling frame column-wise: numeric/engineered columns, the one-hot
# blocks, the encoded names, the bin dummies and finally the target.
df = pd.concat(
    [
        train[['Fare', 'Age', 'FamilySize', 'Singleton', 'SmallFamily', 'LargeFamily', 'Sex']],
        df_cabin,
        df_tiket,
        df_pclass,
        df_embarked,
        train[['FirstName', 'SecondName']],
        df_Age_bin,
        df_Fare_bin,
        train['Survived'],
    ],
    axis=1,
)

In [None]:
df.columns

In [None]:
# Correlation heatmap of all assembled columns (target included) on a 15x10 inch canvas.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data=df.corr(), ax=ax)

In [None]:
# Remove the target so `df` holds features only.
# NOTE: not idempotent — a second run raises KeyError because the column is gone.
df = df.drop(columns='Survived')

Adding 3 more features with Kmeans

In [None]:
# Cluster the feature rows into 3 groups; `km` is kept so the same fitted model can
# assign clusters to other data later.
km = KMeans(n_clusters=3, random_state=22, n_init=20)
cluster_ids = km.fit_predict(df)

# One-hot encode the cluster id (cast to string first) on the same index as `df`.
df_km = pd.get_dummies(pd.DataFrame(cluster_ids, index=df.index).astype('str'))

In [None]:
df_km.head()

In [None]:
# Append the cluster dummies to the feature frame.
# NOTE: re-running this cell appends the same columns again.
df = pd.concat([df, df_km], axis=1)

In [None]:
# Target vector, taken from `train` so it shares the PassengerId index with `df`.
df_target = train['Survived']

In [None]:
# Random forest fitted on the full feature frame; its importances are plotted below.
# A fixed `random_state` makes the importance ranking reproducible between runs
# (22 matches the seed used for KMeans and the CV splitter in this notebook).
clf = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=22)
clf = clf.fit(df, df_target)

In [None]:
# Importance table, sorted ascending so the largest bar ends up at the top of the chart.
features = (
    pd.DataFrame({'feature': df.columns, 'importance': clf.feature_importances_})
    .sort_values(by=['importance'], ascending=True)
    .set_index('feature')
)

features.plot(kind='barh', figsize=(25, 25))

# Using Optuna with Lgbm

In [None]:
import optuna

In [None]:
def objective(trial , data = df , target = df_target):
    """Optuna objective: fit one LightGBM classifier and return its hold-out accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : DataFrame
        Feature matrix; defaults to the notebook-level ``df``.
    target : Series
        Labels aligned with ``data``; defaults to the notebook-level ``df_target``.

    Returns
    -------
    float
        Accuracy on the held-out split.
    """
    # Fixed split, so every trial is scored on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.028059109276941666, random_state=2
    )

    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # `log=True` keeps the log-uniform sampling of the original ranges.
    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 12, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 12, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 11, 900),
        'learning_rate': trial.suggest_float('learning_rate', 0.0000001, 0.2),
        'max_depth': trial.suggest_int('max_depth', 5, 400),
        'n_estimators': trial.suggest_int('n_estimators', 1, 9999),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 110),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1, log=True),
        'subsample': trial.suggest_float('subsample', 1e-5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-5, 1, log=True),
        'random_state': trial.suggest_categorical('random_state', [2, 22, 222, 2222]),
        # NOTE(review): 'accuracy' does not look like a built-in LightGBM metric name —
        # confirm; the metric actually evaluated comes from `eval_metric` below.
        'metric': 'accuracy',
        'device_type': 'cpu',
    }
    model = lightgbm.LGBMClassifier(**params)
    # Early stopping goes through a callback: the `early_stopping_rounds` and `verbose`
    # keyword arguments of `fit` were removed in LightGBM 4.0.
    # NOTE(review): the same hold-out split drives early stopping and the returned score.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        eval_metric='logloss',
        callbacks=[lightgbm.early_stopping(stopping_rounds=3000, verbose=False)],
    )
    preds = model.predict(test_x)
    return accuracy_score(test_y, preds)

In [None]:
# Create a maximising study and run the objective.
# NOTE: n_trials = 1 evaluates a single sampled configuration only.
study = optuna.create_study(direction = 'maximize' , study_name = 'lgbm')
study.optimize(objective , n_trials = 1)
print('numbers of the finished trials:' , len(study.trials))
print('the best params:' , study.best_trial.params)
print('the best value:' , study.best_value)

In [None]:
# Recorded hyperparameter set; noted best value: 0.7808267997148967.
# NOTE(review): random_state 1509 is not among the choices sampled by `objective`.
params = {
    'reg_alpha': 0.000493095633250276,
    'reg_lambda': 0.2799468729577344,
    'num_leaves': 220,
    'learning_rate': 0.058683299033376934,
    'max_depth': 97,
    'n_estimators': 9161,
    'min_child_samples': 108,
    'min_child_weight': 1.7359084365325016e-05,
    'subsample': 0.7381682823837273,
    'colsample_bytree': 0.29845810314125426,
    'random_state': 1509,
}

In [None]:
# Recorded hyperparameter set; noted best value: 0.7811831789023521.
params2 = {
    'reg_alpha': 0.02242367265240423,
    'reg_lambda': 0.0006085533155144086,
    'num_leaves': 238,
    'learning_rate': 0.03240605916351265,
    'max_depth': 65,
    'n_estimators': 5361,
    'min_child_samples': 27,
    'min_child_weight': 0.00011308353926700071,
    'subsample': 0.5688435861948473,
    'colsample_bytree': 0.06746586089945723,
    'random_state': 22,
}

In [None]:
# Recorded hyperparameter set; noted best value: 0.7804704205274412.
# This is the set passed to LightGBM in the cross-validation loop further down.
params1 = {
    'reg_alpha': 0.009415444471348289,
    'reg_lambda': 1.2556528225033043,
    'num_leaves': 25,
    'learning_rate': 0.00835886426230468,
    'max_depth': 230,
    'n_estimators': 3653,
    'min_child_samples': 9,
    'min_child_weight': 0.0002224399318225647,
    'subsample': 0.9780174338845454,
    'colsample_bytree': 0.7969641118752326,
    'random_state': 1,
}

## Preprocessing the Test_set

In [None]:
# Apply to the test set the same preprocessing steps that were applied to `train`.
test = test.set_index('PassengerId')

#Age
# Impute with the pooled median AND round to whole years, exactly as done for `train`;
# without the rounding, ages near a bin edge (e.g. 4.6) fall into a different age band
# in test than the same value does in train.
test['Age'] = test['Age'].fillna(dfg['Age'].median()).round(0)
test['Age_Bin'] = pd.cut(x=test['Age'],
                            bins=[0, 5 , 10, 15, 20, 30, 50,1000],
                            labels=bins,right=False)
test['Age_Bin'] = test['Age_Bin'].astype('str')
# Uses the raw Sex strings, so this must run before Sex is encoded below.
test['Age_Bin'] = test['Age_Bin']+test['Sex']
dft_Age_bin = pd.get_dummies(test['Age_Bin'], prefix='Age_bin')


#Fare
# Same hard-coded (Sex, Pclass) fill values as used for `train`.
test_fare_fill_values = {
    ('female', 1): 85.40,
    ('female', 2): 24.75,
    ('female', 3): 12.54,
    ('male', 1): 64.51,
    ('male', 2): 14.23,
    ('male', 3): 11.02,
}
for (sex, pclass), fare in test_fare_fill_values.items():
    missing_fare = test['Fare'].isna() & (test['Pclass'] == pclass) & (test['Sex'] == sex)
    test.loc[missing_fare, 'Fare'] = fare

test['Fare_Bin'] = pd.cut(x=test['Fare'],
                            bins=[0,11, 30 , 60, 10000],
                            labels=bins2,right=False)

test['Fare_Bin'] = test['Fare_Bin'].astype('str')
dft_Fare_bin = pd.get_dummies(test['Fare_Bin'], prefix='Fare_bin')


#Cabin
# First letter only; missing and the rare 'T' both become 'Z'.
test['Cabin'] =test['Cabin'].str[0]
test['Cabin'] = test['Cabin'].fillna('Z')
test.loc[(test['Cabin']=='T'), 'Cabin']='Z'
dft_cabin = pd.get_dummies(test['Cabin'], prefix='Cabin')

#Embarked
test['Embarked'] = test['Embarked'].fillna('S')
dft_embarked = pd.get_dummies(test['Embarked'], prefix='Embark')

#Ticket
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave digits and punctuation in place.
test['Ticket'] = test['Ticket'].str.replace('[^a-zA-Z]', '', regex=True).str[:1]
test['Ticket'] = test['Ticket'].str.strip()
# Missing, empty and 'L' tickets all share the 'ZZ' placeholder.
test['Ticket'] = test['Ticket'].fillna('ZZ')
test.loc[test['Ticket']=='', 'Ticket']='ZZ'
test.loc[test['Ticket']=='L', 'Ticket']='ZZ'
dft_tiket = pd.get_dummies(test['Ticket'], prefix='ticket')

#Name
# `n` must be passed by keyword: pandas 2.0 made it keyword-only in `str.split`.
test_name_parts = test['Name'].str.split(', ', n=1, expand=True)
test['FirstName'] = test_name_parts[0]
test['SecondName'] = test_name_parts[1]

# Encoders were fitted on the pooled train + test names.
test['FirstName'] = le.transform(test['FirstName'])
test['SecondName'] = le1.transform(test['SecondName'])

#Sex
test['Sex'] = test['Sex'].apply(lambda x: 1 if x=='female' else 0)

#Pclass
test['Pclass'] = test['Pclass'].astype('str')
dft_pclass = pd.get_dummies(test['Pclass'], prefix='class')

#Family Size
test['FamilySize'] = test['Parch'] + test['SibSp'] + 1

test['Singleton'] = test['FamilySize'].map(lambda s: 1 if s == 1 else 0)
test['SmallFamily'] = test['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
test['LargeFamily'] = test['FamilySize'].map(lambda s: 1 if 5 <= s else 0)

In [None]:
# Assemble the test feature frame in the same column order as the training frame
# (there is no target column here).
dft = pd.concat(
    [
        test[['Fare', 'Age', 'FamilySize', 'Singleton', 'SmallFamily', 'LargeFamily', 'Sex']],
        dft_cabin,
        dft_tiket,
        dft_pclass,
        dft_embarked,
        test[['FirstName', 'SecondName']],
        dft_Age_bin,
        dft_Fare_bin,
    ],
    axis=1,
)

In [None]:
# Assign each test row to one of the clusters learned on the training features,
# then one-hot encode the cluster id (cast to string first).
# NOTE(review): a cluster that never occurs in the test rows produces no dummy column.
test_cluster_ids = km.predict(dft)
dft_km = pd.get_dummies(pd.DataFrame(test_cluster_ids, index=dft.index).astype('str'))

In [None]:
# Append the cluster dummies to the test feature frame.
# NOTE: re-running this cell appends the same columns again.
dft = pd.concat([dft, dft_km], axis=1)

In [None]:
len(dft.columns)

In [None]:
len(df.columns)

In [None]:
list(set(df.columns)-set(dft.columns))

In [None]:
df_target.head()

In [None]:
# 50-fold stratified CV: per fold, fit LightGBM, wrap it in a sigmoid calibrator,
# add the fold's test-set probabilities to the running average and record
# out-of-fold probabilities for the validation rows.
# NOTE(review): 'accuracy' does not look like a built-in LightGBM metric name — confirm.
params1['metric'] = 'accuracy'
params1['device'] = 'cpu'
preds = np.zeros(dft.shape[0])
oof_preds = np.zeros(df.shape[0])
kf = StratifiedKFold(n_splits = 50 , random_state = 22 , shuffle = True)
roc = []  # per-fold validation ACCURACY (variable name kept from the original)
for fold, (trn_idx, val_idx) in enumerate(kf.split(df, df_target), start=1):
    train_x, train_y = df.iloc[trn_idx], df_target.iloc[trn_idx]
    val_x, val_y = df.iloc[val_idx], df_target.iloc[val_idx]

    model = lightgbm.LGBMClassifier(**params1)
    # Early stopping goes through a callback: the `early_stopping_rounds` and `verbose`
    # keyword arguments of `fit` were removed in LightGBM 4.0.
    model.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='logloss',
        callbacks=[lightgbm.early_stopping(stopping_rounds=8000, verbose=False)],
    )
    # NOTE(review): the calibrator is fitted on the same rows the model was trained on;
    # a prefit calibrator is normally given data the model has not seen.
    clf = CalibratedClassifierCV(model, cv='prefit', method='sigmoid')
    clf.fit(train_x, train_y)

    # Test predictions are averaged over the folds.
    preds += clf.predict_proba(dft)[:, 1] / kf.n_splits

    # True out-of-fold predictions: each row is written exactly once, by the fold in
    # which it was held out. (Averaging predictions over ALL rows in every fold, as the
    # original did, mixes in-sample predictions into `oof_preds`.)
    val_proba = clf.predict_proba(val_x)[:, 1]
    oof_preds[val_idx] = val_proba

    roc.append(accuracy_score(val_y, clf.predict(val_x)))

    # Per-fold threshold that maximises the geometric mean of TPR and TNR.
    fpr, tpr, thresholds = roc_curve(val_y, val_proba)
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    print(fold, roc[-1], 'Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

In [None]:
# ROC curve of `oof_preds` against the training labels; pick the threshold that
# maximises the geometric mean sqrt(TPR * (1 - FPR)) and display it.
fpr, tpr, thresholds = roc_curve(df_target , oof_preds)
gmeans = np.sqrt(tpr * (1-fpr))
ix = np.argmax(gmeans)
thresholds[ix]

In [None]:
# Store the fold-averaged positive-class probabilities in the submission frame.
# assumes sample_submission.csv rows are in the same order as test.csv — TODO confirm
sub_sample['Survived'] = preds

In [None]:
# Binarise the probabilities: strictly above the cut-off -> 1, otherwise 0.
# (The original note also records 0.405749 as a "simple threshold".)
survival_cutoff = 0.43147162440098286
sub_sample['Survived'] = (sub_sample['Survived'] > survival_cutoff).astype('int64')

In [None]:
# Write the submission file to the working directory, without the index column.
sub_sample.to_csv('submission.csv',index=False)