In [None]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from functools import partial
import optuna
from optuna.samplers import TPESampler

from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Read the train and test CSVs in one pass; `df` is the labelled training
# frame, `X_test` the raw test frame. The cell displays the first training rows.
df, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)
df.head()

In [None]:
# Number of missing values in each training column.
df.isnull().sum()

In [None]:
# Split the training frame by dtype. Despite the names, both results are
# sub-DataFrames (not lists of column names).
cat_cols = df.select_dtypes(include='object')
num_cols = df.select_dtypes(include=['int', 'float'])

In [None]:
# Passenger counts per sex, drawn with matplotlib's default style.
plt.style.use('default')
plt.figure(figsize=(5, 5))
sns.countplot(data=df, x='Sex')

In [None]:
# Passenger counts per embarkation port, split by ticket class.
# Rows with a missing port are dropped from the x vector before plotting.
plt.figure(figsize=(6, 5))
sns.countplot(data=df, x=df['Embarked'].dropna(), hue='Pclass')

In [None]:
# Side-by-side histograms (with KDE overlay) of the two continuous features.
cols = ['Age','Fare']

fig = plt.figure()
for position, col in enumerate(cols, start=1):
    # add_subplot makes the new axes current; it is also passed explicitly below.
    ax = fig.add_subplot(1, 2, position)
    fig.set_size_inches(8, 4)

    sns.histplot(data=df, x=df[col], bins=30, kde=True, ax=ax)

    ax.set_xlabel(col, fontsize=10)
    plt.tight_layout()

In [None]:
# Age density, one filled curve per Parch value.
plt.figure()
sns.kdeplot(
    data=df,
    x='Age',
    hue='Parch',
    palette='Set1',
    fill=True,
    linewidth=1.8,
)

In [None]:
# Mean fare for each Parch value, sorted ascending and drawn as horizontal bars.
mean_fare_by_parch = df.groupby('Parch')['Fare'].mean().sort_values()
ax = mean_fare_by_parch.plot.barh(figsize=(6, 4), color='steelblue')
ax.set_xlabel('Fare', fontsize=10)
ax.set_ylabel('No. of Parent/children', fontsize=10)
ax.set_title('Average Fare Analysis for number of parent/children',
             fontsize=10, x=0.5, y=1.05)

In [None]:
# Bar plot of Fare for each ticket class.
plt.figure(figsize=(5, 5))
ax = sns.barplot(data=df, x='Pclass', y='Fare')
ax.set_title('Pclass vs Fare', fontsize=10)

### Correlation Analysis

In [None]:
# Correlate the numeric columns only. The training frame also holds string
# columns (Name, Sex, Ticket, ...); pandas >= 2.0 raises when corr() is
# given those, while older versions silently dropped them. Selecting the
# numeric columns explicitly gives the same matrix on every version.
corr = df.select_dtypes('number').corr()
sns.heatmap(corr, annot=True, fmt='0.2f', robust=True)
plt.title('Correlation Analysis')

In [None]:
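# Disabled experiment: log-transform Fare. If re-enabled, consider np.log1p,
# since np.log returns -inf for a fare of 0.
#df['Fare'] = np.log(df['Fare'])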

In [None]:
# Target vector and working feature frame. 'Survived' stays in X for now;
# the feature-engineering cell selects the final model columns explicitly.
y = df['Survived']
X = df.drop(columns=['PassengerId', 'Name'])
X.head(10)

In [None]:
# Impute Age, Fare and Embarked
# Edited from https://www.kaggle.com/antonellomartiello/tpg-features-eng-optuna-lgbm

# Age: fill gaps with the median over train + test combined, then round to 1 dp.
dfg = pd.concat([X, X_test], axis=0)
X['Age'] = X['Age'].fillna(dfg['Age'].median()).round(1)

# Fare: one fixed fill value per (Sex, Pclass) group.
# Reference query: dfg.groupby(by=['Sex','Pclass'])['Fare'].median()
fare_fill_values = {
    ('female', 1): 85.40,
    ('female', 2): 24.75,
    ('female', 3): 12.54,
    ('male', 1): 64.51,
    ('male', 2): 14.23,
    ('male', 3): 11.02,
}
for (sex, pclass), fare in fare_fill_values.items():
    needs_fare = X['Fare'].isna() & (X['Pclass'] == pclass) & (X['Sex'] == sex)
    X.loc[needs_fare, 'Fare'] = fare

# Embarked: missing ports are set to 'S'.
X['Embarked'] = X['Embarked'].fillna('S')

# Visual check of remaining missing values per column.
msno.bar(X)

In [None]:
# Build the model matrix for the training rows: one-hot encoded Age/Fare bands,
# class, port, cabin deck and ticket prefix, plus family-size indicators.
# NOTE: X is overwritten at the end of this cell, so the cell cannot be re-run
# on its own; re-run from the cell that creates X.

# Age bands are left-closed: [0,10), [10,20), [20,30), [30,50), [50,1000).
bins = ['Y1', 'Y2', 'M1', 'M2', 'E']
X['Age_Bin'] = pd.cut(x=X['Age'],
                      bins=[0, 10, 20, 30, 50, 1000],
                      labels=bins, right=False)
X['Age_Bin'] = X['Age_Bin'].astype('str')
df_Age_bin = pd.get_dummies(X['Age_Bin'], prefix='Age_bin')

# Fare bands are left-closed: [0,11), [11,30), [30,60), [60,10000).
bins2 = ['L1', 'L2', 'L3', 'L4']
X['Fare_Bin'] = pd.cut(x=X['Fare'],
                       bins=[0, 11, 30, 60, 10000],
                       labels=bins2, right=False)
X['Fare_Bin'] = X['Fare_Bin'].astype('str')
df_Fare_bin = pd.get_dummies(X['Fare_Bin'], prefix='Fare_bin')

# Port of embarkation; the first level is dropped as the reference category.
df_Embarked = pd.get_dummies(X['Embarked'], prefix='Embark', drop_first=True)

# Sex as a 0/1 flag (1 = male).
X['Sex'] = X['Sex'].apply(lambda x: 1 if x=='male' else 0)

# Ticket class treated as a categorical, not an ordinal number.
X['Pclass'] = X['Pclass'].astype('str')
df_Pclass = pd.get_dummies(X['Pclass'], prefix='class')

# Cabin: keep only the first character; missing values and 'T' are folded into 'Z'.
dfg['Cabin'] = dfg['Cabin'].str[0]
X['Cabin'] = X['Cabin'].str[0]
X['Cabin'] = X['Cabin'].fillna('Z')
X.loc[(X['Cabin']=='T'), 'Cabin']='Z'
df_cabin = pd.get_dummies(X['Cabin'], prefix='Cabin')

# Ticket: strip every non-letter and keep the first remaining letter.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the ticket text untouched.
X['Ticket'] = X['Ticket'].str.replace('[^a-zA-Z]', '', regex=True).str[:1]
X['Ticket'] = X['Ticket'].str.strip()
X['Ticket'] = X['Ticket'].fillna('ZZ')
# Tickets with no letters, and the 'L' prefix, share the 'ZZ' bucket.
X.loc[X['Ticket']=='', 'Ticket']='ZZ'
X.loc[X['Ticket']=='L', 'Ticket']='ZZ'
df_ticket = pd.get_dummies(X['Ticket'], prefix='ticket')

# Family size (self included) and three mutually exclusive size flags.
X['FamilySize'] = X['Parch'] + X['SibSp'] + 1
X['Singleton'] = X['FamilySize'].map(lambda s: 1 if s == 1 else 0)
X['SmallFamily'] = X['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
X['LargeFamily'] = X['FamilySize'].map(lambda s: 1 if 5 <= s else 0)

# Final model matrix: only the engineered columns are kept.
X = pd.concat([X['SibSp'], X['FamilySize'],X['Sex'], df_Age_bin,df_Fare_bin,
                df_Pclass, df_Embarked, df_cabin, df_ticket,
                X['Singleton'],X['SmallFamily'],X['LargeFamily']], axis=1)
X.head()

### Splitting the data

In [None]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=2021, test_size=0.25
)

In [None]:
def score(model,X_train,y_train,X_val,y_val):
    """Fit a default-configured estimator and print its validation metrics.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance); it is called with no arguments
        to build the estimator.
    X_train, y_train
        Data the estimator is fitted on.
    X_val, y_val
        Held-out data used for the printed accuracy and classification report.

    Returns
    -------
    None
        Results are printed only.
    """
    print("Model :", model)
    print('-' * 20)

    estimator = model()
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_val)

    print("Accuracy:", accuracy_score(y_val, preds))
    # Header, blank line, then the report, the same layout as two separate prints.
    print("Classification report:\n", classification_report(y_val, preds), sep='\n')

In [None]:
# Baseline: default random forest scored on the validation split.
score(RandomForestClassifier,X_train,y_train,X_val,y_val)

# Small forest used only to rank features. It is seeded because, with just
# five trees, an unseeded forest gives a different ranking on every run.
rf = RandomForestClassifier(n_estimators=5, random_state=2021)
rf.fit(X_train, y_train)

# Read the importances once, then plot them in ascending order.
importances = rf.feature_importances_
sorted_idx = importances.argsort()
plt.barh(X_train.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

In [None]:
# Baseline: k-nearest neighbours with default settings.
score(model=KNeighborsClassifier,
      X_train=X_train, y_train=y_train,
      X_val=X_val, y_val=y_val)

In [None]:
# Baseline: AdaBoost with default settings.
score(model=AdaBoostClassifier,
      X_train=X_train, y_train=y_train,
      X_val=X_val, y_val=y_val)

In [None]:
# Baseline: XGBoost with default settings.
score(model=XGBClassifier,
      X_train=X_train, y_train=y_train,
      X_val=X_val, y_val=y_val)

In [None]:
# Baseline: LightGBM with default settings.
score(model=LGBMClassifier,
      X_train=X_train, y_train=y_train,
      X_val=X_val, y_val=y_val)

### Hyperparameter tuning using Optuna

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold CV accuracy of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Mean accuracy from ``cross_val_score`` on the module-level
        ``X_train`` / ``y_train``; the study maximises this value.
    """
    num_iterations = trial.suggest_int('num_iterations',50,500)
    max_depth = trial.suggest_int('max_depth',3,10)
    num_leaves = trial.suggest_int('num_leaves',10,30)
    learning_rate = trial.suggest_uniform('learning_rate',0.01,0.2)
    subsample = trial.suggest_uniform('subsample',0.5, 1.0)
    feature_fraction = trial.suggest_uniform('feature fraction',0.5, 1.0)
    # No trailing commas here: they previously wrapped these two values in
    # 1-tuples, so the model received (n,) instead of a scalar.
    min_child_samples = trial.suggest_int('min_child_samples', 1, 110)
    min_child_weight = trial.suggest_loguniform('min_child_weight' , 1e-5 , 1)
    lambda_l2 = trial.suggest_uniform('lambda_l2',1e-5,20)

    model = LGBMClassifier(
            num_iterations = num_iterations,
            max_depth = max_depth,
            num_leaves = num_leaves,
            learning_rate = learning_rate,
            subsample = subsample,
            feature_fraction = feature_fraction,
            min_child_samples = min_child_samples,
            min_child_weight = min_child_weight,
            lambda_l2 = lambda_l2
            )

    acc = cross_val_score(model,X_train,y_train,scoring = 'accuracy',cv = 10).mean()
    return acc

In [None]:
# Seeded TPE sampler so the sampled configurations are repeatable.
sampler = TPESampler(seed=1111)
study = optuna.create_study(sampler=sampler, direction='maximize')

# Only a single trial is run here; raise n_trials for a real search.
study.optimize(objective, n_trials=1)

print('numbers of the finished trials:', len(study.trials))
for best in (study.best_value, study.best_params):
    print(best)

In [None]:
# LightGBM with hard-coded hyperparameters (presumably the best values from an
# earlier, longer Optuna run; the study cell in this notebook runs one trial only).
lgbm = LGBMClassifier(num_iterations = 51,
                    max_depth = 9,
                    num_leaves = 21,
                    learning_rate = 0.16026276903571482,
                    subsample = 0.5238932582911326,
                    feature_fraction = 0.5433472981004331,
                    min_child_samples = 83,
                    min_child_weight = 0.00027334704058322765,
                    lambda_l2 = 13.608429722484894
                   )

# `verbose=` is no longer passed to fit(): LightGBM 4.0 removed it (TypeError),
# and with no eval_set there is no per-iteration logging to silence anyway.
lgbm.fit(X_train, y_train, eval_metric='logloss')

# Validation metrics for the tuned model.
preds = lgbm.predict(X_val)
print('Accuracy :',accuracy_score(y_val, preds))
print('Classification report:\n')
print(classification_report(y_val,preds))

In [None]:
# Work on the test set: reload it, set the ids aside, and impute Age, Fare and
# Embarked using the same fill values as the training rows.
X_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

# preserve ids for submission
X_test_PId = X_test['PassengerId']
X_test = X_test.drop(columns=['PassengerId'])

# Age: median of train + test combined (taken from dfg), rounded to 1 dp.
X_test['Age'] = X_test['Age'].fillna(dfg['Age'].median()).round(1)

# Fare: one fixed fill value per (Sex, Pclass) group.
# Reference query: dfg.groupby(by=['Sex','Pclass'])['Fare'].median()
fare_fill_values = {
    ('female', 1): 85.40,
    ('female', 2): 24.75,
    ('female', 3): 12.54,
    ('male', 1): 64.51,
    ('male', 2): 14.23,
    ('male', 3): 11.02,
}
for (sex, pclass), fare in fare_fill_values.items():
    needs_fare = (
        X_test['Fare'].isna()
        & (X_test['Pclass'] == pclass)
        & (X_test['Sex'] == sex)
    )
    X_test.loc[needs_fare, 'Fare'] = fare

# Embarked: missing ports are set to 'S'.
X_test['Embarked'] = X_test['Embarked'].fillna('S')

In [None]:

# Build the model matrix for the test rows, mirroring the training
# feature-engineering cell step by step.
# NOTE: X_test is overwritten at the end of this cell, so the cell cannot be
# re-run on its own; re-run from the cell that loads X_test.

# Age bands must use the same edges as training ([0, 10, 20, 30, 50, 1000]).
# This cell previously used 16 as the first edge, so Age_bin_Y1 / Age_bin_Y2
# meant something different at prediction time than at fit time.
bins = ['Y1', 'Y2', 'M1', 'M2', 'E']
X_test['Age_Bin'] = pd.cut(x=X_test['Age'],
                           bins=[0, 10, 20, 30, 50, 1000],
                           labels=bins, right=False)
X_test['Age_Bin'] = X_test['Age_Bin'].astype('str')
df_Age_bin = pd.get_dummies(X_test['Age_Bin'], prefix='Age_bin')

# Fare bands are left-closed: [0,11), [11,30), [30,60), [60,10000).
bins2 = ['L1', 'L2', 'L3', 'L4']
X_test['Fare_Bin'] = pd.cut(x=X_test['Fare'],
                            bins=[0, 11, 30, 60, 10000],
                            labels=bins2, right=False)
X_test['Fare_Bin'] = X_test['Fare_Bin'].astype('str')
df_Fare_bin = pd.get_dummies(X_test['Fare_Bin'], prefix='Fare_bin')

# Port of embarkation; the first level is dropped as the reference category.
df_Embarked = pd.get_dummies(X_test['Embarked'], prefix='Embark', drop_first=True)

# Sex as a 0/1 flag (1 = male).
X_test['Sex'] = X_test['Sex'].apply(lambda x: 1 if x=='male' else 0)

# Ticket class treated as a categorical, not an ordinal number.
X_test['Pclass'] = X_test['Pclass'].astype('str')
df_Pclass = pd.get_dummies(X_test['Pclass'], prefix='class')

# Cabin: keep only the first character; missing values and 'T' are folded into 'Z'.
X_test['Cabin'] = X_test['Cabin'].str[0]
X_test['Cabin'] = X_test['Cabin'].fillna('Z')
X_test.loc[(X_test['Cabin']=='T'), 'Cabin']='Z'
df_cabin = pd.get_dummies(X_test['Cabin'], prefix='Cabin')

# Ticket: strip every non-letter and keep the first remaining letter.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the ticket text untouched.
X_test['Ticket'] = X_test['Ticket'].str.replace('[^a-zA-Z]', '', regex=True).str[:1]
X_test['Ticket'] = X_test['Ticket'].str.strip()
X_test['Ticket'] = X_test['Ticket'].fillna('ZZ')
# Tickets with no letters, and the 'L' prefix, share the 'ZZ' bucket.
X_test.loc[X_test['Ticket']=='', 'Ticket']='ZZ'
X_test.loc[X_test['Ticket']=='L', 'Ticket']='ZZ'
df_ticket = pd.get_dummies(X_test['Ticket'], prefix='ticket')

# Family size (self included) and three mutually exclusive size flags.
X_test['FamilySize'] = X_test['Parch'] + X_test['SibSp'] + 1
X_test['Singleton'] = X_test['FamilySize'].map(lambda s: 1 if s == 1 else 0)
X_test['SmallFamily'] = X_test['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
X_test['LargeFamily'] = X_test['FamilySize'].map(lambda s: 1 if 5 <= s else 0)

# Final model matrix: only the engineered columns are kept.
X_test = pd.concat([X_test['SibSp'], X_test['FamilySize'],X_test['Sex'], df_Age_bin,df_Fare_bin,
                df_Pclass, df_Embarked, df_cabin, df_ticket,
                X_test['Singleton'],X_test['SmallFamily'],X_test['LargeFamily']], axis=1)

X_test.head()

### Predicting on test data

In [None]:
# Predict on exactly the columns (and column order) the model was fitted on.
# get_dummies is run separately on train and test, so a category missing from
# one side would otherwise change the feature set; absent dummies are filled
# with 0. The reindex also drops a 'predictions' column left by an earlier run
# of this cell, so the cell can be re-run safely.
X_test['predictions'] = lgbm.predict(
    X_test.reindex(columns=X_train.columns, fill_value=0)
)

### Submission

In [None]:
# Pair each preserved PassengerId with its predicted label and write the
# submission CSV without the index column.
submission = pd.concat(
    [
        X_test_PId.rename('PassengerId'),
        X_test['predictions'].rename('Survived'),
    ],
    axis=1,
)
submission.to_csv('my_submission.csv', index=False)
submission.head()