In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [None]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv', index_col = 'id')
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv', index_col = 'id')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv', index_col = 'id')

In [None]:
print('Number of NA values in train data is {}'.format(train.isna().sum().sum()))
print('Number of NA values in test data is {}'.format(test.isna().sum().sum()))

In [None]:
X = train.copy()
target = X.pop('target').copy()

In [None]:
train.head()

In [None]:
train.info()

In [None]:
X = train.copy()
y = X.pop('target').copy()

In [None]:
cat_features = [col for col in X.select_dtypes('object')]
num_features = [col for col in X.select_dtypes(['float64'])]

# categorical features exploration

In [None]:
#count unique values for each cat_col in train dataset
#There are four features (cat5, cat7, cat8, cat10) with high cardinality (>20 , one very high with 299)

for col in train[cat_features]:
    print(col, train[col].nunique())

In [None]:
#count unique values for each cat_col in test dataset
#There are four features (cat5, cat7, cat8, cat10) with high cardinality (>20 , one very high with 295)

for col in test[cat_features]:
    print(col, test[col].nunique())

# #the nunique values were different for some features between train and test

In [None]:
# group columns according to cardinality
low_cardinality_col = []
high_cardinality_col = []

for col in train[cat_features]:
    if train[col].nunique() <= 20:
        low_cardinality_col.append(col)
    else:
        high_cardinality_col.append(col)

In [None]:
print(f"low_cardinality_col:", low_cardinality_col)
print(f"high_cardinality_col:", high_cardinality_col)

In [None]:
def cat_feature_distribution(df):
    for col in cat_features:
        dfg = df.groupby(col).agg(freq=(col, lambda x :x.count())).sort_values('freq',ascending = False)
        dfg.plot(kind = 'bar', figsize = (20,8))
        plt.ylabel('Frequency of Taregt', fontsize = 20)
        plt.title('{}'.format(col), fontsize =20)

In [None]:
#check cat_features distribution in train dataset
cat_feature_distribution(train)

In [None]:
#check cat_features distribution in test dataset
cat_feature_distribution(test)

In [None]:
#check cat_features between train and test
#combine both datasets
train['source'] = 'train'
test['source'] = 'test'
dfm = pd.concat([train, test], axis = 0)

In [None]:
dfm.shape

In [None]:
#check cat_features diffence between train and test
for col in cat_features:
    dfg = dfm.groupby(['source',col]).agg(freq=(col, lambda x :x.count())).sort_values('freq',ascending = False).reset_index()
    plt.figure(figsize = (20,8))
    sns.barplot(x=col, y ='freq', hue= 'source', data = dfg)
    plt.title('The frequency distribution for {} between train and test'.format(col), fontsize = 30)
    plt.ylabel('Freqency')
    

## cat1: combine D,E as other
## cat2: combine N,H,B,S,U,R,K,E as other
## cat3: combine K,G,L,J,H,I,N as other
## cat4: keep E,F,G,D,H,J,I,K,M
## cat5: keep BI, AB, BU,K,G,BQ,N,CL,AL,BO,AY
## cat6: combine Q,W as other
## cat9: combine W,O,U,X,S as other

# numerial features

In [None]:
#distribution for num_feature
train[num_features].describe()

In [None]:
#histograph for num_feature
train[num_features].hist(bins = 10, figsize = (20,15));
#sns.distplot(train[num_features], kde= True)

In [None]:
#correlation matrix
def corrplot(df, method="pearson", annot=True, **kwargs):
    sns.clustermap(
        df.corr(method),
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",
        annot=annot,
        **kwargs,
    )



In [None]:
corrplot(train[num_features], annot = True)

In [None]:
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

In [None]:
def boxplot_plot(df):
    plt.figure(figsize = (20, 8))
    sns.boxplot(x = 'variable', y = 'value', data = pd.melt(df[num_features]))
    plt.xlabel('Num_features', fontsize = 20)
    plt.ylabel('Value', fontsize = 20)
    plt.title('Boxplot for numerical features in {} dataset'.format(get_df_name(df)), fontsize = 30)
    

In [None]:
#boxplot for train dataset
boxplot_plot(train)

In [None]:
#boxplot for test dataset
boxplot_plot(test)

# target variable

In [None]:
plt.figure(figsize = (10,6))
ax = sns.countplot(x= target, data = train)
ax.set_title("Target distrubution", fontsize = 20, y = 1.05);

## feature engineer
### cat1: combine D,E as other
### cat2: combine N,H,B,S,U,R,K,E as other
### cat3: combine K,G,L,J,H,I,N as other
### cat4: keep E,F,G,D,H,J,I,K,M
### cat5: keep BI, AB, BU,K,G,BQ,N,CL,AL,BO,AY
### cat6: combine Q,W as other
### cat9: combine W,O,U,X,S as other

In [None]:
dfm['cat1'] =  dfm['cat1'].apply(lambda x : x if x not in ['D', 'E'] else 'other')
dfm['cat2'] =  dfm['cat2'].apply(lambda x : x if x not in ['N','H','B','S','U','R','K','E'] else 'other')
dfm['cat3'] =  dfm['cat3'].apply(lambda x : x if x not in ['K','G','L','J','H','I','N'] else 'other')
dfm['cat4'] =  dfm['cat4'].apply(lambda x : x if x not in ['E','F','G','D','H','J','I','K','M'] else 'other')
dfm['cat5'] =  dfm['cat5'].apply(lambda x : x if x in ['BI', 'AB', 'BU','K','G','BQ','N','CL','AL','BO','AY'] else 'other')
dfm['cat6'] =  dfm['cat6'].apply(lambda x : x if x not in ['W', 'Q'] else 'other')
dfm['cat9'] =  dfm['cat9'].apply(lambda x : x if x not in ['W','O','U','X','S'] else 'other')

In [None]:
#create a function to combine less frequence < 1%  as other in cat10 
def less_freq_other(df):
    threshold_percent = 1
    series = pd.value_counts(df['cat10'])
    mask = (series / series.sum() *100).lt(threshold_percent)
    #df['cat10'] = np.where(df['cat10'].isin(series[mask].index), 'other', df['cat10'])
    df.loc[df['cat10'].isin(series[mask].index.tolist()), 'cat10'] = 'other'

In [None]:
less_freq_other(dfm)

In [None]:
dfm.head()

 # data preparation and modeling



In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, make_scorer
from sklearn.linear_model import LogisticRegression

In [None]:
df1 = dfm.drop('source', axis =1).copy()

In [None]:
#label encode categorical features
for col in cat_features:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df1[col])
    label_train = df1.iloc[:len(train), :]
    label_test = df1.iloc[len(train):, :]
    label_test = label_test.drop('target', axis =1)

In [None]:
print('label_train:', label_train.shape)
print('label_test:', label_test.shape)

In [None]:
x= label_train.drop('target', axis = 1)
y = label_train['target']
x_train,  x_valid, y_train, y_valid = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [None]:
#xgboost
xgb = XGBClassifier()

xgb.fit(x_train, y_train, verbose=False)
predictions = xgb.predict_proba(x_valid)[:,1]

auc = roc_auc_score(y_valid, predictions)

print(f'Baseline Score: {auc}')

In [None]:
#LGBM
lgbm = LGBMClassifier()

lgbm.fit(x_train, y_train, eval_set=(x_valid,y_valid), early_stopping_rounds=150, verbose=False)
predictions = lgbm.predict_proba(x_valid)[:,1]

auc = roc_auc_score(y_valid, predictions)

print(f'Baseline Score: {auc}')

In [None]:
#catboost
cat_model = CatBoostClassifier(verbose=0,
                                eval_metric="AUC",
                                random_state=42)
cat_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])

cat_pred = cat_model.predict_proba(x_valid)[:,1]
cat_auc = roc_auc_score(y_valid, cat_pred)
print(f'AUC score for catboost: {cat_auc}')

In [None]:
#random forest
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(x_train, y_train)
rf_pred = rf.predict_proba(x_valid)[:,1]
rf_auc = roc_auc_score(y_valid, rf_pred)
print(f'AUC score for randomforest: {rf_auc}')

In [None]:
#logisticregression
lg = LogisticRegression(solver='liblinear')
lg.fit(x_train, y_train)
lg_pred = lg.predict_proba(x_valid)[:,1]
lg_auc = roc_auc_score(y_valid, cat_pred)
print(f'AUC score for Logistic Regression: {lg_auc}')

## among all models, catboost and logisticregression performs best with highest AUC

# model tuning

In [None]:
clf = CatBoostClassifier()
params = {'iterations': [500],
          'depth': [4, 5, 6],
          'loss_function': ['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
#           'eval_metric': ['Accuracy'],
#           'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [42]
         }
scorer = make_scorer(accuracy_score)
clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=5)


In [None]:
clf_grid.fit(x_train, y_train)
best_param = clf_grid.best_params_
best_param

In [None]:
#fit the best model
cat_model_final = CatBoostClassifier(iterations=500,
                           loss_function=best_param['loss_function'],
                           depth=best_param['depth'],
                           l2_leaf_reg=best_param['l2_leaf_reg'],
                           eval_metric='AUC',
                           leaf_estimation_iterations=10,
                           use_best_model=True,
                           logging_level='Silent',
                           random_seed=42
                          )

In [None]:
train_pool = Pool(x_train, y_train)

In [None]:
cat_model_final.fit(train_pool, eval_set=(x_valid,y_valid))


In [None]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv', index_col = 'id')

In [None]:
#get an ensemble model based on both catboost and logistic regression
pred = cat_model_final.predict(label_test)

In [None]:
submission['target'] = pred

In [None]:
submission.reset_index(inplace = True)

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)