In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
from scipy.stats import chi2_contingency
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Single place to change if the competition data is mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-mar-2021'
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# Summary of the training frame: column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Non-object columns (target excluded) are treated as numerical, object columns as categorical.
numerical_features = list(
    train_df.drop(columns='target').select_dtypes(exclude=['object']).columns
)
categorical_features = list(train_df.select_dtypes(include=['object']).columns)

<h1> Univariate Analysis </h1>

In [None]:
# One bar chart per categorical column showing how often each label occurs.
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(20, 6))
    label_counts = train_df[col].value_counts()
    label_counts.plot.bar(ax=ax)
    ax.set_title(f'distribution of {col}')
    ax.set_xlabel(f'{col}')
plt.show()

In [None]:
# Bar chart of the target counts, used to check for class imbalance.
fig, ax = plt.subplots(figsize=(20, 6))
target_counts = train_df['target'].value_counts()
target_counts.plot.bar(ax=ax)
ax.set_title('distribution of target')
ax.set_xlabel('target variable')
plt.show()

<p> The target variable is imbalanced: the two classes are not equally frequent. </p>

In [None]:
# distribution of numerical data, check skewness, check outliers(0.05, 0.95)
# 'id' is a row identifier, not a feature. It is filtered out with a comprehension instead of
# list.remove() so that re-running this cell does not raise ValueError once 'id' is already gone.
numerical_features = [col for col in numerical_features if col != 'id']
for col in numerical_features:
    plt.figure(figsize = (20, 6))
    sns.distplot(a = train_df[col])
    plt.title(f'distribution of {col}')
    plt.xlabel(f'{col}')
plt.show()

In [None]:
# Numerical columns whose skewness is larger than 0.5 in absolute value.
skewed_features = list(
    filter(lambda col: abs(train_df[col].skew()) > 0.5, numerical_features)
)
skewed_features

In [None]:
def outlier_detector(data, col_name):
    """Report the values of ``data[col_name]`` lying outside the 1.5 * IQR fences.

    When at least one value is strictly below Q1 - 1.5 * IQR or strictly above
    Q3 + 1.5 * IQR, two lines are printed: the number of such values and the
    column name. Nothing is printed otherwise. Returns None.
    """
    values = data[col_name]
    q1, q3 = values.quantile(.25), values.quantile(.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    n_outliers = int(((values < low_fence) | (values > high_fence)).sum())
    if n_outliers > 0:
        print(f'{n_outliers} outlier')
        print(f'{col_name} has outliers.')
# Run the IQR outlier report on every numerical feature of the training frame.
for col in numerical_features:
    outlier_detector(train_df, col)

<p>cont8, cont9, cont10 have outliers.</p>

In [None]:
# IQR fences for cont8, then the target counts among the rows that fall outside them.
cont8_q1, cont8_q3 = train_df['cont8'].quantile(.25), train_df['cont8'].quantile(.75)
IQR = cont8_q3 - cont8_q1
upper_lim = cont8_q3 + 1.5 * IQR
lower_lim = cont8_q1 - 1.5 * IQR
cont8_is_outlier = (train_df['cont8'] < lower_lim) | (train_df['cont8'] > upper_lim)
train_df.loc[cont8_is_outlier, 'target'].value_counts()

In [None]:
# IQR fences for cont10, then the target counts among the rows that fall outside them.
# Dedicated cont10_* names keep the cont8 values of `IQR`, `upper_lim` and `lower_lim`
# intact for any later cell that reads those globals.
cont10_q1, cont10_q3 = train_df['cont10'].quantile(.25), train_df['cont10'].quantile(.75)
cont10_iqr = cont10_q3 - cont10_q1
cont10_upper_lim = cont10_q3 + 1.5 * cont10_iqr
cont10_lower_lim = cont10_q1 - 1.5 * cont10_iqr
train_df[(train_df['cont10'] < cont10_lower_lim) | (train_df['cont10'] > cont10_upper_lim)]['target'].value_counts()

<p> 1487 of the 1915 outliers in the cont8 column belong to class 1. </p>

<h1> Bivariate Analysis </h1>

In [None]:
# Multicollinearity check: annotated correlation matrix of the numerical features,
# looking for pairs whose correlation exceeds 0.8.
corr_matrix = train_df[numerical_features].corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

<ul>
    <li> cont10 and cont0 are highly correlated with 0.81 </li>
    <li> cont1 and cont2 are highly correlated with 0.86 </li>
</ul>

In [None]:
# For each numerical feature: a box plot per target class (left) and the two per-class
# distributions overlaid (right).
for col in numerical_features:
    fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 6))
    sns.boxplot(data = train_df, x = 'target', y = col, ax = ax1)
    ax1.set_title(f'{col} by target')
    sns.distplot(train_df.loc[train_df['target'] == 1, col], ax = ax2, color = 'red', label = '1')
    sns.distplot(train_df.loc[train_df['target'] == 0, col], ax = ax2, label = '0')
    ax2.set_title(f'distribution of {col} by target')
    # The labels passed above are only shown once a legend is drawn.
    ax2.legend(title = 'target')
plt.show()

In [None]:
def chi2(df, target, feature):
    """Chi-square test of independence between ``df[target]`` and ``df[feature]``.

    Builds the contingency table of the two columns, runs ``chi2_contingency`` on it
    and prints the p-value, the test statistic and a verdict at the 0.05 level.
    Returns None.
    """
    contingency = pd.crosstab(df[target], df[feature])
    statistic, p_value, _dof, _expected = chi2_contingency(contingency)
    print(f'P-Value: {p_value}')
    print(f'Statistic: {statistic}')
    print('Two Variables are dependent..' if p_value < 0.05
          else 'Two Variables are independent..')

In [None]:
# Chi-square independence test of every categorical feature against the target.
for col in categorical_features:
    print(f'check dependancy for {col}')
    chi2(train_df, 'target', col)
    print('=====================')

<h1>Data Preparation</h1>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import resample, shuffle
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
from sklearn.utils import class_weight

In [None]:
# Split the training rows by class and downsample class 0 to the size of class 1.
X0 = train_df[train_df['target'] == 0]
X1 = train_df[train_df['target'] == 1]
print(f'number of majority classes before downsampling: {X0.shape[0]}')
# replace=False draws distinct rows, so the downsampled set contains no duplicated
# observations. This requires len(X1) <= len(X0), i.e. class 0 must be the majority.
X0_downsample = resample(X0,
                         replace=False,
                         n_samples=len(X1),
                         random_state=42)
print(f'number of majority classes after downsampling: {X0_downsample.shape[0]}')

In [None]:
# Columns removed from every feature frame.
DROPPED_COLS = ['id', 'cat10']

# Balanced (downsampled) data; the shuffle is seeded so the row order is reproducible.
balanced_df = shuffle(pd.concat([X0_downsample, X1]), random_state=42)
# after down sampling
X = balanced_df.drop(columns=['target'] + DROPPED_COLS)
y = balanced_df['target']
# errors='ignore' keeps this cell re-runnable after the columns have already been dropped.
test_df = test_df.drop(columns=DROPPED_COLS, errors='ignore')
# keep the same data to use XGBoost
X2 = train_df.drop(columns=['target'] + DROPPED_COLS)
y2 = train_df['target']

In [None]:
# Per-sample weights for the full training target, using the 'balanced' class weighting.
classes_weights = class_weight.compute_sample_weight('balanced', y2)
classes_weights

In [None]:
# Replace each pair of correlated columns by their product, in all three feature frames.
correlated_pairs = {
    'cont_0_10': ('cont0', 'cont10'),
    'cont_1_2': ('cont1', 'cont2'),
}
for frame in (X, X2, test_df):
    for interaction_col, (left, right) in correlated_pairs.items():
        frame[interaction_col] = frame[left] * frame[right]
    # The original columns are removed in place once both products exist.
    frame.drop(['cont0', 'cont10', 'cont1', 'cont2'], axis=1, inplace=True)

In [None]:
# Cache for the cont8 upper fence; reset whenever this cell is re-run.
_cont8_upper_lim_cache = None


def _cont8_upper_lim():
    """Upper IQR fence (Q3 + 1.5 * IQR) of train_df['cont8'], computed once and cached."""
    global _cont8_upper_lim_cache
    if _cont8_upper_lim_cache is None:
        q1 = train_df['cont8'].quantile(.25)
        q3 = train_df['cont8'].quantile(.75)
        _cont8_upper_lim_cache = q3 + 1.5 * (q3 - q1)
    return _cont8_upper_lim_cache


def outliers_cont8_ind(x, upper_limit=None):
    """Return 1 when ``x`` lies strictly above the cont8 upper outlier fence, else 0.

    Parameters
    ----------
    x : scalar
        A single cont8 value.
    upper_limit : scalar, optional
        Fence to compare against. When omitted, the upper IQR fence of
        ``train_df['cont8']`` is used. It is computed here rather than read from the
        notebook-global ``upper_lim``, whose value depends on whichever analysis
        cell assigned it last.

    Returns
    -------
    int
        1 if ``x > upper_limit``, otherwise 0.
    """
    if upper_limit is None:
        upper_limit = _cont8_upper_lim()
    return 1 if x > upper_limit else 0

In [None]:
# Add the cont8 outlier indicator column to every feature frame.
for frame in (X, test_df, X2):
    frame['cont8_outliers_ind'] = frame['cont8'].apply(outliers_cont8_ind)

In [None]:
# Split the columns of X by dtype: object columns are categorical, everything else numerical.
numerical_cols, categorical_cols = [], []
for col in X.columns:
    if X[col].dtype == 'object':
        categorical_cols.append(col)
    else:
        numerical_cols.append(col)

In [None]:
# Numerical columns of X whose skewness is larger than 0.5 in absolute value.
skewed_features = list(
    filter(lambda col: abs(X[col].skew()) > 0.5, numerical_cols)
)
skewed_features

In [None]:
from collections import Counter
def cumulatively_categorise(column, threshold=0.75, return_categories_list=True):
    """Keep the most frequent categories of ``column`` and lump the rest into 'Other'.

    Categories are taken in descending order of frequency until their cumulative
    count reaches ``int(threshold * len(column))``; every value that is not one of
    those categories is replaced by the string 'Other'.

    Parameters
    ----------
    column : pandas.Series
        Categorical values to reduce.
    threshold : float, default 0.75
        Fraction of the rows that the kept categories must cover.
    return_categories_list : bool, default True
        Whether to also return the list of kept categories.

    Returns
    -------
    pandas.Series or (pandas.Series, list)
        The transformed column, optionally followed by the kept categories in
        descending order of frequency with 'Other' appended at the end.
    """
    # Number of rows the kept categories have to cover.
    threshold_value = int(threshold * len(column))
    categories_list = []
    running_total = 0

    # most_common() already yields (category, frequency) pairs, so the frequency is
    # used directly instead of rebuilding a dict from the counter on every iteration.
    for category, frequency in Counter(column).most_common():
        running_total += frequency
        categories_list.append(category)
        if running_total >= threshold_value:
            break
    categories_list.append('Other')

    # Set membership keeps the per-row lookup constant-time.
    kept_categories = set(categories_list)
    new_column = column.apply(lambda x: x if x in kept_categories else 'Other')

    if return_categories_list:
        return new_column, categories_list
    return new_column

In [None]:
# A categorical column is "good" when X and test_df contain exactly the same set of labels.
good_label_cols = [i for i in categorical_cols if set(X[i]) == set(test_df[i])]
# Order-preserving difference: a set subtraction would list the columns in arbitrary order,
# so the printed output could change from run to run.
bad_label_cols = [i for i in categorical_cols if i not in good_label_cols]
print('good label cols \n', good_label_cols)
print('bad label cols \n', bad_label_cols)

In [None]:
# Integer-encode the categorical columns.
# The category list is built once per column from all three frames, so a given label maps to
# the same integer code in X, X2 and test_df. Encoding each frame on its own assigns codes by
# that frame's own sorted labels, which shifts them whenever one frame has a label another lacks.
for col in categorical_cols:
    all_labels = pd.concat([X[col], X2[col], test_df[col]], ignore_index=True).dropna()
    shared_categories = sorted(all_labels.unique())
    X[col] = pd.Categorical(X[col], categories=shared_categories).codes
    X2[col] = pd.Categorical(X2[col], categories=shared_categories).codes
    test_df[col] = pd.Categorical(test_df[col], categories=shared_categories).codes

In [None]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies strictly inside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR of ``df_in[col_name]``; rows on or
    outside a fence are left out. ``df_in`` itself is not modified.
    """
    values = df_in[col_name]
    lower_quartile, upper_quartile = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)
    inside_fences = (values > lower_quartile - whisker) & (values < upper_quartile + whisker)
    return df_in.loc[inside_fences]

In [None]:
# Column-wise preprocessing steps, each applied to its own list of columns:
#   log1p on the skewed columns, standard scaling on the numerical columns and
#   one-hot encoding on the categorical columns whose labels match between X and test_df.
log_step = ('log_trans', FunctionTransformer(func=np.log1p, validate=False), skewed_features)
scale_step = ('standardize', StandardScaler(), numerical_cols)
onehot_step = ('labeling', OneHotEncoder(), good_label_cols)
# Disabled experiments:
#('polynomials', PolynomialFeatures(degree = 2, interaction_only=True), numerical_cols),
#('cat_interactions', PolynomialFeatures(degree = 2, interaction_only=True), categorical_cols),
preprocessor = ColumnTransformer([log_step, scale_step, onehot_step])

In [None]:
# Fit the preprocessing steps on the full (non-resampled) training features only.
preprocessor.fit(X2)

In [None]:
# Apply the fitted preprocessor to the training and test features.
# NOTE: both names are rebound from DataFrames to the transformed output, so this cell
# cannot be run a second time without first re-running the cells that build X2 and test_df.
X2 = preprocessor.transform(X2)
test_df = preprocessor.transform(test_df)

In [None]:
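# Bayesian hyper-parameter search (BayesSearchCV) for the CatBoost classifier.
# The search is kept disabled inside the string literal below; the best parameters it found
# are recorded in the markdown that follows and hard-coded into the final model.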
'''
model = [
    {
        'name': 'CatBoost Classifier',
        'estimator': CatBoostClassifier(task_type="GPU",
                                        loss_function="Logloss",
                                        eval_metric="AUC",
                                        random_state = 42,
                                        class_weights = [0.68015181, 1.8877185],
                                        verbose = False),
        'hyperparameters':{
            'depth' : Integer(3, 14),
            'learning_rate' : Real(0.01, 1.0, 'log-uniform'),
            'iterations' : Integer(10, 1500),
            'l2_leaf_reg': Integer(2, 30)
        }
    }
]
for i in model:
    print(i['name'])
    bs=BayesSearchCV(i['estimator'],
                     search_spaces=i['hyperparameters'],
                     cv=2, n_jobs=-1,
                     scoring='roc_auc',
                     return_train_score=True,
                     n_iter = 30)
    bs.fit(X2, y2)
    print('best score: ', bs.best_score_)
    print('best parameters ; ', bs.best_params_)
    print('best model: ', bs.best_estimator_)
'''

<p>
    {
        'name': 'CatBoost Classifier',<br>
        'estimator': CatBoostClassifier(task_type="GPU",<br>
                                        loss_function="Logloss",<br>
                                        eval_metric="AUC",<br>
                                        random_state = 42,<br>
                                        class_weights = [0.68015181, 1.8877185],<br>
                                        verbose = False),<br>
        'hyperparameters':{<br>
            'depth' : Integer(3, 14),<br>
            'learning_rate' : Real(0.01, 1.0, 'log-uniform'),<br>
            'iterations' : Integer(10, 1500),<br>
            'l2_leaf_reg': Integer(2, 30)<br>
        }
    }
</p>

<p>
    #CatBoost Classifier<br>
best score:  0.8914577363279153<br>
best parameters ;  OrderedDict([('depth', 8), ('iterations', 1051), ('l2_leaf_reg', 30), ('learning_rate', 0.04913338461951537)])<br>
    #Logistic Regression<br>
    best score:  0.8806274014930269<br>
best parameters ;  {'penalty': 'l1', 'solver': 'saga'}
</p>

In [None]:
# Final CatBoost model, configured with the best hyper-parameters reported by the search
# (depth, iterations, l2_leaf_reg, learning_rate) and fitted on the preprocessed training data.
catboost_params = dict(
    task_type="GPU",
    loss_function="Logloss",
    eval_metric="AUC",
    random_state=42,
    class_weights=[0.68015181, 1.8877185],
    iterations=1051,
    learning_rate=0.04913338461951537,
    depth=8,
    l2_leaf_reg=30,
    verbose=False,
)
boost1 = CatBoostClassifier(**catboost_params)
boost1.fit(X2, y2)

In [None]:
# ROC curve of the fitted model on the training data it was fitted on.
y_train_pred_probs = boost1.predict_proba(X2)[:, 1]
fpr, tpr, _ = roc_curve(y2, y_train_pred_probs)
train_auc = roc_auc_score(y2, y_train_pred_probs)
# Both lines carry a label so that plt.legend() has entries to show.
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    label=f"CatBoost, training data (AUC = {train_auc:.3f})"
)
plt.plot([0, 1], [0, 1], color="navy", linestyle="--", label="chance")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="best")
plt.show()

In [None]:
# Confusion matrix of the predicted classes against the true training labels.
y_train_pred = boost1.predict(X2)
train_cm = confusion_matrix(y2, y_train_pred)
cm_display = ConfusionMatrixDisplay(train_cm).plot()

In [None]:
# Predicted class probabilities for the preprocessed test set.
y_pred = boost1.predict_proba(test_df)
# Keep column index 1 as the submission score (presumably P(target = 1) - confirm class order).
Y = y_pred[:, 1]

In [None]:
# Pair the ids from the sample submission with the predicted scores and write the CSV.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')
sub_id = sample_submission['id']
sub_data = sub_id.to_frame(name='id').assign(target=Y)
sub_data.to_csv('submission.csv', index=False)