In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency 

In [None]:
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

In [None]:
train.info()

In [None]:
# Split the training frame by dtype: non-object columns vs. object (string) columns.
train_num = train.select_dtypes(exclude = ['object'])
train_cat = train.select_dtypes('object')
# Keep the summary as the cell's last expression; placed mid-cell its result
# was computed and then silently discarded, so nothing was rendered.
train_num.describe()

# Univariate analysis

In [None]:
# One frequency bar chart per categorical column.
for col in train_cat.columns:
    fig, ax = plt.subplots(figsize=(20, 6))
    # Pass the column by keyword: seaborn deprecated positional data vectors
    # in 0.11 and later releases interpret the first positional as `data`.
    sns.countplot(x=train_cat[col], ax=ax)
    ax.set_title(f'Value counts: {col}')

In [None]:
train_num['target'].value_counts().plot.bar()

# The classes are imbalanced, so we will evaluate with F1 and ROC AUC scores

In [None]:
# One distribution plot (density histogram + KDE curve) per numeric column.
for col in train_num.columns:
    fig, ax = plt.subplots(figsize=(20, 6))
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # draws the equivalent normalised histogram with a KDE overlay.
    sns.histplot(train_num[col], kde=True, stat='density', ax=ax)
    ax.set_title(f'Distribution of {col}')

# Check for skewness

In [None]:
# Collect every numeric column whose absolute skewness exceeds 0.5.
skewed_col = [
    name
    for name in train_num.columns
    if abs(train_num[name].skew()) > 0.5
]

In [None]:
skewed_col

# - cont7, cont8, cont9, cont10 are skewed and we will need to take the log

# Detect Outliers

In [None]:
def outlier_detector(data, col_name):
    """Print ``col_name`` if any of its values fall outside the 5th-95th percentile band.

    Parameters
    ----------
    data : frame-like object indexable by column name.
    col_name : name of the column to inspect.

    Returns
    -------
    None. The only effect is printing the column name when at least one value
    is strictly below the 5th percentile or strictly above the 95th.
    """
    values = data[col_name]
    low_cut, high_cut = values.quantile(.05), values.quantile(.95)
    outside = values[(values < low_cut) | (values > high_cut)]
    if len(outside) > 0:
        print(col_name)

In [None]:
for i in train_num.columns:
    outlier_detector(train_num, i)

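# All columns have outliers (note: by construction, a 5th/95th-percentile rule flags about 10% of any continuous column)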

# Bivariate Analysis

In [None]:
# Reassign rather than drop in place so the cell can be re-run safely:
# errors='ignore' turns a second execution into a no-op instead of a KeyError.
train_num = train_num.drop(columns = ['target', 'id'], errors = 'ignore')

In [None]:
corr = train_num.corr()
plt.figure(figsize=(25,25))
sns.heatmap(corr, annot = True)
plt.show()

# 1. cont0 and cont10 are highly correlated (0.81)
# 2. cont1 and cont2 are highly correlated (0.86)

# Let's see if we have the same distribution for the target category or not 

In [None]:
for i in train_num.columns:
    print(train.groupby('target')[i].mean())
    print('===============')

In [None]:
# For each numeric feature: a box plot split by target (left) and the
# per-class density of the feature overlaid on one axis (right).
for col in train_num.columns:
    fig, (axs1, axs2) = plt.subplots(nrows = 1, ncols= 2, figsize=(15,6))
    sns.boxplot(x="target", y=col, data=train, ax = axs1)
    # sns.distplot is deprecated; histplot(kde=True, stat='density') is the
    # equivalent. Each call normalises its own class, as distplot did.
    sns.histplot(train.loc[train['target'] == 1, col], color = 'red', kde = True,
                 stat = 'density', label = 'target = 1', ax = axs2)
    sns.histplot(train.loc[train['target'] == 0, col], kde = True,
                 stat = 'density', label = 'target = 0', ax = axs2)
    # Without a legend the two overlaid distributions cannot be told apart.
    axs2.legend()

So we found that cont0 has the same mean for both target classes (a similar distribution), which suggests that this feature has little effect on the output.


Based on the box plots, all features have outliers except cont1, cont2, cont3, cont4 and cont6.

In [None]:
pd.crosstab(train['target'],train['cat0'])

In [None]:
pd.DataFrame(train[['cat0','target']].value_counts())

In [None]:
def dependency(data,feature):
    """Run a chi-square test of independence between ``feature`` and ``target``.

    Builds the contingency table of ``data['target']`` against
    ``data[feature]``, runs ``chi2_contingency`` on it, and prints the degrees
    of freedom, the p-value and the verdict at a 0.05 significance level.

    Parameters
    ----------
    data : DataFrame containing a ``'target'`` column and ``feature``.
    feature : name of the column to test against ``'target'``.

    Returns
    -------
    None. All results are printed.
    """
    table = pd.crosstab(data['target'],data[feature])
    # Only the p-value and degrees of freedom are reported; the statistic and
    # the expected-frequency table are not used.
    _, p, dof, _ = chi2_contingency(table)

    print(dof)
    significance_level = 0.05
    print("p value: " + str(p))

    if p <= significance_level:
        print( 'REJECT NULL HYPOTHESIS, THE VARIABLES ARE DEPENDENT')
    else:
        # The original string had an unfilled '{}' placeholder, so the output
        # showed a literal "{}" instead of the feature name.
        print('{}: ACCEPT NULL HYPOTHESIS, THE VARIABLES ARE INDEPENDENT'.format(feature))

In [None]:
for i in train_cat.columns:
    dependency(train,i)

<h1>Data Preparation</h1>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import resample, shuffle
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
from sklearn.utils import class_weight

In [None]:
# Split the training rows by class and shrink class 0 to the size of class 1.
X0 = train[train['target'] == 0]
X1 = train[train['target'] == 1]
print(f'number of majority classes before downsampling: {X0.shape[0]}')
# Sample WITHOUT replacement: with replace=True the "downsampled" set would
# contain duplicated rows while leaving other distinct majority rows unused.
# Requires len(X1) <= len(X0), i.e. class 0 really is the majority class.
X0_downsample = resample(X0,
                         replace=False,
                         n_samples=len(X1),
                         random_state=42)
# This count is taken after downsampling (the original label said "before").
print(f'number of majority classes after downsampling: {X0_downsample.shape[0]}')

In [None]:
# Combine the downsampled class-0 rows with all class-1 rows and shuffle them.
balanced_df = pd.concat([X0_downsample, X1])
# Seed the shuffle so the row order is reproducible across runs.
balanced_df = shuffle(balanced_df, random_state = 42)

# after down sampling
X = balanced_df.drop(columns = ['target', 'id', 'cat10'])
y = balanced_df['target']
# Reassign instead of dropping in place so re-running the cell does not raise
# a KeyError once the columns are already gone.
test = test.drop(columns = ['id', 'cat10'], errors = 'ignore')

# keep the same data to use XGBoost
X2 = train.drop(columns = ['target', 'id', 'cat10'])
y2 = train['target']

In [None]:
# create interaction of correlated variables and remove the correlated variables...

# Apply the same feature engineering to every frame: add the two product
# features, then drop the four source columns they were built from.
for frame in (X, X2, test):
    frame['cont_0_10'] = frame['cont0'] * frame['cont10']
    frame['cont_1_2'] = frame['cont1'] * frame['cont2']
    frame.drop(['cont0', 'cont10', 'cont1', 'cont2'], axis = 1, inplace = True)

In [None]:
numerical_cols = [col for col in X.columns if X[col].dtype != 'object']
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

In [None]:
skewed_features = [col for col in numerical_cols if abs(X[col].skew()) > 0.5]
skewed_features

In [None]:
from collections import Counter
def cumulatively_categorise(column,threshold=0.75,return_categories_list=True):
    """Keep the most frequent categories and collapse the rest into ``'Other'``.

    Categories are taken in descending order of frequency until their
    cumulative count reaches ``threshold * len(column)``; every value not in
    that set is replaced by the string ``'Other'``.

    Parameters
    ----------
    column : pandas Series of hashable category values.
    threshold : fraction of the rows the kept categories must cover.
    return_categories_list : when True, also return the list of kept
        categories (with ``'Other'`` appended as its last element).

    Returns
    -------
    ``(new_column, categories_list)`` if ``return_categories_list`` is True,
    otherwise just ``new_column``.
    """
    # Number of rows the kept categories must cover in total.
    threshold_value = int(threshold * len(column))
    categories_list = []
    running_total = 0

    # most_common() already yields (category, count) pairs sorted by count, so
    # use the count directly instead of rebuilding a dict on every iteration.
    for category, frequency in Counter(column).most_common():
        running_total += frequency
        categories_list.append(category)
        if running_total >= threshold_value:
            break
    categories_list.append('Other')

    # Replace every value outside the kept categories with 'Other'.
    new_column = column.apply(lambda x: x if x in categories_list else 'Other')

    if return_categories_list:
        return new_column, categories_list
    return new_column

In [None]:
good_label_cols=[i for i in categorical_cols if set(X[i])==set(test[i])]
bad_label_cols = list(set(categorical_cols)-set(good_label_cols))
print('good label cols \n', good_label_cols)
print('bad label cols \n', bad_label_cols)

In [None]:
# Integer-encode every categorical column with ONE shared label -> code mapping.
# Encoding each frame on its own assigns codes from that frame's category set,
# so when the sets differ the same label gets different integers in train and
# test.
for col in categorical_cols:
    shared_categories = pd.Categorical(
        pd.concat([X[col], X2[col], test[col]], ignore_index=True)
    ).categories
    X[col] = pd.Categorical(X[col], categories=shared_categories).codes
    X2[col] = pd.Categorical(X2[col], categories=shared_categories).codes
    test[col] = pd.Categorical(test[col], categories=shared_categories).codes

In [None]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` lies strictly inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` of the column;
    rows on or beyond either fence are excluded.

    Parameters
    ----------
    df_in : DataFrame to filter.
    col_name : column used to compute the fences.

    Returns
    -------
    The subset of ``df_in`` selected with ``.loc`` on the fence mask.
    """
    values = df_in[col_name]
    lower_quartile, upper_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    above_low_fence = values > lower_quartile - 1.5 * spread
    below_high_fence = values < upper_quartile + 1.5 * spread
    return df_in.loc[above_low_fence & below_high_fence]

In [None]:
# Column-wise preprocessing. ColumnTransformer runs each entry on its own
# column list independently and concatenates the outputs side by side (it is
# not a sequential pipeline); columns named in no entry are dropped by default.
# NOTE(review): skewed_features is derived from numerical_cols, so those columns
# are emitted twice: once as log1p (unscaled) and once standardized from the raw
# values. Confirm that "log, then scale" was not the intent.
# NOTE(review): only good_label_cols are one-hot encoded; the remaining
# categorical columns are left out of the output. Confirm this is deliberate.
preprocessor = ColumnTransformer ([
    ('log_trans', FunctionTransformer(func = np.log1p, validate = False), skewed_features),
    ('standardize', StandardScaler(), numerical_cols),
    #('polynomials', PolynomialFeatures(degree = 2, interaction_only=True), numerical_cols),
    #('cat_interactions', PolynomialFeatures(degree = 2, interaction_only=True), categorical_cols),
    ('labeling', OneHotEncoder(), good_label_cols)
#   ('pca', PCA(n_components = 15), good_label_cols)
])

In [None]:
X = preprocessor.fit_transform(X)
test_df = preprocessor.transform(test)

In [None]:
# Grid-search the regularisation type and strength of a logistic regression,
# scored by ROC AUC with 5-fold cross-validation.
# 'elasticnet' additionally needs an l1_ratio, so it gets its own sub-grid;
# 0.5 (an even L1/L2 mix) is a placeholder value to tune.
param_grid = [
    {'penalty' : ['l1', 'l2'],
     'C':[0.1,0.5,0.001,0.005]},
    {'penalty' : ['elasticnet'],
     'C':[0.1,0.5,0.001,0.005],
     'l1_ratio':[0.5]},
]
# The default lbfgs solver only supports the 'l2' penalty, so every 'l1' and
# 'elasticnet' candidate failed to fit. 'saga' supports all three penalties.
# It is stochastic and converges slowly, hence the seed and higher max_iter.
log_reg = LogisticRegression(solver = 'saga',
                             max_iter = 1000,
                             random_state = 42,
                             class_weight = {0:0.25, 1:0.75})
grid_cv = GridSearchCV(log_reg, param_grid, cv=5, scoring = 'roc_auc')
grid_cv.fit(X,y)
y_pred = grid_cv.predict_proba(test_df)

In [None]:
boost1 = CatBoostClassifier(task_type="GPU",
                           loss_function="Logloss",
                           eval_metric="AUC",
                           random_state = 42,
                           class_weights = [0.68015181, 1.8877185],
                           iterations = 1000,
                           learning_rate = 0.07681584317112851,
                           depth = 7,
                           l2_leaf_reg = 26,
                           verbose = False)

In [None]:
boost1.fit(X, y)
y_pred = boost1.predict_proba(test_df)

In [None]:
def solver(x):
    """Return ``x + 0.05`` when ``x`` is greater than 0.5, otherwise ``x - 0.05``."""
    return x + 0.05 if x > 0.5 else x - 0.05

In [None]:
# Define Y here: this cell and the later `jjj(Y)` cell both run before the cell
# that originally assigned it, so a fresh Restart & Run All raised a NameError.
# Y is the second column of the predict_proba output.
Y = y_pred[:, 1]
Y

In [None]:
def jjj(arr):
    """Shift each value of ``arr`` by 0.05 and return the results as a NumPy array.

    Values greater than 0.5 are increased by 0.05; all other values are
    decreased by 0.05. ``arr`` may be any iterable of numbers.
    """
    return np.array([val + 0.05 if val > 0.5 else val - 0.05 for val in arr])


In [None]:
ooo = jjj(Y)
ooo

In [None]:
# Second column of the predict_proba output.
Y = y_pred[:, 1]
# DataFrame.apply hands each whole column (a Series) to the callable, and
# solver's scalar `if x > 0.5` cannot evaluate a Series (ValueError: truth
# value is ambiguous). Map solver over the elements of each column instead.
Y_ones = pd.DataFrame(Y).apply(lambda col: col.map(solver))

In [None]:
sub_id = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')['id']
sub_data = pd.DataFrame({
    'id':sub_id,
    'target':ooo
})
sub_data.to_csv('submission.csv', index = False)