# Mounts drive

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# import os
# path_folder = '/content/drive/MyDrive/Pre-Practice'
# os.chdir(path_folder)
# print('current_directory:', os.getcwd())
# !ls

# Imports

In [3]:
# from IPython.display import clear_output
# !pip install -U lazypredict
# !pip install -U pandas # Upgrading pandas

# clear_output()

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots


from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split


from lightgbm import LGBMClassifier
# import lazypredict
# from lazypredict.Supervised import LazyClassifier


import time
import warnings
warnings.filterwarnings('ignore')

# Data Loading and Preparation

In [5]:
# Read the three competition CSVs from the working directory in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission_sample.csv')
)

## Column Descriptions:
* `id` - client id
* `age` - (numeric)
* `job` - type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
* `marital` - marital status (categorical: 'divorced','married','single'; note: 'divorced' means divorced or widowed)
* `education` - (categorical: 'primary','secondary','tertiary','unknown')
* `default` - has credit in default? (categorical: 'no','yes')
* `balance` - (numeric)
* `housing` - has housing loan? (categorical: 'no','yes')
* `loan` - has personal loan? (categorical: 'no','yes')
* `contact` - contact communication type (categorical: 'cellular','telephone','unknown')
* `month` - last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
* `duration` - last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
* `campaign` - number of contacts performed during this campaign and for this client (numeric, includes last contact)
* `pdays` - number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
* `previous` - number of contacts performed before this campaign and for this client (numeric)
* `poutcome` - outcome of the previous marketing campaign (categorical: 'failure','other','success','unknown')
* `y` - target: whether the client subscribed to a term deposit (binary: 'yes','no')

## Exploring Train Data


### Quick view of Train Data

In [6]:
train.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,month,duration,campaign,pdays,previous,poutcome,y
0,1,30,unemployed,married,primary,no,1787,no,no,cellular,oct,79,1,-1,0,unknown,no
1,2,33,services,married,secondary,no,4789,yes,yes,cellular,may,220,1,339,4,failure,no
2,3,35,management,single,tertiary,no,1350,yes,no,cellular,apr,185,1,330,1,failure,no
3,4,30,management,married,tertiary,no,1476,yes,yes,unknown,jun,199,4,-1,0,unknown,no
4,5,59,blue-collar,married,secondary,no,0,yes,no,unknown,may,226,1,-1,0,unknown,no


In [7]:
# Size and completeness of the training frame, emitted with a single print call.
# The leading ANSI escape switches the output colour to blue.
print(
    '\033[94m',
    f'Number of rows in train data: {train.shape[0]}',
    f'Number of columns in train data: {train.shape[1]}',
    f'Number of values in train data: {train.count().sum()}',
    f'Number of missing values in train data: {train.isna().sum().sum()}',
    sep='\n',
)

[94m
Number of rows in train data: 3999
Number of columns in train data: 17
Number of values in train data: 67983
Number of missing values in train data: 0


### Column Wise missing values

In [8]:
# Missing-value count for every training column (blue ANSI text).
print('\033[94m', train.isna().sum(), sep='\n')

[94m
id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


### Basic statistics of training data

In [9]:
train.describe()

Unnamed: 0,id,age,balance,duration,campaign,pdays,previous
count,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0
mean,2000.0,41.204551,1416.183796,261.906477,2.780695,39.365341,0.540385
std,1154.556192,10.580862,3017.975038,258.845954,3.083492,99.533237,1.651415
min,1.0,19.0,-2082.0,4.0,1.0,-1.0,0.0
25%,1000.5,33.0,68.0,104.0,1.0,-1.0,0.0
50%,2000.0,39.0,445.0,184.0,2.0,-1.0,0.0
75%,2999.5,49.0,1470.0,325.5,3.0,-1.0,0.0
max,3999.0,87.0,71188.0,3025.0,50.0,871.0,24.0


In [10]:
train.describe(include='O')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,3999,3999,3999,3999,3999,3999,3999,3999,3999,3999
unique,12,3,4,2,2,2,3,12,4,2
top,management,married,secondary,no,yes,no,cellular,may,unknown,no
freq,871,2490,2028,3930,2262,3385,2568,1241,3281,3541


## Exploring Test Data

### Quick view of Test Data

In [11]:
test.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,month,duration,campaign,pdays,previous,poutcome,y
0,4000,44,services,married,secondary,no,27,no,no,cellular,may,485,1,-1,0,unknown,no
1,4001,53,admin.,divorced,secondary,no,26,yes,no,cellular,may,56,1,359,1,failure,no
2,4002,36,technician,married,secondary,no,191,no,no,cellular,aug,69,1,-1,0,unknown,no
3,4003,58,technician,divorced,secondary,no,-123,no,no,cellular,aug,131,2,-1,0,unknown,no
4,4004,26,student,single,secondary,no,-147,no,no,unknown,jun,95,2,-1,0,unknown,no


In [12]:
# Size and completeness of the test frame, emitted with a single print call.
# The leading ANSI escape switches the output colour to blue.
print(
    '\033[94m',
    f'Number of rows in test data: {test.shape[0]}',
    f'Number of columns in test data: {test.shape[1]}',
    f'Number of values in test data: {test.count().sum()}',
    f'Number of missing values in test data: {test.isna().sum().sum()}',
    sep='\n',
)

[94m
Number of rows in test data: 522
Number of columns in test data: 17
Number of values in test data: 8874
Number of missing values in test data: 0


### Column Wise missing values

In [13]:
# Missing-value count for every test column (blue ANSI text).
print('\033[94m', test.isna().sum(), sep='\n')

[94m
id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


### Basic statistics of test data

In [14]:
test.describe()

Unnamed: 0,id,age,balance,duration,campaign,pdays,previous
count,522.0,522.0,522.0,522.0,522.0,522.0,522.0
mean,4260.5,40.90613,1472.254789,279.703065,2.89272,42.840996,0.559387
std,150.832689,10.546888,2947.355648,267.202666,3.305899,104.563234,1.988768
min,4000.0,21.0,-3313.0,7.0,1.0,-1.0,0.0
25%,4130.25,32.25,75.75,108.5,1.0,-1.0,0.0
50%,4260.5,39.0,439.5,200.0,2.0,-1.0,0.0
75%,4390.75,48.0,1535.25,355.25,3.0,-1.0,0.0
max,4521.0,84.0,26452.0,2456.0,44.0,804.0,25.0


In [15]:
test.describe(include='O')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,522,522,522,522,522,522,522,522,522,522
unique,12,3,4,2,2,2,3,12,4,2
top,blue-collar,married,secondary,no,yes,no,cellular,may,unknown,no
freq,117,307,278,515,297,445,328,157,424,459


## Submission File

### Quick view of Submission File

In [16]:
sample_submission.head()

Unnamed: 0,id,y
0,4000,0.843403
1,4001,0.821234
2,4002,0.100092
3,4003,0.156351
4,4004,0.519778


# EDA

## Overview of Data

In [17]:
# Notebook-wide configuration used by the cells below.
TARGET = 'y'
RANDOM_STATE = 7

# `id` is dropped so it is not used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError on the missing column.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

# Every remaining column except the target is a model input.
FEATURES = [col for col in train.columns if col != TARGET]

In [18]:
# Summary statistics for every column but the last one, one row per column,
# ordered by decreasing standard deviation and rendered with a colour gradient
# plus in-cell bars on `max` and `mean`.
# Note: DataFrame.style needs the optional Jinja2 package to be installed.
(
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='GnBu')
    .bar(subset=['max'], color='#BB0000')
    .bar(subset=['mean'], color='green')
)

ImportError: Missing optional dependency 'Jinja2'. DataFrame.style requires jinja2. Use pip or conda to install Jinja2.

## Continuous and Categorical Data Distribution

In [None]:
# Classify each feature by its number of distinct values across train + test.
# Fewer than 20 distinct values -> categorical; 20 or more -> continuous.
# The two conditions are complementary, so a feature with exactly 20 distinct
# values is no longer left out of both lists.
n_unique = pd.concat([train[FEATURES], test[FEATURES]], axis=0).nunique()
cat_features = [col for col in FEATURES if n_unique[col] < 20]
cont_features = [col for col in FEATURES if n_unique[col] >= 20]

print('\033[94m')
print(f'Number of features: {len(FEATURES)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuos features: {len(cont_features)}')

# Pie chart of the categorical / continuous share; the first slice is pulled out slightly.
labels = ['Categorical', 'Continuos']
values = [len(cat_features), len(cont_features)]
colors = ['#DE3163', '#58D68D']

fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values, pull=[0.05, 0],
    marker=dict(colors=colors,
                line=dict(color='#000000',
                          width=2))
)])
fig.show()

## Feature Distribution of Continuous Features

In [None]:
# Tag every row with its origin and stack both frames ONCE; this does not depend
# on the feature being plotted, so it is hoisted out of the loop.
# `assign` returns new frames, leaving `train` and `test` untouched.
contDf = pd.concat([train.assign(type="Train"), test.assign(type="Test")])

# One overlaid train/test histogram (with a marginal box plot) per continuous feature.
for i, cont in enumerate(cont_features):
    fig = px.histogram(data_frame=contDf,
                       x=cont,
                       color="type",
                       color_discrete_sequence=['#58D68D', '#DE3163'],
                       marginal="box",
                       nbins=100,
                       template="plotly_white")
    fig.update_layout(title=f"Distribution of {cont} ({i+1})", title_x=0.5)
    fig.show()

## Feature Distribution of Categorical Features

In [None]:
cat_features

In [None]:
# Grid of count plots, one panel per categorical feature, train and test overlaid.
ncols = 3
# Derive the number of rows from the feature list instead of assuming exactly nine
# features: a fixed 3x3 grid raises IndexError with fewer and silently drops extras.
nrows = max(1, int(np.ceil(len(cat_features) / ncols)))

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(32, 8 * nrows), squeeze=False)
panels = axes.ravel()

for ax, col in zip(panels, cat_features):
    # Use one explicit category order for both overlays so that a bar at a given
    # x position always refers to the same category in train and in test.
    order = sorted(set(train[col]) | set(test[col]))

    sns.countplot(data=train, x=col, order=order, ax=ax, palette="viridis", label='Train data')
    sns.countplot(data=test, x=col, order=order, ax=ax, palette="magma", label='Test data')
    ax.legend()
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=20)
    ax.tick_params(labelsize=10, width=0.5)
    # Rotate via tick_params rather than re-setting the tick labels by hand.
    ax.tick_params(axis='x', labelrotation=45)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)

# Hide any panels left over when the feature count is not a multiple of ncols.
for ax in panels[len(cat_features):]:
    ax.set_visible(False)

plt.show()

## Target Distribution

In [None]:
# Class counts of the target, most frequent class first (value_counts order).
target_df = train[TARGET].value_counts().reset_index()
target_df.columns = [TARGET, 'count']

fig = px.bar(data_frame=target_df,
             x=TARGET,
             y='count')
fig.update_traces(marker_color=['#58D68D', '#DE3163'],
                  marker_line_color='rgb(0,0,0)',
                  marker_line_width=2)
fig.update_layout(title="Target Distribution",
                  template="plotly_white",
                  title_x=0.5)

# Print each class share using the label actually stored in that row, instead of
# assuming row 0 is "no" and row 1 is "yes" (the rows are ordered by frequency).
for label, count in zip(target_df[TARGET], target_df['count']):
    print("\033[94mPercentage of", TARGET, "= {}: {:.2f} %".format(label, count * 100 / train.shape[0]))
fig.show()

## Correlation matrix

In [None]:
# Pairwise correlation heat map of the numeric columns.
# At this point the categorical columns still hold strings; restricting to numeric
# dtypes explicitly avoids the ValueError that DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.
numeric_corr = train.select_dtypes(include='number').corr()
fig = px.imshow(numeric_corr, text_auto=True, aspect="auto", color_continuous_scale="viridis")
fig.show()

# Data Pre-Processing

In [None]:
train.head()

In [None]:
label_cols = ["job", "marital", "education", "default", "housing", "contact", "month", "poutcome", "loan", "y"]


def label_encoder(train, test, columns):
    """Integer-encode the given columns consistently across train and test.

    Each column is cast to ``str`` and its distinct values, taken from BOTH
    frames, are sorted and numbered 0..k-1. Using one shared mapping guarantees
    that a category receives the same code in train and in test even when it is
    absent from one of them (independent encoders would shift the codes).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column listed in ``columns``.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``train`` and ``test``; the inputs are not modified.
    """
    # Work on copies so the caller's frames are left untouched.
    train, test = train.copy(), test.copy()
    for col in columns:
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        # Sorted union of categories -> stable, shared code assignment.
        categories = sorted(set(train_values) | set(test_values))
        codes = {category: code for code, category in enumerate(categories)}
        train[col] = train_values.map(codes)
        test[col] = test_values.map(codes)
    return train, test

train ,test = label_encoder(train,test ,label_cols)

In [None]:
# Hold-out split used for the quick model comparison below.
X = train.drop(columns=TARGET)
y = train[TARGET]

# Stratify on the target: the classes are imbalanced, so an unstratified split
# can leave noticeably different class ratios in the two parts. This also
# matches the StratifiedKFold used for cross-validation later on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.33,
    stratify=y,
)

# Modeling

In [None]:
# The notebook's top import cells have these two imports commented out, so they
# are imported here; without them this cell fails with NameError on a fresh kernel.
from IPython.display import clear_output
from lazypredict.Supervised import LazyClassifier

# Fit every classifier lazypredict knows about on the hold-out split and collect
# a table of their scores.
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=False,
                     random_state=RANDOM_STATE,
                     classifiers='all')

models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# Discard the progress output produced during fitting.
clear_output()

In [None]:
models[:15]

In [None]:
# Line chart of the "ROC AUC" column of the `models` results table, with a marker per row.
# No x column is given — presumably plotly uses the frame's index (the model names); confirm.
# NOTE(review): the axis/title strings below contain literal tab characters around
# "ROC AUC" — looks like a copy-paste artifact; confirm whether they are intended.
line = px.line(data_frame= models ,y =["ROC AUC"] , markers = True)
line.update_xaxes(title="Model",
              rangeslider_visible = False)
line.update_yaxes(title = "	ROC AUC	")
line.update_traces(line_color="red")
# Centre the title near the top of the figure.
line.update_layout(showlegend = True,
    title = {
        'text': '	ROC AUC	 vs Model',
        'y':0.94,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

line.show()

In [None]:
# Stratified K-fold cross-validation of a LightGBM classifier, scored with ROC AUC.
lgb_params = {
    'objective': 'binary',
    'n_estimators': 50,
    'learning_rate': 0.08,
    'random_state': RANDOM_STATE,  # fixed seed for reproducible fits
}
N_SPLITS = 5

lgb_predictions = 0   # running average of the per-fold test probabilities
lgb_scores = []       # validation ROC AUC of each fold
lgb_fimp = []         # per-fold feature-importance frames
# Select features by name rather than assuming the target is the last column.
LGBM_FEATURES = [col for col in train.columns if col != TARGET]
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train[LGBM_FEATURES], train[TARGET])):
    print('\033[94m')
    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()

    X_train, X_valid = train.iloc[train_idx][LGBM_FEATURES], train.iloc[valid_idx][LGBM_FEATURES]
    y_train, y_valid = train[TARGET].iloc[train_idx], train[TARGET].iloc[valid_idx]

    model = LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train)

    # ROC AUC is a ranking metric: it must be fed scores (here the probability of
    # the positive class, column 1), not the hard 0/1 labels that predict() returns.
    preds_valid = model.predict_proba(X_valid)[:, 1]
    fold_auc = roc_auc_score(y_valid, preds_valid)
    lgb_scores.append(fold_auc)
    run_time = time.time() - start_time

    fim = pd.DataFrame(index=LGBM_FEATURES,
                       data=model.feature_importances_,
                       columns=[f'{fold}_importance'])
    lgb_fimp.append(fim)

    # The value is a ROC AUC in [0, 1], so it is reported as such (not as an accuracy percentage).
    print(f"Fold={fold+1}, ROC AUC: {fold_auc:.5f}, Run Time: {run_time:.2f}s")

    # Average positive-class probabilities over the folds; the divisor follows N_SPLITS.
    test_preds = model.predict_proba(test[LGBM_FEATURES])[:, 1]
    lgb_predictions += test_preds / N_SPLITS
print("")
print("Mean ROC AUC :", np.mean(lgb_scores))

In [None]:
X_train

# Submission

## Autogluon

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Hold-out split of the labelled data for AutoGluon (test_df is NOT the competition test set).
# Uses the shared RANDOM_STATE constant instead of repeating the literal seed, and
# stratifies on the target so both parts keep the same class ratio.
train_df, test_df = train_test_split(
    train,
    random_state=RANDOM_STATE,
    test_size=0.33,
    stratify=train[TARGET],
)

In [None]:
# Train an AutoGluon tabular predictor on the hold-out training part,
# optimising ROC AUC under a fixed time limit with the 'best_quality' preset.
metric = 'roc_auc'
predictor = TabularPredictor(
    label='y',
    eval_metric=metric,
).fit(
    train_df,
    time_limit=500,
    presets='best_quality',
)

In [None]:
# Feature-only view of the hold-out split: the label column is removed before predicting.
test_data_nolabel = test_df.drop(['y'], axis=1)

In [None]:
# Predicted class probabilities for the hold-out rows, then scored against the true labels.
# NOTE(review): y_pred presumably has one column per class — confirm that
# evaluate_predictions accepts that form for the roc_auc metric.
y_pred = predictor.predict_proba(test_data_nolabel)
perf = predictor.evaluate_predictions(y_true=test_df['y'], y_pred=y_pred, auxiliary_metrics=True)

In [None]:
# Integer-encode the hold-out labels (used as y_true for the ROC curve below).
# LabelEncoder numbers the distinct values in sorted order.
test_y = LabelEncoder().fit_transform(test_df['y'])
test_y

In [None]:
# ROC curve on the hold-out split (test_df), i.e. rows the predictor was not trained on.
# roc_curve, auc and plt are already imported in the notebook's import cell.

# Pick the positive-class probability column through the predictor rather than the
# literal 'yes': the label column was integer-encoded before the split, so the
# probability columns are named after the encoded classes and 'yes' is not among them.
positive_proba = y_pred[predictor.positive_class]
fpr, tpr, thresholds = roc_curve(y_true=test_y, y_score=positive_proba)

plt.plot(fpr, tpr, label='roc curve (area = %0.3f)' % auc(fpr, tpr))
plt.legend()
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()

In [None]:
y_pred

In [None]:
predictor.leaderboard(test_df, silent=True)