In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import BayesianRidge,LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, OrthogonalMatchingPursuit
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.neighbors import KNeighborsRegressor, KernelDensity, KDTree
from sklearn.metrics import *

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import sys, os
import random 

# Silence every warning for the rest of the session, unless the interpreter was
# started with explicit -W options (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython import display, utils


def set_seed(seed=4242):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 4242
        Value used for both generators; its string form is written to the
        ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


set_seed()

In [None]:
# Read the training split from the relative ../input directory and preview the first rows.
train  = pd.read_csv('../input/imbalanced-data-practice/aug_train.csv')
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
train.info()

In [None]:
# Summary statistics restricted to the object-dtype columns.
train.describe(include=['O'])

In [None]:
# Read the test split from the same relative ../input directory and preview the first rows.
test  = pd.read_csv('../input/imbalanced-data-practice/aug_test.csv')
test.head()

In [None]:
# Class balance of the `Response` column.
target = train.Response
sns.set()
plt.figure(figsize=(8, 4))
# Pass the vector by keyword: seaborn 0.11 deprecated positional data vectors
# and from 0.12 on the only positional argument is `data`, so a bare Series
# is no longer interpreted as `x`.
sns.countplot(x=target);

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numeric.
is_object = train.dtypes == 'object'

cats = train.columns[is_object].tolist()
print('Categories', cats)

nums = train.columns[~is_object].tolist()
print('Numerics', nums)

In [None]:
# Manually chosen feature names, grouped into numeric and categorical lists.
t_nums = [
    'Age',
    'Annual_Premium',
    'Vintage',
]
t_cats = [
    'Gender',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Policy_Sales_Channel',
]

### Categories

In [None]:


def analyse_cats(df, cat_cols, test_df=None):
    """Summarise categorical columns and plot their value counts.

    For every column in ``cat_cols`` a count plot of ``df`` is drawn. When
    ``test_df`` is supplied, its counts are drawn in a second panel next to it
    so the two distributions can be compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is summarised and plotted in the left ("train") panel.
    cat_cols : iterable of str
        Names of the columns to analyse.
    test_df : pandas.DataFrame, optional
        Frame plotted in the right ("test") panel. When omitted only ``df``
        is plotted.

    Returns
    -------
    pandas.DataFrame
        One row per analysed column with the columns ``feat`` (column name),
        ``uniques`` (array of distinct values), ``cardinality`` (number of
        distinct values, a missing value counts as one) and ``nans`` (number
        of missing values). All statistics are computed on ``df``.
    """
    records = []
    for col in cat_cols:
        uniques = df[col].unique()
        records.append({
            'feat': col,
            'uniques': uniques,
            'cardinality': len(uniques),
            'nans': df[col].isnull().sum(),
        })

        # Plot the frames that were passed in rather than notebook globals,
        # so the function works for any pair of frames.
        n_panels = 1 if test_df is None else 2
        _, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(20, 5), squeeze=False)
        sns.countplot(x=df[col], ax=axes[0, 0], palette='bone')
        axes[0, 0].set_title('train')
        if test_df is not None:
            sns.countplot(x=test_df[col], ax=axes[0, 1])
            axes[0, 1].set_title('test')

    # Explicit column list keeps the layout stable even when cat_cols is empty.
    return pd.DataFrame(records, columns=['feat', 'uniques', 'cardinality', 'nans'])


plt.style.use('ggplot')
catanadf = analyse_cats(train, t_cats, test_df=test)
catanadf

In [None]:
# Collect the object-dtype column names (these get label-encoded next) and
# the remaining column names.
cats = []
nums = []
for column_name, column_dtype in train.dtypes.items():
    bucket = cats if column_dtype == 'object' else nums
    bucket.append(column_name)

print('Categories', cats)

print('Numerics', nums)

In [None]:
# Integer-encode every object column. Each encoder is fitted on the string
# values of train and test together so both frames share one mapping.
for col in cats:
    train_as_str = train[col].astype(str)
    test_as_str = test[col].astype(str)

    encoder = LabelEncoder()
    encoder.fit(list(train_as_str) + list(test_as_str))

    train[col] = encoder.transform(list(train_as_str))
    test[col] = encoder.transform(list(test_as_str))
train.head()

In [None]:
# Remove the `id` column from both frames (in place).
for frame in (train, test):
    del frame['id']

# Detach the label so `train` no longer contains `Response`.
target = train.pop('Response')

train.shape, test.shape

### LGB

In [None]:

# LightGBM: 5-fold stratified cross-validation producing out-of-fold (OOF)
# predictions for train, fold-averaged predictions for test and per-fold
# feature importances.
lgb_params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'max_depth': -1,
    'num_leaves': 50,
    'min_child_samples': 100,
    'learning_rate': 0.02,
    'feature_fraction': 0.9,
    # Row subsampling is only performed when bagging_freq > 0. The previous
    # key 'bagging_frequency' is not a LightGBM parameter, so bagging_fraction
    # had no effect.
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'metric': 'auc',
    'unbalance': True,  # alias of is_unbalance
}

oof_lgb = np.zeros(len(train))
pred_lgb = np.zeros(len(test))

scores = []

feature_importances_gain = pd.DataFrame()
feature_importances_gain['feature'] = train.columns

feature_importances_split = pd.DataFrame()
feature_importances_split['feature'] = train.columns


folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target)):
    print("fold : ---------------------------------------", fold_)
    X_val, y_val = train.iloc[val_ind], target.iloc[val_ind]
    trn_data = lgb.Dataset(train.iloc[train_ind], label=target.iloc[train_ind])
    val_data = lgb.Dataset(X_val, label=y_val)

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
    # lgb.train() in LightGBM 4.0; when upgrading, pass
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)] instead.
    lgb_clf = lgb.train(lgb_params, trn_data, num_boost_round=2000, valid_sets=(trn_data, val_data), verbose_eval=100, early_stopping_rounds=100)
    oof_lgb[val_ind] = lgb_clf.predict(X_val, num_iteration=lgb_clf.best_iteration)

    # Score the fold once and reuse the value for logging and averaging.
    fold_auc = roc_auc_score(y_val, oof_lgb[val_ind])
    print("fold:", fold_, "roc_auc ==", fold_auc)
    scores.append(fold_auc)

    feature_importances_gain['fold_{}'.format(fold_ + 1)] = lgb_clf.feature_importance(importance_type='gain')
    feature_importances_split['fold_{}'.format(fold_ + 1)] = lgb_clf.feature_importance(importance_type='split')

    # Each fold contributes an equal share to the averaged test prediction.
    pred_lgb += lgb_clf.predict(test, num_iteration=lgb_clf.best_iteration) / folds.n_splits

# Same banner as before, spelled without the invalid `\ ` escape sequence
# (a SyntaxWarning on recent Python versions).
print(' ' + '\\' * 8 + ' model roc_auc ////////////// : ', np.mean(scores))

np.save('oof_lgb', oof_lgb)
np.save('pred_lgb', pred_lgb)

In [None]:
# Turn the LightGBM out-of-fold probabilities into hard labels at a 0.5 cut-off.
oof_lgb_01 = np.where(oof_lgb > 0.5, 1, 0)

# A notebook cell only echoes its last expression, so this matrix has to be
# printed explicitly; as a bare expression its value was silently discarded.
print(confusion_matrix(target, oof_lgb_01))

# Tiny hand-made example showing how the matrix is laid out.
y_true = pd.Series([1, 0, 1])
y_pred = pd.Series([0, 0, 1])
confusion_matrix(y_true, y_pred)

In [None]:
# Cross-tabulate y_true against y_pred (defined in the previous cell) with
# named axes; margins=True appends row/column totals.
pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Confusion-matrix heatmap for the thresholded LightGBM out-of-fold labels.
# Every cell is annotated with its name, absolute count and share of all samples.
cf_matrix = confusion_matrix(target, oof_lgb_01)
flat_counts = cf_matrix.flatten()
flat_shares = flat_counts / np.sum(cf_matrix)

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = ["{0:0.0f}".format(count) for count in flat_counts]
group_percentages = ["{0:.2%}".format(share) for share in flat_shares]

labels = []
for name, count, share in zip(group_names, group_counts, group_percentages):
    labels.append(f"{name}\n{count}\n{share}")
labels = np.asarray(labels).reshape(2, 2)

plt.figure(figsize=(8, 6))
sns.set(font_scale=1.4)
plt.style.use('seaborn-poster')
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='vlag')

In [None]:
# Average the per-fold gain importances, persist the table and plot the
# highest-ranked features.
fold_columns = ['fold_{}'.format(k) for k in range(1, folds.n_splits + 1)]
feature_importances_gain['average'] = feature_importances_gain[fold_columns].mean(axis=1)
feature_importances_gain.to_csv('feature_importances.csv')

top_by_gain = feature_importances_gain.sort_values(by='average', ascending=False).head(100)

plt.figure(figsize=(20, 10))
sns.barplot(data=top_by_gain, palette='bone', x='average', y='feature');
plt.title('TOP n feature importance over {} folds average'.format(folds.n_splits));

### XGBoost

In [None]:
# XGBoost: 5-fold stratified cross-validation producing out-of-fold (OOF)
# predictions for train, fold-averaged predictions for test and per-fold
# gain importances. `xgb` is already imported in the first cell.
xgb_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'max_depth': 6,
    'max_leaves': 16,
    'learning_rate': 0.01,
    'colsample_bytree': 0.7,  # feature fraction
    'subsample': 0.6,         # bagging fraction
    'lambda': 2,
}

xgb_scores = []

oof_xgb = np.zeros(len(train))
pred_xgb = np.zeros(len(test))

# Per-fold importance tables are collected here and concatenated once after
# the loop instead of re-copying a growing frame on every iteration.
importance_frames = []

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4242)

# The test matrix is identical for every fold, so build it a single time.
test_data = xgb.DMatrix(test)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target)):
    print('fold : ----------------------------------------', fold_)
    trn_data = xgb.DMatrix(data=train.iloc[train_ind], label=target.iloc[train_ind])
    val_data = xgb.DMatrix(data=train.iloc[val_ind], label=target.iloc[val_ind])

    xgb_model = xgb.train(xgb_params, trn_data, num_boost_round=1000, evals=[(trn_data, 'train'), (val_data, 'test')], verbose_eval=100, early_stopping_rounds=100)

    # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated since
    # XGBoost 1.4 and removed in 2.0; when upgrading use
    # iteration_range=(0, xgb_model.best_iteration + 1) instead.
    # Prediction ignores labels, so the existing validation matrix is reused.
    oof_xgb[val_ind] = xgb_model.predict(val_data, ntree_limit=xgb_model.best_ntree_limit)

    # Score the fold once and reuse the value for logging and averaging.
    fold_auc = roc_auc_score(target.iloc[val_ind], oof_xgb[val_ind])
    print(fold_auc)
    xgb_scores.append(fold_auc)

    importance_score = xgb_model.get_score(importance_type='gain')
    importance_frame = pd.DataFrame({'Importance': list(importance_score.values()), 'Feature': list(importance_score.keys())})
    importance_frame['fold'] = fold_ + 1
    importance_frames.append(importance_frame)

    # Each fold contributes an equal share to the averaged test prediction.
    pred_xgb += xgb_model.predict(test_data, ntree_limit=xgb_model.best_ntree_limit) / folds.n_splits

importances = pd.concat(importance_frames, axis=0, sort=False)

print('model auc:------------------>', np.mean(xgb_scores))

In [None]:
# Confusion-matrix heatmap for the XGBoost out-of-fold predictions,
# thresholded at 0.5. Every cell shows its name, count and overall share.
oof_xgb_01 = np.where(oof_xgb > 0.5, 1, 0)
cf_matrix = confusion_matrix(target, oof_xgb_01)

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = list(map("{0:0.0f}".format, cf_matrix.flatten()))
group_percentages = list(map("{0:.2%}".format, cf_matrix.flatten() / np.sum(cf_matrix)))

labels = np.asarray([
    f"{name}\n{count}\n{share}"
    for name, count, share in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)

plt.figure(figsize=(8, 6))
sns.set(font_scale=1.4)
plt.style.use('seaborn-poster')
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Greens')

In [None]:
# Mean gain importance per feature across the folds, top 30 plotted.
mean_gain = (
    importances[['Importance', 'Feature']]
    .groupby('Feature')
    .mean()
    .reset_index()
)
top_mean_gain = mean_gain.sort_values('Importance', ascending=False).head(30)

plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=top_mean_gain, palette='bone')

### *The next kernels in this series will cover techniques for handling imbalanced data. Stay tuned...*