In [1]:
import numpy as np   # import numpy
import pandas as pd  # import pandas
import os
import gc   # for gabage collection
import seaborn as sns  # data visualization lib
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import operator
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score,precision_recall_fscore_support,classification_report,confusion_matrix
import glob
import lightgbm as lgb # load lightGBM model
import pickle
%matplotlib inline

In [1]:
# Directory holding the competition files. Defaults to the Kaggle input mount;
# set the HOMESITE_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get('HOMESITE_DATA_DIR', '/kaggle/input/homesite-quote-conversion')
# Path of the zipped training CSV inside DATA_PATH
file_name = os.path.join(DATA_PATH, 'train.csv.zip')
file_name

In [1]:
# Load the training data; with its default settings pandas infers the zip
# compression from the '.zip' extension of file_name.
df= pd.read_csv(file_name)
# Show (n_rows, n_columns) as the cell output
df.shape

In [1]:
# Keep a reference to the target column. The column is NOT removed from df here,
# so later cells that build feature lists must exclude it explicitly.
y = df['QuoteConversion_Flag']
y

In [1]:
# Preview the first rows of the freshly loaded frame (head() defaults to 5 rows)
df.head()

In [1]:
# Find highly correlated feature pairs and report one column of each pair for removal.
def highly_corr_col(x, frame=None, threshold=0.9):
    """Return the columns in ``x`` that are highly correlated with an earlier column.

    Parameters
    ----------
    x : list of str
        Column names to compare with each other.
    frame : pandas.DataFrame, optional
        Frame that holds the columns. Defaults to the notebook-level ``df``.
    threshold : float, default 0.9
        A column is reported when its absolute correlation with any column
        that precedes it in ``x`` is strictly greater than this value.

    Returns
    -------
    list of str
        Names of the redundant columns, in their original order. Only numeric
        and boolean columns take part in the correlation; other dtypes are
        ignored and therefore never reported.
    """
    if frame is None:
        frame = df  # notebook-level training frame
    # Restrict to numeric/bool data explicitly: recent pandas versions raise on
    # object columns instead of silently skipping them.
    numeric = frame[x].select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()
    # Strict upper triangle only, so each pair is inspected once and a column is
    # never compared with itself. The builtin bool is used because the np.bool
    # alias no longer exists in current NumPy releases.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    return [column for column in upper.columns if (upper[column] > threshold).any()]

In [1]:
# Group the columns by family prefix so each family can be screened for
# redundant (highly correlated) columns on its own.
def _cols_with_prefix(prefix):
    """Return the names of df's columns that start with ``prefix``."""
    return [col for col in df.columns if col.startswith(prefix)]


Field_col = _cols_with_prefix('Field')
CoverageField_col = _cols_with_prefix('CoverageField')
SalesField_col = _cols_with_prefix('SalesField')
personalField_col = _cols_with_prefix('PersonalField')
PropertyField_col = _cols_with_prefix('PropertyField')
GeographicField_col = _cols_with_prefix('GeographicField')

In [1]:
# Screen each column family independently and remove, in place, the columns
# that highly_corr_col reports as redundant within that family.
for column_group in (
    Field_col,
    CoverageField_col,
    SalesField_col,
    personalField_col,
    PropertyField_col,
    GeographicField_col,
):
    redundant = highly_corr_col(column_group)
    df.drop(redundant, axis=1, inplace=True)

In [1]:
# Preview df again now that the highly correlated columns have been dropped
df.head()

In [1]:
# Parse the quote date once; it is only needed to derive the calendar features
# below, so it is kept in a local Series rather than as a temporary df column.
quote_date = pd.to_datetime(df['Original_Quote_Date'])

# The raw date string is not used as a feature
df = df.drop('Original_Quote_Date', axis=1)

# Vectorised .dt accessors instead of a Python lambda evaluated per row
df['Year'] = quote_date.dt.year
df['Month'] = quote_date.dt.month
df['weekday'] = quote_date.dt.weekday  # Monday=0 ... Sunday=6
df['Quarter'] = quote_date.dt.quarter

In [1]:
# Preview df with the derived Year / Month / weekday / Quarter columns appended
df.head()

In [1]:
# Number of rows per calendar month (value_counts sorts by count, descending)
df['Month'].value_counts()

In [1]:
# Row counts per month. histplot replaces the deprecated distplot(kde=False);
# discrete=True draws one bar per integer value. seaborn is already imported
# as sns in the notebook's import cell.
sns.histplot(df['Month'], discrete=True);

In [1]:
# Row counts per year. histplot replaces the deprecated distplot(kde=False);
# discrete=True draws one bar per integer value. seaborn is already imported
# as sns in the notebook's import cell.
sns.histplot(df["Year"], discrete=True);

In [1]:
# Row counts per weekday (0=Monday). histplot replaces the deprecated
# distplot(kde=False); discrete=True draws one bar per integer value.
# seaborn is already imported as sns in the notebook's import cell.
sns.histplot(df["weekday"], discrete=True);

In [1]:
# Row counts per quarter. histplot replaces the deprecated distplot(kde=False);
# discrete=True draws one bar per integer value. seaborn is already imported
# as sns in the notebook's import cell.
sns.histplot(df["Quarter"], discrete=True);

In [1]:
# Summarise missing values per feature: absolute count and share of all rows,
# ordered from the most to the least affected feature.
missing_counts = df.isnull().sum()
nan_info = pd.DataFrame({
    'feature_name': missing_counts.index,
    'nan_cnt': missing_counts.values,
})
nan_info = nan_info.sort_values(by='nan_cnt', ascending=False)
nan_info['nan_percentage'] = nan_info['nan_cnt'] / len(df)
nan_info

In [1]:
# The ten features with the most missing values (nan_info is sorted by nan_cnt, descending)
nan_info.head(10)

In [1]:
# Candidate model inputs: every column except the target ('QuoteConversion_Flag')
# and 'QuoteNumber'. Adjust the excluded names to your own needs.
features = [
    name
    for name in df.columns.values
    if name not in ('QuoteConversion_Flag', 'QuoteNumber')
]
print(features)

In [1]:
# Names of the features that contain at least one missing value
has_missing = nan_info['nan_cnt'] > 0
cols_with_missing = nan_info.loc[has_missing, 'feature_name'].values
print(cols_with_missing)

In [1]:
# Report dtype and share of missing rows for every affected feature
for col in cols_with_missing:
    share = nan_info.loc[nan_info['feature_name'] == col, 'nan_percentage'].values[0]
    print(col, ':', df[col].dtype, ' nan percentage:', share)

In [1]:
def enc(x):
    """Label-encode the values of ``x`` with a freshly fitted LabelEncoder.

    ``x`` must expose a ``.values`` attribute (for example a pandas Series).
    The encoder is fitted on those values and immediately applied to them;
    the resulting array of integer codes is returned. ``x`` itself is not
    modified and the fitted encoder is discarded.
    """
    values = list(x.values)
    encoder = preprocessing.LabelEncoder().fit(values)
    return encoder.transform(values)

In [1]:
# Replace missing values with an explicit sentinel: 'unknown' for string
# (object) columns, -1 for everything else.
for ft in cols_with_missing:
    fill_value = 'unknown' if df[ft].dtype == 'object' else -1
    # Assign the filled column back to df. Calling fillna(inplace=True) on the
    # df[ft] selection is chained assignment: it triggers a warning on recent
    # pandas and does not update df at all once copy-on-write is active.
    df[ft] = df[ft].fillna(fill_value)

    print(enc(df[ft]))

In [1]:
#Convert all strings to equivalent numeric representations:
for f in df.columns:
    if df[f].dtype == 'object':
        # Store the integer codes back on df; only printing enc(...) would
        # leave the string values in place.
        df[f] = enc(df[f])

In [1]:
# Treat every feature with fewer than `threshold` distinct values as categorical
threshold = 70
category_features = [col for col in features if df[col].nunique() < threshold]

# Cast those columns to the pandas 'category' dtype and record them in f_cat,
# which ends up holding the same names as category_features, in the same order.
f_cat = []
for each in category_features:
    df[each] = df[each].astype('category')
    print(enc(df[each]))
    f_cat.append(each)

In [1]:
# One-hot encode the categorical columns (the first level of each is dropped)
# and keep the result sparse. The sparse accessor converts the dummy frame
# straight to a SciPy matrix; passing the DataFrame to csr_matrix() would go
# through a dense array first and defeat the purpose of sparse=True.
X = pd.get_dummies(df[f_cat], drop_first=True, prefix=f_cat, sparse=True).sparse.to_coo().tocsr()
X

In [1]:
# Total number of missing cells left in df (sum over columns, then over the
# per-column counts); expected to be 0 once the missing values have been filled
df.isnull().sum().sum()

Splitting the data into training and test sets

In [1]:
from sklearn.ensemble import RandomForestClassifier

# The fold loops below index rows of a dense array, so densify the one-hot
# matrix. The guard keeps the cell re-runnable: after the first run X is
# already a NumPy array, which has no toarray() method.
if hasattr(X, 'toarray'):
    X = X.toarray()
y = df['QuoteConversion_Flag'].values
# Features and target must stay row-aligned
assert X.shape[0] == len(y)
# split 20% data as test data
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=2019, test_size=0.2)
X_train.shape, X_test.shape, len(y_train), len(y_test)

In [1]:
# 5-fold cross-validation of a random forest on the training split.
clf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=18, max_leaf_nodes=64, verbose=1,
                             n_jobs=4)
scores_rfc = []
# Stratified folds keep the class ratio of the target the same in every fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
    print('...... training {}th fold \n'.format(i + 1))
    tr_x = X_train[train_idx]
    tr_y = y_train[train_idx]

    val_x = X_train[valid_idx]
    val_y = y_train[valid_idx]
    # Re-fitting the same estimator is fine here: fit() rebuilds the forest
    # from scratch and the fitted model is not kept between folds.
    model = clf
    model.fit(tr_x, tr_y)
    # Score with the probability of the second class (clf.classes_[1]).
    # ROC AUC and the threshold search below need continuous scores; the hard
    # 0/1 labels from predict() would make both degenerate.
    pred_val_y = model.predict_proba(val_x)[:, 1]
    score_rfc = roc_auc_score(val_y, pred_val_y)
    scores_rfc.append(score_rfc)
    print('current performance by auc:{}'.format(score_rfc))

# Pick the probability cut-off in [0.00, 0.49] that maximises F1.
# NOTE: val_y / pred_val_y still refer to the LAST fold only.
best_f1 = -np.inf
best_thred = 0
v = [i * 0.01 for i in range(50)]
for thred in v:
    preds = (pred_val_y > thred).astype(int)
    f1 = f1_score(val_y, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thred = thred
y_pred_rfc = (pred_val_y > best_thred).astype(int)
print(confusion_matrix(val_y, y_pred_rfc))
print(f1_score(val_y, y_pred_rfc))
print('the average mean auc is:{}'.format(np.mean(scores_rfc)))

In [1]:
from sklearn.metrics import roc_curve

# 5-fold cross-validation of LightGBM on the training split, followed by an
# averaged prediction of the five fold models on the hold-out test split.
lgb_params = dict(n_jobs=4, n_estimators=10000, boost_from_average='false', learning_rate=0.01,
                  num_leaves=64, num_threads=4, max_depth=-1, tree_learner="serial",
                  feature_fraction=0.7, bagging_freq=5, bagging_fraction=0.7, min_data_in_leaf=100,
                  silent=-1, verbose=-1, max_bin=255, bagging_seed=11)
model_lgb = lgb.LGBMClassifier(**lgb_params)

# Early stopping and periodic logging are passed as callbacks because newer
# LightGBM releases no longer accept them as fit() keyword arguments. Older
# releases expose the logging callback under the name print_evaluation.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

auc_scores = []
models = []
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
    print('...... training {}th fold \n'.format(i + 1))
    tr_x = X_train[train_idx]
    tr_y = y_train[train_idx]

    va_x = X_train[valid_idx]
    va_y = y_train[valid_idx]
    # A fresh estimator per fold: re-fitting one shared object would make every
    # entry of `models` point at the model of the last fold.
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(tr_x, tr_y, eval_set=[(tr_x, tr_y), (va_x, va_y)], eval_metric='auc',
              callbacks=[lgb.early_stopping(300), log_evaluation(500)])
    # calculate current auc after training the model
    pred_va_y = model.predict_proba(va_x, num_iteration=model.best_iteration_)[:, 1]
    auc = roc_auc_score(va_y, pred_va_y)
    print('current best auc score is:{}'.format(auc))
    auc_scores.append(auc)
    models.append(model)

# Pick the probability cut-off in [0.00, 0.49] that maximises F1.
# NOTE: va_y / pred_va_y still refer to the LAST fold only.
best_f1 = -np.inf
best_thred = 0
v = [i * 0.01 for i in range(50)]
for thred in v:
    preds = (pred_va_y > thred).astype(int)
    f1 = f1_score(va_y, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thred = thred
y_pred_lgb = (pred_va_y > best_thred).astype(int)
print(confusion_matrix(va_y, y_pred_lgb))
print(f1_score(va_y, y_pred_lgb))
print('the average mean auc is:{}'.format(np.mean(auc_scores)))
fpr, tpr, _ = roc_curve(va_y, pred_va_y)
# plot model roc curve
plt.plot(fpr, tpr, marker='.', label='LGB model')
# axis labels
plt.title('ROC AUC CURVE')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# save before show(): show() may clear the current figure
plt.savefig('LGB ROC_auc_curve.png')
plt.show()
# Test data: average the positive-class probabilities of all fold models
fold_test_preds = [
    fold_model.predict_proba(X_test, num_iteration=fold_model.best_iteration_)[:, 1]
    for fold_model in models
]
pred_test = np.mean(fold_test_preds, axis=0)
print(pred_test)
# Score the averaged prediction against the hold-out labels
print('hold-out test auc is:{}'.format(roc_auc_score(y_test, pred_test)))

In [1]:
# Logging for Visual Comparison
#log_cols=["Classifier", "AUC Score", "f1-Score"]
#log = pd.DataFrame(columns=log_cols)

#for clf in classifiers:
 #   clf.fit(X_train, y_train)
  #  name = clf.__class__.__name__
    
   # print("="*30)
    #print(name)
    
    #print('****Results****')
   # print('current best auc score is:{}'.format(auc))
    #train_predictions = clf.predict_proba(X_test)
    #ll = log_loss(y_test, train_predictions)
    #print("Log Loss: {}".format(ll))

In [1]:
#from sklearn.model_selection import cross_validate
#from sklearn.ensemble import RandomForestClassifier
#random_forest = RandomForestClassifier()

#scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
#scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=20)

#sorted(scores.keys())
#forest_fit_time = scores['fit_time'].mean()
#forest_score_time = scores['score_time'].mean()
#forest_accuracy = scores['test_accuracy'].mean()
#forest_precision = scores['test_precision_macro'].mean()
#forest_recall = scores['test_recall_macro'].mean()
#forest_f1 = scores['test_f1_weighted'].mean()
#forest_roc = scores['test_roc_auc'].mean()