## Predictions on holdout set

In [30]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [31]:
# Load the holdout set and discard the 'Unnamed: 0' column
# (presumably a row index saved with the CSV — TODO confirm).
holdout = pd.read_csv('df_holdout_scholarjet.csv').drop(columns=['Unnamed: 0'])

### Classification

Helper functions

**miss_handler**: Handles missing data: fills `maxnps`, `minnps`, and `avgnps` with their mean (i.e. 9) and all remaining missing values with zero.<br>
**label_encoder**: Label-encodes the categorical columns.<br>
**drop_corr**: Drops highly correlated variables (correlation threshold of 0.9).<br>
**classification**: Loads the saved models, performs predictions, and returns the probabilities.

In [32]:
def miss_handler(df):
    """Return a copy of ``df`` with all missing values imputed.

    ``maxnps``, ``minnps`` and ``avgnps`` are filled with the constant 9;
    every other missing value is filled with 0. Progress messages are
    printed along the way. The input frame is left untouched.

    Raises ``KeyError`` if any of the three NPS columns is absent.
    """
    # (column, message printed once that column has been filled)
    nps_steps = (
        ('maxnps', "Imputed maxnps"),
        ('minnps', "Imputed minnps"),
        ('avgnps', "Imputed avgnps"),
    )

    imputed = df.copy()
    print("Imputing missing values")
    for column, done_message in nps_steps:
        imputed[column] = imputed[column].fillna(9)
        print(done_message)

    # Whatever is still missing after the NPS columns becomes 0.
    imputed = imputed.fillna(0)
    print("Imputing remaining columns")
    return imputed

def label_encoder(df):
    """Return a copy of ``df`` whose object-dtype columns are integer-encoded.

    Each object column is encoded independently with a scikit-learn
    ``LabelEncoder`` fitted on that column's own values; all other columns
    are returned unchanged. The input frame is not modified.
    """
    encoded = df.copy()
    print("labeling categorical columns")
    # NOTE(review): the encoders are fitted on the frame passed in, so the
    # integer codes depend on the values present in it — confirm they match
    # the codes the saved models were trained with.
    for column in encoded.select_dtypes(include='object').columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded


def drop_corr(df_todrop):
    """Return a copy of ``df_todrop`` without the fixed list of correlated columns.

    The columns to remove are hard-coded below. The input frame is not
    modified. Raises ``KeyError`` if any listed column is missing.
    """
    correlated_columns = [
        'dayssincelastord', 'cuidshare', 'numstores', 'avgnps', 'maxnps',
        'numvisittotal', 'numskusviewedone', 'numskusviewedthreeone',
        'numskusviewedseventhree', 'numskusviewedthirtyseven',
        'numskusviewedsixtythirty', 'numskusviewedyearsixty',
    ]
    reduced = df_todrop.copy().drop(columns=correlated_columns)
    print("Dropped correlated columns")
    return reduced

def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at model files from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


def classification(df_processed,
                   lgbm_path='lgbm_corr_auc0.725acc0.767.pkl',
                   xg_path='xg_corr_auc0.723acc0.765.pkl',
                   lgbm_weight=0.63,
                   xg_weight=0.37):
    """Blend the two saved classifiers and return the class-1 probabilities.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Preprocessed features. Must contain a ``cuid`` column, which is
        removed before the frame is handed to the models.
    lgbm_path, xg_path : str
        Paths of the pickled models; each must expose ``predict_proba``.
    lgbm_weight, xg_weight : float
        Weights applied to each model's probabilities before summing.

    Returns
    -------
    numpy.ndarray
        Column 1 of the weighted sum of both ``predict_proba`` outputs,
        one value per row of ``df_processed``.

    Raises
    ------
    KeyError
        If ``df_processed`` has no ``cuid`` column.
    OSError
        If a model file cannot be opened.
    """
    model_lgbm = _load_pickled_model(lgbm_path)
    model_xg = _load_pickled_model(xg_path)

    # Build the feature frame once; both models score exactly the same input.
    features = df_processed.drop(['cuid'], axis=1)
    lgbm_proba = model_lgbm.predict_proba(features)
    xg_proba = model_xg.predict_proba(features)

    # Index 1 selects the second column of the blended probability matrix.
    return (lgbm_proba * lgbm_weight + xg_proba * xg_weight)[:, 1]

In [33]:
# Classification preprocessing: impute missing values, drop the correlated
# columns, then label-encode the remaining object columns.
df_processed = (
    holdout
    .pipe(miss_handler)
    .pipe(drop_corr)
    .pipe(label_encoder)
)


Imputing missing values
Imputed maxnps
Imputed minnps
Imputed avgnps
Imputing remaining columns
Dropped correlated columns
labeling categorical columns


In [34]:
# Blended probability from the two saved classifiers, one value per holdout row.
prdic_proba_holdout = classification(df_processed)

In [36]:
# Attach the blended probability and derive a hard 0/1 label from it:
# 1 when the probability is strictly above 0.5, otherwise 0.
holdout['convert_30_prob'] = prdic_proba_holdout
holdout['convert_30'] = (holdout['convert_30_prob'] > 0.5).astype(int)

In [37]:
# Keep a second name for the frame that carries the classification columns.
# This is a reference, not a copy; `holdout` is later rebound to a freshly
# loaded frame for the regression step, which leaves this object unchanged.
predicted_conv = holdout

In [38]:
# Preview the first rows, including the new convert_30_prob / convert_30 columns.
predicted_conv.head()

Unnamed: 0,cuid,roll_up,currentstatus,companytypegroup,team,customersource,accrole,num_employees,num_purchases_year,cost_purchases_year,...,percemailclickedthreeone,percemailclickedseventhree,percemailclickedthirtyseven,percemailclickedsixtythirty,percemailclickedyearsixty,currentapplicability,numemaillist,dayssinceenrollment,convert_30_prob,convert_30
0,16838,Onboarding,Enrolled,Business,US,Internal Application,,1,1to2,lessthan1,...,0.0,0.0,0.021739,0.012821,0.032258,5.0,2.0,86,0.425504,0
1,532175,Onboarding,Enrolled,Business,US,Search - Paid,,6to10,,,...,0.0,0.0,0.0,0.0,0.0,,,3,0.374038,0
2,532176,Onboarding,Enrolled,Business,US,Internal Application,,11to50,3to5,1to5,...,0.0,0.0,0.0,0.0,0.015238,8.0,4.0,13,0.804787,1
3,532187,Onboarding,Enrolled,Business,US,Internal Application,Primary,,,,...,0.0,0.5,0.272727,0.25,0.0,,,10,0.751945,1
4,16938,Onboarding,Enrolled,Trade,US,Internal Customer Scrape,Primary,,,,...,0.0,0.0,0.0,0.043478,0.016461,5.0,2.0,42,0.557763,1


### Regression

**Loading the saved pickled model**

In [40]:
# Load the pickled regression model. The context manager closes the file
# handle as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('reg_stacked.pkl', 'rb') as reg_file:
    loaded_reg = pickle.load(reg_file)



In [41]:
# Re-read the holdout CSV: the frame loaded earlier had convert_30_prob and
# convert_30 added to it, so the regression step starts again from the raw columns.
holdout = pd.read_csv('df_holdout_scholarjet.csv').drop('Unnamed: 0',axis=1)

**Preprocessing**

In [42]:
# Regression preprocessing: handle missing values, then label-encode.
# Unlike the classification pipeline, no correlated columns are dropped here.
df_processed = holdout.pipe(miss_handler).pipe(label_encoder)


Imputing missing values
Imputed maxnps
Imputed minnps
Imputed avgnps
Imputing remaining columns
labeling categorical columns


**Predictions**

In [44]:
# Predict on every column except `cuid`, then map the raw output through expm1
# (presumably the regressor was trained on a log1p-transformed target — TODO confirm).
preds_e = np.expm1(loaded_reg.predict(df_processed.drop(columns=['cuid'])))

In [45]:
# Collect the submission columns as plain NumPy arrays. They are combined by
# position: `holdout` and `predicted_conv` are two separately loaded copies of
# the same CSV, so row i refers to the same customer in both.
cuid = np.array(holdout['cuid'])
pred_convert_30 = np.array(predicted_conv['convert_30_prob'])
conv_predicted = np.array(predicted_conv['convert_30'])
rev_predicted = np.array(preds_e)

In [46]:
# Intermediate submission frame: customer id, hard conversion label,
# raw revenue prediction and conversion probability.
submission_beta = pd.DataFrame(dict(
    cuid=cuid,
    conv_predicted=conv_predicted,
    rev_predicted=rev_predicted,
    pred_convert_30=pred_convert_30,
))

submission_beta.head()


Unnamed: 0,conv_predicted,cuid,pred_convert_30,rev_predicted
0,0,16838,0.425504,160.654454
1,0,532175,0.374038,596.881632
2,1,532176,0.804787,320.853348
3,1,532187,0.751945,841.311249
4,1,16938,0.557763,376.818398


**Adjusting the revenue predictions using the classification result (revenue is set to 0 for customers predicted not to convert)**

In [47]:
# Customers predicted not to convert get a revenue of 0; everyone else keeps
# the regression prediction.
not_converted = submission_beta['conv_predicted'] == 0

# 0.0 for predicted non-converters, NaN as a placeholder for predicted converters.
# np.where with numeric outputs keeps the column float64; mixing a string
# default into np.select would turn every value into text.
submission_beta['adjusted_rev'] = np.where(not_converted, 0.0, np.nan)

# Replace the NaN placeholders with the regression model's revenue prediction.
submission_beta['pred_revenue_30'] = submission_beta['adjusted_rev'].fillna(submission_beta['rev_predicted'])

In [48]:
# Keep only the columns that go into the submission file.
helper_columns = ['conv_predicted', 'rev_predicted', 'adjusted_rev']
submission = submission_beta.drop(columns=helper_columns)

In [49]:
# Preview the final columns: cuid, pred_convert_30, pred_revenue_30.
submission.head()

Unnamed: 0,cuid,pred_convert_30,pred_revenue_30
0,16838,0.425504,0.0
1,532175,0.374038,0.0
2,532176,0.804787,320.853
3,532187,0.751945,841.311
4,16938,0.557763,376.818


In [51]:
# Write only the submission columns. Without index=False the positional row
# index is saved as an extra unnamed column, which reads back as 'Unnamed: 0'.
submission.to_csv('submission.csv', index=False)