- **Let us see how well our model would perform if we had deployed it at the end of 2018**
- **i.e., let us test our model on the 2019 data**

In [1]:
import numpy as np
import pandas as pd 
import  category_encoders as ce
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the merged CSV loaded below; the path is relative, so it depends on
# the notebook's working directory.
data_path = "../data/notebooks/4_merged_data.csv"

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning that chunked inference emits for them).
df_raw = pd.read_csv(data_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Work on a copy so the frame loaded into df_raw is left untouched.
df = df_raw.copy()

In [5]:
# Columns kept by pre_proc. 'launched_at' is only used there to order the rows
# and is dropped afterwards; 'status' is the label split off in get_model_data.
cols = [
    'launched_at',
    'status',
    'days_to_deadline',
    'goal',
    'sub_category',
    'category',
    'blurb_length',
    'location_country',
    'rewards_mean',
    'rewards_median',
    'rewards_variance',
    'rewards_SD',
    'rewards_MIN',
    'rewards_MAX',
    'rewards_NUM',
    'currency',
    'launch_year',
    'launch_month',
    'deadline_month',
]

# Categorical columns: Helmert-encoded in helmert_categ and removed from the
# frame after one-hot encoding in onehot_categ.
target_encoding_cols = [
    'location_country',
    'currency',
    'category',
    'sub_category',
]

# Launch years used to fit the model, and the held-out year used to score it.
train_years = [2016, 2017, 2018]
valid_years = [2019]


In [6]:
def pre_proc(df):
    """Select the modelling columns, clean them and binarize the label.

    Steps:
      1. keep only the columns listed in the module-level ``cols``;
      2. drop rows with a missing ``rewards_MIN`` or ``blurb_length``;
      3. order rows by launch date (time of day is discarded), then drop
         ``launched_at`` and renumber the index from 0;
      4. replace ``status`` with the output of ``LabelBinarizer``
         (assumes ``status`` holds exactly two classes - TODO confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    frame = (
        df[cols]
        .dropna(axis=0, subset=["rewards_MIN", "blurb_length"])
        .reset_index(drop=True)
    )

    # Sort on the calendar date only, matching the granularity used originally.
    frame["launched_at"] = pd.to_datetime(frame["launched_at"]).dt.date
    frame = (
        frame.sort_values("launched_at")
        .drop(columns=["launched_at"])
        .reset_index(drop=True)
    )

    frame["status"] = LabelBinarizer().fit_transform(frame["status"])
    return frame


    

In [7]:
def onehot_categ(df):
    """One-hot encode the categorical columns and drop the originals.

    The encoder is fitted on the frame it is given, and the new indicator
    columns are named after the raw category values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'category', 'sub_category', 'currency' and
        'location_country'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four categorical columns replaced by their
        indicator columns; the input is not modified.
    """
    cat_cols = ['category', 'sub_category', 'currency', 'location_country']

    # Use the encoder's default sparse output and densify it explicitly: the
    # `sparse=` keyword was renamed to `sparse_output=` in newer scikit-learn
    # releases, so passing either name breaks on some versions.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df[cat_cols]).toarray()

    # NOTE(review): names are the raw category values, so a value shared by two
    # columns would produce duplicate column names - confirm none exist.
    onehotcols = [value for categories in encoder.categories_ for value in categories]

    # Reuse df's index so concat lines rows up by position even when df does
    # not carry a fresh 0..n-1 index (otherwise rows misalign and NaNs appear).
    X_hot = pd.DataFrame(encoded, columns=onehotcols, index=df.index)

    # Drop exactly the columns that were encoded.
    return pd.concat([df, X_hot], axis=1).drop(columns=cat_cols)
    

In [8]:
def get_model_data(df, train_years, valid_years):
    """Split a frame into train/validation features and labels by launch year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'launch_year' and 'status' columns.
    train_years : list
        Launch years whose rows go into the training split.
    valid_years : list
        Launch years whose rows go into the validation split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``. The X frames have the
        'status' and 'launch_year' columns removed; the y series are the
        'status' column of the matching rows.
    """
    # Series.isin is vectorised and always yields a boolean mask, including for
    # an empty frame, where a row-wise apply does not give a usable mask.
    train_mask = df['launch_year'].isin(train_years)
    valid_mask = df['launch_year'].isin(valid_years)

    df_train = df.loc[train_mask]
    df_valid = df.loc[valid_mask]

    non_feature_cols = ["status", "launch_year"]
    X_train, y_train = df_train.drop(columns=non_feature_cols), df_train['status']
    X_valid, y_valid = df_valid.drop(columns=non_feature_cols), df_valid['status']

    return X_train, y_train, X_valid, y_valid

    

In [9]:
def helmert_categ(df_train, df_valid):
    """Helmert-encode the categorical columns of both splits.

    The encoder is fitted on ``df_train`` only and then applied to
    ``df_valid``. In each split the columns listed in the module-level
    ``target_encoding_cols`` are replaced by the encoder's output columns.

    Parameters
    ----------
    df_train, df_valid : pandas.DataFrame
        Feature frames that both contain ``target_encoding_cols``.

    Returns
    -------
    tuple
        ``(df_train, df_valid)`` as new frames; the inputs are not modified.
    """
    helmert = ce.HelmertEncoder(cols=target_encoding_cols, drop_invariant=True)

    train_codes = helmert.fit_transform(df_train[target_encoding_cols])
    valid_codes = helmert.transform(df_valid[target_encoding_cols])

    train_out = pd.concat([df_train, train_codes], axis=1).drop(columns=target_encoding_cols)
    valid_out = pd.concat([df_valid, valid_codes], axis=1).drop(columns=target_encoding_cols)

    return train_out, valid_out

In [10]:
from xgboost import XGBClassifier
import operator

def XG_score(X_train, X_test, y_train, y_test, n_estimators=150, random_state=9):
    """Fit an XGBoost classifier and report its score and feature importances.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the column names label the importances.
    X_test :
        Features the fitted model is scored on.
    y_train, y_test :
        Labels matching ``X_train`` and ``X_test``.
    n_estimators : int, default 150
        Passed to ``XGBClassifier``.
    random_state : int, default 9
        Passed to ``XGBClassifier``.

    Returns
    -------
    tuple
        ``(model, score, importances)``: the fitted classifier, the value of
        ``model.score(X_test, y_test)``, and a list of
        ``(feature, importance)`` pairs sorted by importance, highest first.
    """
    model = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    # Named `accuracy`, not `XG_score`, to avoid shadowing this function's name.
    accuracy = model.score(X_test, y_test)

    importance_by_feature = dict(zip(X_train.columns.values, model.feature_importances_))
    ranked_importances = sorted(
        importance_by_feature.items(), key=operator.itemgetter(1), reverse=True
    )

    return (model, accuracy, ranked_importances)


In [11]:
# One-hot pipeline: preprocess, one-hot encode the whole frame, then split it
# into training years and the validation year.
df_proc = pre_proc(df)
df_onehot = onehot_categ(df_proc)
(
    X_train_oh,
    y_train_oh,
    X_valid_oh,
    y_valid_oh,
) = get_model_data(df_onehot, train_years=train_years, valid_years=valid_years)

In [12]:
# Helmert pipeline: preprocess, split by year first, then fit the encoder on
# the training split only and apply it to the validation split.
df_proc = pre_proc(df)
(
    X_train_raw,
    y_train_hel,
    X_valid_raw,
    y_valid_hel,
) = get_model_data(df_proc, train_years=train_years, valid_years=valid_years)
X_train_hel, X_valid_hel = helmert_categ(df_train=X_train_raw, df_valid=X_valid_raw)


In [13]:
# Train on the one-hot encoded training years and score on the validation year.
XG_model_oh, XG_scores_oh, XG_fet_imp_oh = XG_score(
    X_train=X_train_oh,
    X_test=X_valid_oh,
    y_train=y_train_oh,
    y_test=y_valid_oh,
)
print("Score using OneHot encodinng: {}".format(XG_scores_oh))

Score using OneHot encodinng: 0.8151040114442392


In [14]:
# Train on the Helmert-encoded training years and score on the validation year.
XG_model_hel, XG_scores_hel, XG_fet_imp_hel = XG_score(
    X_train=X_train_hel,
    X_test=X_valid_hel,
    y_train=y_train_hel,
    y_test=y_valid_hel,
)
print("Score using Helmert encodinng: {}".format(XG_scores_hel))

Score using Helmert encodinng: 0.8627287357692078


- **This is great: our test accuracy is greater than our validation accuracy. Usually this would be a red flag, but since no decision during modeling or preprocessing was made based on the 2019 (test) data, it is fine.**
- **In the next notebook we will train the model on the entire dataset and save the model**