# Hurdle modelling

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [32]:
base_data = pd.read_csv("./test_param_performance.csv")
base_data['postal_code'] = base_data['postal_code'].astype(str)
dem_data = pd.read_csv("../data/non_colinear_demographic_data.csv")
dem_data['postal_code'] = dem_data['postal_code'].astype(int)
dem_data['postal_code'] = dem_data['postal_code'].astype(str)
emp_data = pd.read_csv("../data/non_colinear_employment_data.csv")
emp_data['postal_code'] = emp_data['postal_code'].astype(int)
emp_data['postal_code'] = emp_data['postal_code'].astype(str)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [33]:
device_weights = pd.read_csv("../data/device_type_weights_weights.csv", index_col=0)
t_of_day_weights = pd.read_csv("../data/time_of_day_weights.csv", index_col=0)

## get device score

In [34]:
class DeviceTypeScore:
    
    def __init__(self, weights):
        self.weights = weights
        
    def run(self, input_data):
        data = input_data.copy()
        return data['device_type'].map(self.weights)   

In [35]:
device_type_weights = dict(zip(device_weights.index, device_weights['device_type_weights'].values))
device_score = DeviceTypeScore(device_weights['device_type_weights']).run(base_data)

In [36]:
base_data['device_score'] = device_score
base_data.drop('device_type', inplace=True, axis=1)

## get time of week score

In [37]:
class TimeOfDayScore:
    
    def __init__(self, weights):
        self.weights = weights
        
    def run(self, input_data):
        data = input_data.copy()
        data['hour_of_week'] = data['weekday_user_tz'] * 24 + data['hour_user_tz']
        data['hour_of_week_score'] = data['hour_of_week'].map(self.weights)
        
        return data['hour_of_week_score']
        

In [38]:
t_of_day_type_weights = dict(zip(t_of_day_weights.index, t_of_day_weights['time_of_day_weights'].values))
t_of_day_score = TimeOfDayScore(t_of_day_weights['time_of_day_weights']).run(base_data)

In [39]:
base_data['time_of_day_score'] = t_of_day_score
base_data.drop(['weekday_user_tz', 'hour_user_tz'], inplace=True, axis=1)

## drop unused columns

In [40]:
base_data.drop(['advertiser_id', 'insertion_order_id', 'country', 'region', 'city',
                'dma', 'pixel_id', 'clicks', 'booked_revenue_adv_curr', 'booked_revenue', 'conversions',
                'rpm', 'rpa_adv_curr'], 
               inplace=True, axis=1)

## aggregate columns

In [41]:
agg_scores = base_data.groupby(['postal_code', 'line_item_id']).agg({'impressions': 'sum', 'ctr': 'mean', 'device_score': 'mean',
                                     'time_of_day_score': 'mean'})

In [42]:
agg_scores.reset_index(inplace=True)

## add demograaphic data

In [43]:
agg_scores_plus_dem = pd.merge(agg_scores, dem_data.drop(['ctr'], axis=1), left_on='postal_code', right_on='postal_code', how='inner')

In [44]:
agg_scores_plus_dem

Unnamed: 0.1,postal_code,line_item_id,impressions,ctr,device_score,time_of_day_score,Unnamed: 0,Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Estimate!!RACE!!Total population,Estimate!!RACE!!Total population!!Two or more races,...,Estimate!!Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Puerto Rican,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Cuban,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)!!Other Hispanic or Latino,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Some other race alone,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race,Estimate!!SEX AND AGE!!Total population!!Sex ratio (males per 100 females),Estimate!!SEX AND AGE!!Total population!!15 to 19 years,Estimate!!SEX AND AGE!!Total population!!20 to 24 years,Estimate!!SEX AND AGE!!Total population!!Median age (years)
0,48001,13917115,739,0.000000,99.108171,0.296010,1,87.0,11885.0,44.0,...,12.0,1.0,0.0,37.0,0.0,0.0,103.8,507.0,547.0,48.3
1,48001,13917143,80,0.000000,60.918521,0.283020,1,87.0,11885.0,44.0,...,12.0,1.0,0.0,37.0,0.0,0.0,103.8,507.0,547.0,48.3
2,48001,13917188,416,0.002033,69.523048,0.291273,1,87.0,11885.0,44.0,...,12.0,1.0,0.0,37.0,0.0,0.0,103.8,507.0,547.0,48.3
3,48001,13917210,696,0.001330,71.987135,0.307439,1,87.0,11885.0,44.0,...,12.0,1.0,0.0,37.0,0.0,0.0,103.8,507.0,547.0,48.3
4,48001,13917281,58,0.000000,59.259259,0.304817,1,87.0,11885.0,44.0,...,12.0,1.0,0.0,37.0,0.0,0.0,103.8,507.0,547.0,48.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34821,49971,14951140,435,0.000000,55.656377,0.286403,1047,111.9,443.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,117.2,20.0,23.0,59.0
34822,49971,15269657,2103,0.001170,99.016044,0.295361,1047,111.9,443.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,117.2,20.0,23.0,59.0
34823,49971,15269658,2551,0.000000,64.863869,0.279029,1047,111.9,443.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,117.2,20.0,23.0,59.0
34824,49971,15625466,46,0.000000,84.615385,0.312838,1047,111.9,443.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,117.2,20.0,23.0,59.0


In [45]:
agg_scores_full = pd.merge(agg_scores_plus_dem, emp_data.drop(['ctr'], axis=1), 
                           left_on='postal_code', right_on='postal_code', how='inner')

In [46]:
agg_scores_full = agg_scores_full.loc[:, ~agg_scores_full.columns.str.contains('^Unnamed')]
agg_scores_full

Unnamed: 0,postal_code,line_item_id,impressions,ctr,device_score,time_of_day_score,Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Estimate!!RACE!!Total population,Estimate!!RACE!!Total population!!Two or more races,Estimate!!RACE!!Total population!!One race!!Black or African American,...,"Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Sales and office occupations!!Sales and related occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Sales and office occupations!!Office and administrative support occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Farming, fishing, and forestry occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Construction and extraction occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Installation, maintenance, and repair occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Production occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Transportation occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Material moving occupations"
0,48001,13917115,739,0.000000,99.108171,0.296010,87.0,11885.0,44.0,7.0,...,34.1,73.3,5.6,0.0,1.7,13.7,21.6,22.8,0.0,26.6
1,48001,13917143,80,0.000000,60.918521,0.283020,87.0,11885.0,44.0,7.0,...,34.1,73.3,5.6,0.0,1.7,13.7,21.6,22.8,0.0,26.6
2,48001,13917188,416,0.002033,69.523048,0.291273,87.0,11885.0,44.0,7.0,...,34.1,73.3,5.6,0.0,1.7,13.7,21.6,22.8,0.0,26.6
3,48001,13917210,696,0.001330,71.987135,0.307439,87.0,11885.0,44.0,7.0,...,34.1,73.3,5.6,0.0,1.7,13.7,21.6,22.8,0.0,26.6
4,48001,13917281,58,0.000000,59.259259,0.304817,87.0,11885.0,44.0,7.0,...,34.1,73.3,5.6,0.0,1.7,13.7,21.6,22.8,0.0,26.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34821,49971,14951140,435,0.000000,55.656377,0.286403,111.9,443.0,0.0,0.0,...,50.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34822,49971,15269657,2103,0.001170,99.016044,0.295361,111.9,443.0,0.0,0.0,...,50.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34823,49971,15269658,2551,0.000000,64.863869,0.279029,111.9,443.0,0.0,0.0,...,50.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34824,49971,15625466,46,0.000000,84.615385,0.312838,111.9,443.0,0.0,0.0,...,50.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## rescale numeric data

In [47]:
agg_scores_full.drop('postal_code', axis=1, inplace=True)

In [48]:
scaler = StandardScaler()
agg_scores_full[agg_scores_full.drop(['ctr', 'impressions', 'line_item_id'], axis=1).columns] = scaler.fit_transform(agg_scores_full.drop(['ctr', 'impressions','line_item_id'], axis=1))


In [49]:
agg_scores_full

Unnamed: 0,line_item_id,impressions,ctr,device_score,time_of_day_score,Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Estimate!!RACE!!Total population,Estimate!!RACE!!Total population!!Two or more races,Estimate!!RACE!!Total population!!One race!!Black or African American,Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native,...,"Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Sales and office occupations!!Sales and related occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Sales and office occupations!!Office and administrative support occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Farming, fishing, and forestry occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Construction and extraction occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Installation, maintenance, and repair occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Production occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Transportation occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Material moving occupations"
0,13917115,739,0.000000,1.222639,-0.000856,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.279876,-0.026814,0.279487,-0.44573,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466
1,13917143,80,0.000000,-1.079867,-0.254971,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.279876,-0.026814,0.279487,-0.44573,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466
2,13917188,416,0.002033,-0.561089,-0.093528,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.279876,-0.026814,0.279487,-0.44573,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466
3,13917210,696,0.001330,-0.412525,0.222742,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.279876,-0.026814,0.279487,-0.44573,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466
4,13917281,58,0.000000,-1.179906,0.171441,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.279876,-0.026814,0.279487,-0.44573,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34821,14951140,435,0.000000,-1.397129,-0.188801,0.826756,-0.852273,-0.660513,-0.320756,-0.359004,...,0.470773,1.177112,-0.583726,-0.44573,-0.376215,-0.440754,-1.617513,-1.433376,-0.665735,-0.994446
34822,15269657,2103,0.001170,1.217085,-0.013549,0.826756,-0.852273,-0.660513,-0.320756,-0.359004,...,0.470773,1.177112,-0.583726,-0.44573,-0.376215,-0.440754,-1.617513,-1.433376,-0.665735,-0.994446
34823,15269658,2551,0.000000,-0.841997,-0.333067,0.826756,-0.852273,-0.660513,-0.320756,-0.359004,...,0.470773,1.177112,-0.583726,-0.44573,-0.376215,-0.440754,-1.617513,-1.433376,-0.665735,-0.994446
34824,15625466,46,0.000000,0.348849,0.328375,0.826756,-0.852273,-0.660513,-0.320756,-0.359004,...,0.470773,1.177112,-0.583726,-0.44573,-0.376215,-0.440754,-1.617513,-1.433376,-0.665735,-0.994446


## one hot encode line item id

In [50]:
agg_scores_full['line_item_id'].value_counts()

15269658    1034
13917402    1034
14602750    1033
13934058    1033
14951140    1032
15269657    1032
14351600    1023
14570229    1010
14950582    1002
14351599    1002
16344147    1002
14345556     998
13917210     998
13971302     997
13935850     996
13917704     992
13917415     991
13917115     983
14273478     981
13917188     979
14788453     969
13917317     961
15625466     952
14004082     943
14345602     929
14351602     902
13917143     897
13917281     818
13917562     817
13917702     792
14004079     701
14004080     689
16423703     671
14788452     655
13917624     654
13917701     588
13917703     543
13917595     504
14788454     463
14788451     226
Name: line_item_id, dtype: int64

In [51]:
agg_scores_full = agg_scores_full[agg_scores_full['line_item_id'] == 15269658]

In [52]:
#one_hot = pd.get_dummies(agg_scores_full['line_item_id'])
#agg_scores_full = agg_scores_full.drop('line_item_id',axis = 1)
#agg_scores_full = agg_scores_full.join(one_hot)
#agg_scores_full

## hurdle model part 1 classification

### Train classifier to reduce the number of 0 values
#### split into train, test and validation set

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
import xgboost as xgb

In [54]:
agg_scores_full.loc[agg_scores_full['ctr'] > 0, 'strat_column'] = 1
agg_scores_full.loc[agg_scores_full['ctr'] <= 0, 'strat_column'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [55]:
agg_scores_full

Unnamed: 0,line_item_id,impressions,ctr,device_score,time_of_day_score,Estimate!!SEX AND AGE!!Total population!!65 years and over!!Sex ratio (males per 100 females),Estimate!!RACE!!Total population,Estimate!!RACE!!Total population!!Two or more races,Estimate!!RACE!!Total population!!One race!!Black or African American,Estimate!!RACE!!Total population!!One race!!American Indian and Alaska Native,...,"Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Sales and office occupations!!Office and administrative support occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Farming, fishing, and forestry occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Construction and extraction occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations!!Installation, maintenance, and repair occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Production occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Transportation occupations","Estimate!!Percent Female!!Full-time, year-round civilian employed population 16 years and over!!Production, transportation, and material moving occupations!!Material moving occupations",strat_column
34,15269658,39865,0.000620,-0.382208,-0.174928,0.019635,0.066633,-0.568713,-0.319311,-0.010761,...,-0.026814,0.279487,-0.445730,-0.133003,1.894039,0.151597,0.015915,-0.665735,0.292466,1.0
66,15269658,7286,0.000000,-0.478671,-0.225354,0.087705,-0.628128,-0.547849,-0.320756,-0.400793,...,0.748749,0.325730,-0.445730,-0.376215,2.439392,0.200739,-0.073077,2.474168,-0.147793,0.0
100,15269658,17058,0.000121,-0.480198,-0.255685,0.716546,-0.386716,-0.595835,-0.320344,-0.345074,...,0.018277,-0.275436,0.998352,-0.376215,-0.440754,-0.118684,-0.842218,1.163815,1.061711,1.0
139,15269658,19650,0.000284,-0.532602,-0.274508,-0.369339,-0.446146,-0.531158,-0.313529,-0.289356,...,0.730713,0.017440,3.599317,-0.376215,-0.440754,-0.118684,-0.587956,-0.665735,1.463266,1.0
176,15269658,9309,0.001492,-0.464851,-0.281788,0.308124,-0.553761,-0.433099,-0.317246,-0.268461,...,0.266277,-0.383337,-0.445730,-0.376215,-0.048781,0.045123,0.022271,-0.665735,0.050566,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34701,15269658,2380,0.000932,-0.792399,-0.578587,0.800824,-0.850185,-0.656340,-0.320756,-0.338110,...,0.577404,-0.583726,-0.445730,-0.376215,-0.440754,0.724920,0.155759,-0.665735,1.424562,1.0
34739,15269658,10987,0.000000,-0.462462,-0.204172,-0.330441,-0.732932,-0.616699,-0.316214,-0.379899,...,0.257258,-0.583726,-0.445730,-0.376215,-0.440754,-0.954097,-0.390904,-0.665735,-0.994446,0.0
34772,15269658,11059,0.000429,-0.572571,-0.348480,0.291917,-0.785776,-0.610440,-0.318279,1.758315,...,-0.270305,1.404745,0.711154,-0.376215,-0.440754,-1.617513,-1.433376,-0.665735,-0.994446,1.0
34794,15269658,884,0.000000,-1.259981,-0.705785,1.231937,-0.864721,-0.658426,-0.320756,-0.400793,...,-3.331974,-0.583726,-0.445730,-0.376215,-0.440754,-0.446297,0.473586,-0.665735,-0.994446,0.0


In [56]:
X_train, X_test, y_train, y_test = train_test_split(agg_scores_full.drop(
    ['ctr', 'impressions', 'strat_column'],axis = 1), agg_scores_full['ctr'], test_size=0.25, random_state=42, stratify=agg_scores_full['strat_column'])

## train classification model 

In [57]:
model = LogisticRegression(class_weight={0: np.count_nonzero(agg_scores_full['strat_column'] == 0) / len(agg_scores_full['strat_column']),
                                             1: np.count_nonzero(agg_scores_full['strat_column'] != 0) / len(agg_scores_full['strat_column'])})

In [58]:
grid = {
        'min_child_weight': [1, 10],
        'gamma': [0.5, 1, 1.5],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6],
        'max_depth': [1,3]
        }

In [59]:
xgb_model = xgb.XGBClassifier(learning_rate=0.02, n_estimators=50, objective='binary:logistic')

In [None]:
kfold = RepeatedKFold(n_splits=3, random_state = 42)

search = GridSearchCV(xgb_model, grid, scoring='recall', cv=kfold, n_jobs=-1)

eval_set = (X_test, y_test)

search.fit(X_train, y_train, verbose=True)
