In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv('../data/bikeshare.csv', parse_dates=['datetime'])

In [3]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [24]:
df['hour'] = df['datetime'].dt.hour

In [4]:
pipe = make_pipeline(OrdinalEncoder(), GradientBoostingRegressor())

In [25]:
X = df.drop(['count', 'datetime'], axis=1)
y = df['count']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

In [7]:
X_test.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
10881,2012-12-19 19:00:00,Winter,0,1,Clear Skies,15.58,19.695,50,26.0027
10882,2012-12-19 20:00:00,Winter,0,1,Clear Skies,14.76,17.425,57,15.0013
10883,2012-12-19 21:00:00,Winter,0,1,Clear Skies,13.94,15.91,61,15.0013
10884,2012-12-19 22:00:00,Winter,0,1,Clear Skies,13.94,17.425,61,6.0032
10885,2012-12-19 23:00:00,Winter,0,1,Clear Skies,13.12,16.665,66,8.9981


In [8]:
X_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0


In [27]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=False, test_size=0.2)

In [28]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['season', 'weather'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'season',
                                          'data_type': dtype('O'),
                                          'mapping': Spring    1
Summer    2
Fall      3
Winter    4
NaN      -2
dtype: int64},
                                         {'col': 'weather',
                                          'data_type': dtype('O'),
                                          'mapping': Clear Skies          1
Partly Cloudy        2
Light Storms/Rain    3
Hea...
                                           learning_rate=0.1, loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                           

In [18]:
OrdinalEncoder().fit_transform(X_train)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,0,1,9.84,14.395,81,0.0000
1,1,0,0,1,9.02,13.635,80,0.0000
2,1,0,0,1,9.02,13.635,80,0.0000
3,1,0,0,1,9.84,14.395,75,0.0000
4,1,0,0,1,9.84,14.395,75,0.0000
...,...,...,...,...,...,...,...,...
6961,2,0,0,1,18.04,21.970,26,15.0013
6962,2,0,0,1,20.50,24.240,25,15.0013
6963,2,0,0,1,20.50,24.240,25,19.0012
6964,2,0,0,1,22.14,25.760,24,16.9979


In [29]:
pipe.score(X_val, y_val)

0.6269555891884147

In [23]:
cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=10)

array([ 0.03522124, -1.69049762,  0.16279571,  0.13616402,  0.24149235,
        0.23107303,  0.26061045,  0.23318406,  0.19504356,  0.16987976])

In [34]:
pipe.steps[1][1].set_params()

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [36]:
max_depth = [3, 4, 5]
num_trees = [100, 250, 500]
cv_scores = []

for depth in max_depth:
    for tree in num_trees:
        pipe.steps[1][1].set_params(n_estimators=tree, max_depth=depth)
        pipe.fit(X_train, y_train)
        val_score = pipe.score(X_val, y_val)
        cv_dict = {'score': val_score, 'max_depth': depth, 'n_estimators': tree}
        cv_scores.append(cv_dict)

max_params = max(cv_scores, key=lambda x: x['score'])
pipe.steps[1][1].set_params(max_depth=max_params['max_depth'], n_estimators=max_params['n_estimators'])
pipe.fit(X_train, y_train)

0.6269133458248184 3 100
0.6772483941175602 3 250
0.6856282145111079 3 500
0.6726237746278347 4 100
0.692995813678542 4 250
0.6967242514224632 4 500
0.695383675337404 5 100
0.69947327255854 5 250
0.6994645232441364 5 500


In [41]:
X_train = pd.concat([X_train, X_val])

In [42]:
y_train = pd.concat([y_train, y_val])

In [43]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['season', 'weather'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'season',
                                          'data_type': dtype('O'),
                                          'mapping': Spring    1
Summer    2
Fall      3
Winter    4
NaN      -2
dtype: int64},
                                         {'col': 'weather',
                                          'data_type': dtype('O'),
                                          'mapping': Clear Skies          1
Partly Cloudy        2
Light Storms/Rain    3
Hea...
                                           learning_rate=0.1, loss='ls',
                                           max_depth=5, max_features=None,
                                           max_leaf_nodes=None,
                           

In [38]:
max(cv_scores, key=lambda x: x['score'])

{'score': 0.69947327255854, 'max_depth': 5, 'n_estimators': 250}

In [44]:
pipe.score(X_test, y_test)

0.6845302505757571

In [45]:
df = pd.read_csv('../data/ks2.csv', encoding='utf-8', parse_dates=['launched', 'deadline'])

In [48]:
cat_avgs = df.groupby('category')[['goal']].mean().rename({'goal': 'cat_avg_goal'}, axis=1)

In [49]:
cat_avgs

Unnamed: 0_level_0,cat_avg_goal
category,Unnamed: 1_level_1
3D Printing,47096.558771
Academic,55365.812915
Accessories,14737.425295
Action,320007.524411
Animals,10788.377328
...,...
Woodworking,11655.184791
Workshops,9739.618038
World Music,28711.839405
Young Adult,40515.781588


In [50]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,US,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,US,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,failed,US,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,2015-07-04 08:35:03,failed,US,19500.0


In [52]:
df = df.merge(cat_avgs, left_on='category', right_index=True)

In [53]:
df['cat_goal_pct'] = df['goal'] / df['cat_avg_goal']

In [54]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal,cat_avg_goal,cat_goal_pct
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,5213.996468,0.294199
232,1001028167,Steel Cathedrals- Short poems for the digital ...,Poetry,Publishing,GBP,2013-07-28,2013-06-28 23:17:04,failed,GB,6060.97,5213.996468,1.162442
318,1001468086,Bass River Press,Poetry,Publishing,USD,2015-04-05,2015-03-06 19:58:58,failed,US,2000.0,5213.996468,0.383583
414,1001991458,Poems For Apostates & tales of a young Sciento...,Poetry,Publishing,USD,2014-05-31,2014-05-01 17:47:49,failed,US,10000.0,5213.996468,1.917915
517,1002519316,"Your poem, by me.",Poetry,Publishing,CAD,2015-10-31,2015-10-01 20:00:32,failed,CA,757.52,5213.996468,0.145286


In [55]:
main_cats = df.groupby('main_category')[['goal']].mean().reset_index().rename({'goal': 'main_cat_goal_avg'}, axis=1)

In [56]:
df = df.merge(main_cats, on='main_category')

In [57]:
df['main_goal_pct'] = df['goal'] / df['main_cat_goal_avg']

In [58]:
df[['goal', 'main_cat_goal_avg', 'main_goal_pct']].head()

Unnamed: 0,goal,main_cat_goal_avg,main_goal_pct
0,1533.95,22590.745149,0.067902
1,6060.97,22590.745149,0.268294
2,2000.0,22590.745149,0.088532
3,10000.0,22590.745149,0.442659
4,757.52,22590.745149,0.033532


In [65]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']

In [64]:
df = extract_dates(df)

In [70]:
scores = get_val_scores(pipe, X, y, random_state=1985, stratify=True, use_kfold=False)

In [63]:
from utils import get_val_scores, extract_dates

In [69]:
pipe = make_pipeline(OrdinalEncoder(), xgb.XGBClassifier())

In [68]:
import xgboost as xgb

In [71]:
scores

{'validation_score': 0.6918833195552782}

In [72]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal,...,launched_quarter,launched_week,launched_weekofyear,launched_day,launched_hour,launched_minute,launched_month,launched_year,deadline_history_days,launched_history_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95,...,3,33,33,11,12,12,8,2015,2350,16658
1,1001028167,Steel Cathedrals- Short poems for the digital ...,Poetry,Publishing,GBP,2013-07-28,2013-06-28 23:17:04,failed,GB,6060.97,...,2,26,26,28,23,17,6,2013,1547,15884
2,1001468086,Bass River Press,Poetry,Publishing,USD,2015-04-05,2015-03-06 19:58:58,failed,US,2000.0,...,1,10,10,6,19,58,3,2015,2163,16500
3,1001991458,Poems For Apostates & tales of a young Sciento...,Poetry,Publishing,USD,2014-05-31,2014-05-01 17:47:49,failed,US,10000.0,...,2,18,18,1,17,47,5,2014,1854,16191
4,1002519316,"Your poem, by me.",Poetry,Publishing,CAD,2015-10-31,2015-10-01 20:00:32,failed,CA,757.52,...,4,40,40,1,20,0,10,2015,2372,16709


In [76]:
weekly_totals = df.groupby(['launched_year', 'launched_weekofyear'])[['ID']].count().reset_index().rename({'ID': 'Weekly_Total_Count'}, axis=1)

In [77]:
weekly_totals.head()

Unnamed: 0,launched_year,launched_weekofyear,Weekly_Total_Count
0,1970,1,6
1,2009,17,4
2,2009,18,22
3,2009,19,17
4,2009,20,23


In [78]:
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'launched', 'state', 'country', 'goal', 'cat_avg_goal', 'cat_goal_pct',
       'main_cat_goal_avg', 'main_goal_pct', 'deadline_dayofweek',
       'deadline_dayofyear', 'deadline_days_in_month', 'deadline_is_leap_year',
       'deadline_is_month_end', 'deadline_is_month_start',
       'deadline_is_quarter_end', 'deadline_is_quarter_start',
       'deadline_is_year_end', 'deadline_is_year_start', 'deadline_quarter',
       'deadline_week', 'deadline_weekofyear', 'deadline_day', 'deadline_hour',
       'deadline_minute', 'deadline_month', 'deadline_year',
       'launched_dayofweek', 'launched_dayofyear', 'launched_days_in_month',
       'launched_is_leap_year', 'launched_is_month_end',
       'launched_is_month_start', 'launched_is_quarter_end',
       'launched_is_quarter_start', 'launched_is_year_end',
       'launched_is_year_start', 'launched_quarter', 'launched_week',
       'launched_weekofyear', 'laun

In [80]:
df = df.merge(weekly_totals, on=['launched_year', 'launched_weekofyear'])

In [81]:
X = df.drop(['deadline', 'launched', 'state'], axis=1)
y = df['state']