In [79]:
#Predict if a Kickstarter project will succeed.
#Get the outcome from the state column. 

In [80]:
import pandas as pd
# Parse both timestamp columns as datetimes so the `.dt` accessor can be used on them later.
ks = pd.read_csv(
    'Feature_Engineering_Kickstarter_Data.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head(10)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.0,successful,16,US,1205.0,1205.0,1000.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24 18:14:43,8233.0,canceled,58,US,8233.0,8233.0,125000.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11 21:55:48,6240.57,canceled,43,US,6240.57,6240.57,65000.0


In [81]:
# Distinct project states, in order of first appearance.
ks['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [82]:
# Number of projects (non-null IDs) per state.
ks['ID'].groupby(ks['state']).count()

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

In [83]:
# Data cleaning isn't the focus here, so the target is simplified:
#   * drop projects that are still "live"
#   * count "successful" states as outcome = 1
#   * combine every other state as outcome = 0

# Keep every row whose state is anything other than "live", then re-check the counts.
ks = ks.loc[ks['state'] != 'live']
ks['ID'].groupby(ks['state']).count()

state
canceled       38779
failed        197719
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

In [84]:
# Binary target: 1 for "successful" projects, 0 for every other state.
ks = ks.assign(outcome=lambda frame: frame['state'].eq('successful').astype(int))

In [85]:
# Same target built with plain column assignment instead of `assign`;
# `outcome_1` duplicates `outcome` and only serves as a comparison.
ks['outcome_1'] = ks['state'].eq('successful').astype(int)

In [86]:
# Class balance of the target built with `assign`.
ks['ID'].groupby(ks['outcome']).count()

outcome
0    241906
1    133956
Name: ID, dtype: int64

In [87]:
# Class balance of the target built with column assignment (should match `outcome`).
ks['ID'].groupby(ks['outcome_1']).count()

outcome_1
0    241906
1    133956
Name: ID, dtype: int64

In [88]:
# Date and time components of a timestamp column are exposed through its `.dt` accessor.
# Add the launch hour, day, month and year as separate columns (in that order).
ks = ks.assign(**{
    part: getattr(ks['launched'].dt, part)
    for part in ('hour', 'day', 'month', 'year')
})

ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,...,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,outcome_1,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,...,GB,0.0,0.0,1533.95,0,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,...,US,100.0,2421.0,30000.0,0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,...,US,220.0,220.0,45000.0,0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,...,US,1.0,1.0,5000.0,0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,...,US,1283.0,1283.0,19500.0,0,0,8,4,7,2015


In [89]:
# Find the columns stored with dtype `object`; these are the candidate categorical features.
s = ks.dtypes.eq('object')      # boolean mask indexed by column name
object_cols = s[s].index.tolist()

print("Categorical variables:")
print(object_cols)

Categorical variables:
['name', 'category', 'main_category', 'currency', 'state', 'country']


In [90]:
from sklearn.preprocessing import LabelEncoder

cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

# Label-encode each categorical column in turn. The single encoder is re-fitted
# per column, so afterwards it only remembers the classes of the last column.
encoded = pd.DataFrame(
    {column: encoder.fit_transform(ks[column]) for column in cat_features},
    index=ks.index,
)
encoded.head(10)

Unnamed: 0,category,currency,country
0,108,5,9
1,93,13,22
2,93,13,22
3,90,13,22
4,55,13,22
5,123,13,22
6,58,13,22
7,41,13,22
8,113,13,22
9,39,13,22


In [91]:
# Model input: numeric/date features plus the target, joined (on the index)
# with the label-encoded categorical columns.
base_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data = ks[base_cols].join(encoded)
data.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


In [92]:
# Create training, validation and test sets by slicing the rows in order:
# the last 10% of rows is the test set, the 10% before that is the validation
# set, and the remaining first 80% is used for training.
#
# A model that is meant to predict future events must also be validated on
# events in the future. If rows are mixed between the training and test sets,
# future data leaks into the model and the validation results overestimate
# the performance on new data.
#
# NOTE(review): the slices follow the current row order of `data`; no cell above
# sorts the rows by `launched`, so confirm the file order is chronological
# before treating this as a time-based split.

valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

holdout_start = -2 * valid_size   # first row of the validation + test tail
train = data[:holdout_start]
valid = data[holdout_start:-valid_size]
test = data[-valid_size:]

In [93]:
# Check that the three splits contain a similar proportion of successful projects.
for subset in (train, valid, test):
    print(f"Outcome fraction = {subset['outcome'].mean():.4f}")

Outcome fraction = 0.3570
Outcome fraction = 0.3539
Outcome fraction = 0.3542


In [94]:
%%time
#Cell magics %% should start from the first line by convention.

#Training a LightGBM model
# This is a tree-based model that typically provides the best performance, even compared to XGBoost. 
#It's also relatively fast to train

#LightGBM models work with label encoded features, don't actually need to one-hot encode the categorical features.

import lightgbm as lgb

feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], early_stopping_rounds=10, verbose_eval=20)

Training until validation scores don't improve for 10 rounds
[20]	valid_0's auc: 0.72501
[40]	valid_0's auc: 0.73744
[60]	valid_0's auc: 0.741632
[80]	valid_0's auc: 0.743558
[100]	valid_0's auc: 0.74473
[120]	valid_0's auc: 0.74634
[140]	valid_0's auc: 0.746899
[160]	valid_0's auc: 0.747479
[180]	valid_0's auc: 0.747528
Early stopping, best iteration is:
[171]	valid_0's auc: 0.747591
Wall time: 5.13 s


In [95]:
from sklearn import metrics
# Score the held-out test rows and report ROC AUC.
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(y_true=test['outcome'], y_score=ypred)

print(f"Test AUC score: {score}")

Test AUC score: 0.747615303004287


In [96]:
# Display the raw model predictions for the test rows (one score per row).
ypred

array([0.46018401, 0.49095702, 0.42567723, ..., 0.35852514, 0.22526392,
       0.53888705])

In [97]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split rows, in their existing order, into train / validation / test parts.

    The last ``valid_fraction`` of the rows becomes the test set, the
    ``valid_fraction`` before that becomes the validation set, and everything
    earlier is the training set. Rows are not shuffled or sorted.

    Parameters
    ----------
    dataframe : sliceable sequence of rows (e.g. a pandas DataFrame)
        Anything that supports ``len()`` and ``obj[start:stop]`` row slicing.
    valid_fraction : float, default 0.1
        Fraction of rows used for the validation set and, separately, for the
        test set.

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``dataframe``.
    """
    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Slice with non-negative boundaries. With negative offsets a valid_size of
    # 0 (tiny input or valid_fraction=0) turns `[:-0]` into `[:0]` (empty
    # training set) and `[-0:]` into the whole input (everything in test).
    # Clamping at 0 keeps the previous results when the hold-out parts are
    # larger than the input.
    train_end = max(n_rows - 2 * valid_size, 0)
    valid_end = max(n_rows - valid_size, 0)

    train = dataframe[:train_end]
    # valid size == test size, last two sections of the data
    valid = dataframe[train_end:valid_end]
    test = dataframe[valid_end:]

    return train, valid, test


In [98]:
def train_model(train, valid):
    """Fit a LightGBM binary classifier on `train`, early-stopping on `valid`.

    Every column except 'outcome' is used as a feature and 'outcome' is the
    label. Prints a start marker and the validation ROC AUC, then returns the
    trained booster. Relies on `lgb` and `metrics` imported in earlier cells.
    """
    target = 'outcome'
    features = train.columns.drop(target)

    train_set = lgb.Dataset(train[features], label=train[target])
    valid_set = lgb.Dataset(valid[features], label=valid[target])

    params = dict(num_leaves=64, objective='binary', metric='auc', seed=7)

    print("Training model!")
    # Up to 1000 rounds, stopping once validation AUC stalls for 10 rounds.
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    valid_score = metrics.roc_auc_score(valid[target], booster.predict(valid[features]))
    print(f"Validation AUC score: {valid_score:.4f}")

    return booster

In [99]:
# Keep a handle on the label-encoded frame so later cells can start from it
# even after `data` is rebound.
baseline_data = data

# Baseline score: only the train and validation parts are needed here.
train, valid, _ = get_data_splits(dataframe=baseline_data)
bst = train_model(train=train, valid=valid)

Training model!
Validation AUC score: 0.7467


In [75]:
# Count Encoding
# Count encoding replaces each categorical value with the number of times it
# appears in the data the encoder was fitted on. For example, if the value "GB"
# occurred 10 times in the country feature, each "GB" is replaced with 10.
#
# Why is count encoding effective?
# Rare values tend to have similar counts (values like 1 or 2), so they can be
# grouped together at prediction time. Common values with large counts are
# unlikely to share the exact same count with other values, so the
# common/important values get their own grouping.

import category_encoders as ce

cat_features = ['category', 'currency', 'country']
train, valid, test = get_data_splits(baseline_data)

# Create the count encoder and learn the counts from the training set only
count_enc = ce.CountEncoder(cols=cat_features)
count_enc.fit(train[cat_features])

# Encode the train and validation sets and attach the result as *_count columns
train_counts = count_enc.transform(train[cat_features]).add_suffix('_count')
valid_counts = count_enc.transform(valid[cat_features]).add_suffix('_count')
train_encoded = train.join(train_counts)
valid_encoded = valid.join(valid_counts)

# Train a model on the baseline features plus the count-encoded features
bst = train_model(train_encoded, valid_encoded)

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


Training model!
Validation AUC score: 0.7491


In [100]:
# Fitting on the training split vs. fit_transform on the whole table
# The earlier count-encoding cell fits the encoder on the training split and
# then transforms each split. Here fit_transform learns the counts from every
# row of `ks` (including rows that later land in the validation and test
# splits) and encodes those same rows in one step. The counts are therefore
# not the same, so the validation score can differ between the two cells.
#
# Note: `data` is rebound below, so later cells that split `data` start from
# the baseline features plus these *_count columns.

import category_encoders as ce

cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

count_features = count_encoded.add_suffix("_count")
data = baseline_data.join(count_features)

# Train a model on the baseline features plus the count-encoded features
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

  X.loc[:, self.cols] = X.fillna(value=pd.np.nan)


Training model!
Validation AUC score: 0.7486


In [101]:
# Target encoding
# Target encoding replaces a categorical value with the average value of the
# target for that value of the feature. For example, the country value "CA" is
# replaced by the average outcome of all rows with country == 'CA' (around 0.28).
#
# This technique uses the targets to create new features, so including the
# validation or test data in the target encodings would be a form of target
# leakage. Instead, learn the target encodings from the training dataset only
# and apply them to the other datasets.
#
# Target encoding attempts to measure the population mean of the target for
# each level of a categorical feature. When there is less data per level, the
# estimated mean is further away from the "true" mean and has more variance.
# This matters most for very high-cardinality features (the usual example is an
# IP-address column, which this Kickstarter data does not have): with few rows
# per level the estimates are noisy, the model can lean heavily on them, and it
# then performs poorly on levels that were not in the training data.
#
# `data` was rebound in the previous count-encoding cell, so the splits below
# already carry the *_count columns in addition to the baseline features.

import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target (training rows only)
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7491


In [102]:
# CatBoost Encoding
# CatBoost encoding is similar to target encoding in that it is based on the
# target probability for a given value. However, with CatBoost the target
# probability for each row is calculated only from the rows before it.
#
# This technique uses the targets to create new features, so including the
# validation or test data in the encodings would be a form of target leakage.
# Instead, learn the encodings from the training dataset only and apply them to
# the other datasets.
#
# NOTE(review): the training rows below are encoded with transform() without
# passing the target; confirm in the category_encoders docs whether the
# "rows before it" statistic is applied in that case or only when y is given.

cat_features = ['category', 'currency', 'country']
cb_enc = ce.CatBoostEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)
cb_enc.fit(train[cat_features], train['outcome'])

# Encode both splits with the fitted encoder and attach the result as *_cb columns
train_cb = cb_enc.transform(train[cat_features]).add_suffix('_cb')
valid_cb = cb_enc.transform(valid[cat_features]).add_suffix('_cb')
train = train.join(train_cb)
valid = valid.join(valid_cb)

bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7492
