# Feature Engineering - Baseline Model

Create a baseline model whose performance we will try to improve as we do feature engineering.

In [1]:
import os
# Show the current working directory; the relative dataset path used when
# loading the CSV is resolved against it.
os.getcwd()

'/home/raxit/kaggle'

In [2]:
import pandas as pd

# Load the Kickstarter projects CSV, parsing both date columns as timestamps
ks_csv_path = './dataset/kickstarter_project/ks-projects-201801.csv'
date_columns = ['deadline', 'launched']
ks = pd.read_csv(ks_csv_path, parse_dates=date_columns)

In [3]:
# Preview the first five rows of the raw data
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
# Column dtypes and non-null counts (shows which columns have missing values)
ks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null datetime64[ns]
goal                378661 non-null float64
launched            378661 non-null datetime64[ns]
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: datetime64[ns](2), float64(5), int64(2), object(6)
memory usage: 43.3+ MB


In the Kickstarter dataset we can predict whether a Kickstarter project will succeed. The outcome of each project is given by the `state` column.

### Preparing Target column

Take a look at the unique values in column `state`

In [5]:
# Distinct project states, in order of first appearance
ks['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

There are six states: `failed`, `canceled`, `successful`, `live`, `undefined` and `suspended`. Let's count how many projects are in each state.

In [6]:
# Number of projects in each state (counted via the non-null `ID` column)
ks['ID'].groupby(ks['state']).count()

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

Since we are predicting if a project is going to be successful or not, 
- Drop projects that are "live"
- Count "successful" states as `outcome = 1`
- Combine every other state as `outcome = 0`

Since data cleaning is not the focus we will deal with it later.

In [7]:
# Remember how many projects are still live before filtering
is_live = ks['state'] == 'live'
live = ks.loc[is_live, 'ID'].count()

# Keep every project that is not live
ks = ks.loc[~is_live]

# Report the change in the live-project count (equals `live` when none remain)
remaining_live = ks.loc[ks['state'] == 'live', 'ID'].count()
print("Tuples dropped: {}".format(live - remaining_live))

Tuples dropped: 2799


In [8]:
# Binary target: 1 for "successful" projects, 0 for every other state
is_successful = ks['state'].eq('successful')
ks = ks.assign(outcome=is_successful.astype(int))

In [9]:
# Sanity check: print how many distinct target values exist, then the values
outcome_col = ks['outcome']
print(outcome_col.nunique())
print(outcome_col.unique())

2
[0 1]


In [10]:
# Drop state (disabled: `state` stays in `ks` for now; it is removed from the
# label-encoded features further down instead)
# ks = ks.drop(['state'], axis=1)

### Additional features

The `launched` column contains timestamps. We will split each timestamp into separate `hour`, `day`, `month` and `year` columns.

Time values are accessed through the `.dt` attribute on timestamp column.

In [11]:
# Break the launch timestamp into its calendar/clock components
launched_at = ks['launched'].dt
ks = ks.assign(
    hour=launched_at.hour,
    day=launched_at.day,
    month=launched_at.month,
    year=launched_at.year,
)
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


### Prep categorical columns

Label Encode categorical columns using scikit-learn's `LabelEncoder`

In [12]:
# How many distinct sub-categories are there?
ks['category'].nunique()

159

In [13]:
# Categorical columns: object dtype with fewer than `max_unique_values`
# distinct values
max_unique_values = 200
object_cols = [
    cname
    for cname, dtype in ks.dtypes.items()
    if dtype == 'object' and ks[cname].nunique() < max_unique_values
]

In [14]:
# Show which columns were selected for label encoding
object_cols

['category', 'main_category', 'currency', 'state', 'country']

In [15]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Each selected column is replaced by integer labels. The one encoder is refit
# on every column in turn, so afterwards it only holds the mapping of the last
# column in `object_cols`.
encoded = ks[object_cols].apply(lambda column: encoder.fit_transform(column))
encoded.head()

Unnamed: 0,category,main_category,currency,state,country
0,108,12,5,1,9
1,93,6,13,1,22
2,93,6,13,1,22
3,90,10,13,1,22
4,55,6,13,0,22


**Since** `main_category` **is derived from category, it is redundant. Therefore, we will drop it.**

**We will also drop** `state` **since it is our target. Encoding it into the data would be the worst case of target leakage.**

In [16]:
# Remove the redundant parent category and the column the target is derived from
encoded = encoded.drop(columns=['main_category', 'state'])
encoded.head()

Unnamed: 0,category,currency,country
0,108,5,9
1,93,13,22
2,93,13,22
3,90,13,22
4,55,13,22


### Create dataframe

Collect all features into a new dataframe. Since they share the same index, they can be easily joined.

In [17]:
# Numeric and date-part columns plus the target, taken straight from `ks`,
# joined on the index with the label-encoded categorical columns
base_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data = ks[base_cols].join(encoded)
data.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


### Create training, validation and testing datasets

- 80% training
- 10% validation
- 10% testing

In [18]:
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

# Positional split in row order: the last two chunks of `valid_size` rows are
# held out (validation first, then test); everything before them is training.
holdout_start = -2 * valid_size
train = data.iloc[:holdout_start]
valid = data.iloc[holdout_start:-valid_size]
test = data.iloc[-valid_size:]

Check if each fraction has the same proportion of target classes. Print the fraction of successful projects in each split of the dataset.

In [19]:
# The mean of the 0/1 target is the share of successful projects in a split
for split in (train, valid, test):
    success_rate = split['outcome'].mean()
    print(f"Outcome fraction = {success_rate:.4f}")

Outcome fraction = 0.3570
Outcome fraction = 0.3539
Outcome fraction = 0.3542


Each fraction has about 35% of target classes. This is probably because the data has been randomized beforehand.

The data can be shuffled and split with class proportions preserved using `sklearn.model_selection.StratifiedShuffleSplit`, but since the data is already randomized, we do not need to use it here.

## Create and train model

We'll be using a `LightGBM` model. It is a tree-based model that typically performs better than `XGBoost` and is also relatively faster to train. We will do a limited amount of hyperparameter tuning, if any. However, the model performance will improve as we do feature engineering.

In [20]:
import lightgbm as lgb

# Every column except the target is used as a model input
feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose_eval` keyword
# arguments of `lgb.train`, so passing them raises a TypeError on current
# releases. The equivalent callbacks work on old and new releases alike;
# `log_evaluation` was named `print_evaluation` before LightGBM 3.3.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
callbacks = [
    # Stop once the validation AUC has not improved for 10 consecutive rounds
    lgb.early_stopping(stopping_rounds=10),
    # Print the validation metric after every boosting round
    log_evaluation(period=1),
]
bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], callbacks=callbacks)

[1]	valid_0's auc: 0.694192
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.697026
[3]	valid_0's auc: 0.70002
[4]	valid_0's auc: 0.701645
[5]	valid_0's auc: 0.70601
[6]	valid_0's auc: 0.707926
[7]	valid_0's auc: 0.70945
[8]	valid_0's auc: 0.710437
[9]	valid_0's auc: 0.712047
[10]	valid_0's auc: 0.713417
[11]	valid_0's auc: 0.714648
[12]	valid_0's auc: 0.715791
[13]	valid_0's auc: 0.717431
[14]	valid_0's auc: 0.718216
[15]	valid_0's auc: 0.719381
[16]	valid_0's auc: 0.720884
[17]	valid_0's auc: 0.721617
[18]	valid_0's auc: 0.722789
[19]	valid_0's auc: 0.723307
[20]	valid_0's auc: 0.72501
[21]	valid_0's auc: 0.725721
[22]	valid_0's auc: 0.727384
[23]	valid_0's auc: 0.728268
[24]	valid_0's auc: 0.72865
[25]	valid_0's auc: 0.729141
[26]	valid_0's auc: 0.729552
[27]	valid_0's auc: 0.730459
[28]	valid_0's auc: 0.731047
[29]	valid_0's auc: 0.732472
[30]	valid_0's auc: 0.732801
[31]	valid_0's auc: 0.733166
[32]	valid_0's auc: 0.734182
[33]	valid_0's auc: 0.73

## Making predictions & evaluating the model

 An important thing to remember is that you can overfit to the validation data. This is why we need a test set that the model never sees until the final evaluation.

In [21]:
from sklearn import metrics

# Score the held-out test split, which was not passed to training as either
# the training set or the validation set
test_features = test[feature_cols]
ypred = bst.predict(test_features)
score = metrics.roc_auc_score(y_true=test['outcome'], y_score=ypred)

print(f"Test AUC score: {score}")

Test AUC score: 0.747615303004287
