In [1]:
# load libraries

import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

# Allow up to 100 characters per cell when frames are displayed, so the text
# columns are truncated later than with the pandas default.
pd.set_option('display.max_colwidth',100)

In [29]:
# Load the training and test sets from the local data/ directory.
train_path = 'data/train.csv'
test_path = 'data/test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# (rows, columns) of each frame.  print() with one argument produces the same
# output on Python 2 and Python 3; the bare print statement is Python 2 only.
print(train.shape)
print(test.shape)

(108129, 14)
(63465, 12)


In [4]:
# Column names of the training frame.
train.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'backers_count',
       u'final_status'],
      dtype='object')

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0


### Currency and Country

In [6]:
# Share of each final_status value within every country.
train.groupby('country')['final_status'].value_counts(normalize=True)

country  final_status
AU       0               0.817553
         1               0.182447
CA       0               0.775964
         1               0.224036
DE       0               1.000000
DK       0               0.775510
         1               0.224490
GB       0               0.713405
         1               0.286595
IE       0               0.792793
         1               0.207207
NL       0               0.875177
         1               0.124823
NO       0               0.842105
         1               0.157895
NZ       0               0.774648
         1               0.225352
SE       0               0.812500
         1               0.187500
US       0               0.667804
         1               0.332196
Name: final_status, dtype: float64

In [7]:
# Share of each final_status value within every currency.
train.groupby('currency')['final_status'].value_counts(normalize=True)

currency  final_status
AUD       0               0.817553
          1               0.182447
CAD       0               0.775964
          1               0.224036
DKK       0               0.775510
          1               0.224490
EUR       0               0.864137
          1               0.135863
GBP       0               0.713405
          1               0.286595
NOK       0               0.842105
          1               0.157895
NZD       0               0.774648
          1               0.225352
SEK       0               0.812500
          1               0.187500
USD       0               0.667804
          1               0.332196
Name: final_status, dtype: float64

In [69]:
# One-hot encode currency: one indicator column per currency code in train.
train_currency_dummies = pd.get_dummies(train['currency'])

In [71]:
# Preview the one-hot currency columns.
train_currency_dummies.head()

Unnamed: 0,AUD,CAD,DKK,EUR,GBP,NOK,NZD,SEK,USD
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1


In [75]:
# Append the one-hot currency columns to train.  Copies left behind by an
# earlier run of this cell are dropped first, so re-running the cell does not
# produce duplicate currency columns.
stale_cols = [c for c in train_currency_dummies.columns if c in train.columns]
train = pd.concat([train.drop(stale_cols, axis=1), train_currency_dummies],axis = 1)

In [77]:
# One-hot encode currency for test, aligned to the training dummy columns:
# a currency that never occurs in test becomes an all-zero column, and a
# currency unseen in train is dropped.  Without the alignment a missing
# currency would make the later test[cols] selection fail with a KeyError.
test_currency_dummies = pd.get_dummies(test['currency']).reindex(
    columns=train_currency_dummies.columns, fill_value=0)

# Drop copies left behind by an earlier run so the cell can be re-run safely.
stale_cols = [c for c in test_currency_dummies.columns if c in test.columns]
test = pd.concat([test.drop(stale_cols, axis=1), test_currency_dummies],axis = 1)

In [102]:
# Names of the one-hot currency columns, reused later when selecting features.
currency_list = list(train_currency_dummies.columns)
# print() with one argument works on both Python 2 and Python 3.
print(currency_list)

['AUD', 'CAD', 'DKK', 'EUR', 'GBP', 'NOK', 'NZD', 'SEK', 'USD']


Country and currency show almost the same success rates, so either one could serve as a feature; I chose currency arbitrarily.

### Time Data

In [30]:
# Number of seconds in one day, as a float so that dividing by it yields
# fractional days.
days = float(24 * 60 * 60)

In [31]:
# d_s: True when the state changed before the deadline.
train['d_s'] = train['state_changed_at'].lt(train['deadline'])
# l_c: days between creation and launch.
train['l_c'] = train['launched_at'].sub(train['created_at']).div(days)
# s_l: days between launch and the state change.
train['s_l'] = train['state_changed_at'].sub(train['launched_at']).div(days)

In [32]:
# Count of d_s True/False within each final_status class.
train.groupby('final_status')['d_s'].value_counts()

final_status  d_s  
0             False    67688
              True      5880
1             False    34561
Name: d_s, dtype: int64

In [44]:
# Outcome counts for projects whose launch-to-state-change span exceeds 89 days.
train.loc[train['s_l'] > 89, 'final_status'].value_counts()

0    581
1    225
Name: final_status, dtype: int64

In [34]:
# Mean creation-to-launch gap (in days) per final_status class.
train.groupby('final_status')['l_c'].mean()

final_status
0    35.771002
1    41.982384
Name: l_c, dtype: float64

In [35]:
# Overall class balance of the target.
train['final_status'].value_counts(normalize=True)

0    0.680373
1    0.319627
Name: final_status, dtype: float64

In [22]:
# Convert the unix-epoch columns of both frames to 'YYYY-MM-DD HH:MM:SS' text.
unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _epoch_to_text(k):
    """Format one epoch-seconds value as a 'YYYY-MM-DD HH:MM:SS' string.

    datetime.fromtimestamp() interprets the value in the machine's local
    timezone, so the resulting text depends on where the notebook is run.
    """
    return datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')


for frame in (train, test):
    for x in unix_cols:
        # Only numeric columns still hold raw epoch values.  Leaving every
        # other column untouched makes the cell safe to re-run: int() on an
        # already formatted string would raise ValueError.
        if frame[x].dtype.kind in 'iuf':
            frame[x] = frame[x].apply(_epoch_to_text)


In [78]:
# d_s for test: True when the state changed before the deadline.
# NOTE(review): if the unix-time conversion cell has already run, both columns
# hold 'YYYY-MM-DD HH:MM:SS' strings here and this is a string comparison;
# fixed-width timestamps in that layout should sort chronologically - confirm.
test['d_s'] = (test['state_changed_at'] < test['deadline'])

### Backers_count and Goal

In [47]:
# Mean number of backers per final_status class.
train.groupby('final_status')['backers_count'].mean()

final_status
0     29.341534
1    323.981800
Name: backers_count, dtype: float64

In [49]:
# Mean funding goal per final_status class.
train.groupby('final_status')['goal'].mean()

final_status
0    49372.588911
1     9806.654217
Name: goal, dtype: float64

### Text Data in terms of Length and Count

In [50]:
# Character-length features for the name and desc columns of both frames.
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

for text_col, len_col in zip(cols_to_use, len_feats):
    for frame in (train, test):
        # Values go through str() first, so non-string entries are measured
        # by the length of their text representation.
        frame[len_col] = frame[text_col].map(lambda value: len(str(value)))

In [51]:
# Whitespace-separated token counts for name and desc, train first then test.
for frame in (train, test):
    for text_col in ('name', 'desc'):
        frame[text_col + '_count'] = frame[text_col].str.split().str.len()

In [52]:
# Keyword features for both frames: character length of the keywords string
# and the number of '-'-separated parts.
for frame in (train, test):
    keywords = frame['keywords']
    frame['keywords_len'] = keywords.str.len()
    frame['keywords_count'] = keywords.str.split('-').str.len()

In [53]:
# Mean name length (characters) per final_status class.
train.groupby('final_status')['name_len'].mean()

final_status
0    50.562378
1    54.154018
Name: name_len, dtype: float64

In [54]:
# Mean name token count per final_status class.
train.groupby('final_status')['name_count'].mean()

final_status
0    5.552748
1    5.977605
Name: name_count, dtype: float64

In [55]:
# Mean description length (characters) per final_status class.
train.groupby('final_status')['desc_len'].mean()

final_status
0    133.618353
1    135.113886
Name: desc_len, dtype: float64

In [56]:
# Mean description token count per final_status class.
train.groupby('final_status')['desc_count'].mean()

final_status
0    19.537724
1    19.479326
Name: desc_count, dtype: float64

In [59]:
# Mean keywords length (characters) per final_status class.
train.groupby('final_status')['keywords_len'].mean()

final_status
0    30.965148
1    33.123868
Name: keywords_len, dtype: float64

In [58]:
# Mean number of '-'-separated keyword parts per final_status class.
train.groupby('final_status')['keywords_count'].mean()

final_status
0    5.293973
1    5.707300
Name: keywords_count, dtype: float64

None of these length and count features looks useful: their means are nearly identical for the two classes.

### Text Features

In [60]:
# Build one series holding every description, train rows first and test rows
# after them, with all values converted to str.
all_desc_values = train['desc'].tolist() + test['desc'].tolist()
kickdesc = pd.Series(all_desc_values).astype(str)

In [61]:
def desc_clean(word):
    """Replace punctuation, digits and whitespace with spaces, then lower-case.

    Every run of non-word characters, every run of digits and every run of
    whitespace in ``word`` is replaced by one space.  Adjacent runs of
    different kinds are replaced separately, so the result may contain
    consecutive spaces.

    Parameters
    ----------
    word : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text, lower-cased.
    """
    # Raw string: \W, \d and \s are regex escapes, not Python string escapes.
    # Without the r prefix newer Python versions warn about invalid escapes.
    p1 = re.sub(pattern=r'(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    return p1.lower()

# Clean every description with desc_clean.
kickdesc = kickdesc.apply(desc_clean)

In [62]:
# Tokenise each cleaned description, drop English stop words, stem what is
# left, discard stems of one or two characters and re-join with single spaces.
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer(language='english')


def _stem_description(text):
    """Return ``text`` with stop words removed and the other tokens stemmed."""
    stems = (stemmer.stem(token) for token in text.split() if token not in stop)
    return ' '.join(stem for stem in stems if len(stem) > 2)


kickdesc = [_stem_description(text) for text in kickdesc]

In [63]:
# Cap the vocabulary at 650 terms: a larger feature count ran into a memory
# error when the count matrix was made dense.
cv = CountVectorizer(max_features=650)

In [64]:
# Fit the vectorizer on all descriptions and densify the sparse count matrix.
# toarray() returns a plain ndarray, whereas todense() returns the legacy
# numpy.matrix type; pd.DataFrame accepts either.
alldesc = cv.fit_transform(kickdesc).toarray()

In [65]:
# Wrap the count matrix in a DataFrame whose columns are named
# variable_<position> (variable_0, variable_1, ...).
combine = pd.DataFrame(alldesc).rename(columns=lambda idx: 'variable_' + str(idx))

### Data Adjusting for Training

In [66]:
# Split the text features back into train and test parts: the first
# train.shape[0] rows go to train, the remainder to test, and the test part
# gets a fresh 0-based index.
n_train_rows = train.shape[0]

train_text = combine[:n_train_rows]
test_text = combine[n_train_rows:].reset_index(drop=True)

In [103]:
# Non-text feature columns: goal, the d_s flag and the one-hot currency columns.
cols = ['goal','d_s'] + currency_list
# print() with one argument works on both Python 2 and Python 3.
print(cols)

['goal', 'd_s', 'AUD', 'CAD', 'DKK', 'EUR', 'GBP', 'NOK', 'NZD', 'SEK', 'USD']


In [104]:
# Select the non-text feature columns from each frame.
X_train = train[cols]
X_test = test[cols]

In [105]:
# Join the non-text features with the text-count features column-wise.
# Both matrices are rebuilt from train[cols] / test[cols] instead of from the
# current X_train / X_test, so re-running this cell cannot stack a second
# copy of the text columns onto an already combined matrix.
X_train = pd.concat([train[cols], train_text],axis=1)
X_test = pd.concat([test[cols], test_text],axis=1)

In [106]:
# (rows, columns) of the feature matrices.  print() with one argument works on
# both Python 2 and Python 3; the bare print statement is Python 2 only.
print(X_train.shape)
print(X_test.shape)

(108129, 661)
(63465, 661)


In [108]:
# Label column used as the training target.
target = train['final_status']

### Fitting and Training

In [116]:
# Model on the text-count features only.  This rebinds X_train / X_test, so
# the goal, d_s and currency columns combined into them earlier are not
# passed to the model.
X_train, X_test = train_text, test_text

In [117]:
# Wrap the feature matrices for xgboost; only the training matrix has labels.
dtrain = xgb.DMatrix(X_train, label=target)
dtest = xgb.DMatrix(X_test)

In [118]:
# Booster settings for the binary classifier.
params = dict(
    objective='binary:logistic',
    eval_metric='error',
    eta=0.025,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=5,
)

In [119]:
# Train for a fixed 1000 boosting rounds on the full training matrix; no
# evaluation set is passed to this call.
bst_train = xgb.train(params, dtrain, num_boost_round=1000)

In [120]:
# Model output for the test matrix; with the binary:logistic objective these
# are probabilities (the values shown further down lie between 0 and 1).
p_test = bst_train.predict(dtest)

In [121]:
# Submission frame: project id next to the model output for each test row.
sub = pd.DataFrame(
    {'project_id': test['project_id'], 'final_status': p_test},
    columns=['project_id', 'final_status'],
)
sub.head()

Unnamed: 0,project_id,final_status
0,kkst917493670,0.260795
1,kkst1664901914,0.294762
2,kkst925125077,0.358842
3,kkst1427645275,0.284261
4,kkst1714249266,0.314012


In [122]:
# Turn the predicted values into hard labels (1 above 0.5, otherwise 0), then
# show the share of each predicted class.
sub['final_status'] = (sub['final_status'] > 0.5).astype('int64')
sub['final_status'].value_counts(normalize = True)

0    0.950965
1    0.049035
Name: final_status, dtype: float64

In [123]:
# Write the submission file without the index column.
# NOTE(review): the trailing 0.70 is presumably the score this submission
# received - confirm.
sub.to_csv("sub1_xgb.csv",index=False) #0.70

In [125]:
# Share of each predicted class in the submission (same expression as the one
# evaluated right after thresholding).
sub['final_status'].value_counts(normalize = True)

0    0.950965
1    0.049035
Name: final_status, dtype: float64

# USING DL

In [127]:
# The free-text columns, previewed side by side.
text_cols = ['name','desc','keywords']
train[text_cols].head()

Unnamed: 0,name,desc,keywords
0,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,drawing-for-dollars
1,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois
2,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",mr-squiggles
3,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,help-me-write-my-second-novel
4,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",support-casting-my-sculpture-in-bronze


In [129]:
# Column names of train after the engineered features were added.
train.columns

Index([u'project_id', u'name', u'desc', u'goal', u'keywords',
       u'disable_communication', u'country', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'backers_count',
       u'final_status', u'd_s', u'l_c', u's_l', u'name_len', u'desc_len',
       u'name_count', u'desc_count', u'keywords_len', u'keywords_count',
       u'AUD', u'CAD', u'DKK', u'EUR', u'GBP', u'NOK', u'NZD', u'SEK', u'USD'],
      dtype='object')

In [131]:
# Per-row difference between the keywords length and the name length.
ser = train['keywords_len'] - train['name_len']

In [138]:
# Show name next to keywords for the first 150 rows.
# NOTE(review): keywords looks like a lower-cased, hyphen-joined form of name
# with punctuation removed - confirm before treating it as a separate signal.
train[['name','keywords']].head(150)

Unnamed: 0,name,keywords
0,drawing for dollars,drawing-for-dollars
1,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois
2,Mr. Squiggles,mr-squiggles
3,Help me write my second novel.,help-me-write-my-second-novel
4,Support casting my sculpture in bronze,support-casting-my-sculpture-in-bronze
5,daily digest,daily-digest
6,iGoozex - Free iPhone app,igoozex-free-iphone-app
7,Drive A Faster Car 2.0,drive-a-faster-car-20
8,"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",lostles-at-tinys-giant
9,Choose Your Own Adventure - A Robot Painting Series,choose-your-own-adventure-a-robot-painting-series
