In [37]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from currency_converter import CurrencyConverter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

### load data

In [38]:
# Read the raw train and test CSVs from the local input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)

In [39]:
# Report (rows, columns) for both frames; DataFrame.shape unpacks into the two slots.
print ('The train data has {} rows and {} columns'.format(*train.shape))
print ('The test data has {} rows and {} columns'.format(*test.shape))

The train data has 108129 rows and 14 columns
The test data has 63465 rows and 12 columns


### convert unix timestamps to datetime

In [40]:
import time  # NOTE(review): not used by any visible cell; kept in case other cells rely on it

# Columns that hold Unix epoch seconds.
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

# Vectorized conversion of epoch seconds to timezone-aware (UTC) datetimes.
# Unlike a per-row datetime.fromtimestamp call, missing values become NaT
# instead of raising.
for frame in (train, test):
    for col in unix_cols:
        frame[col] = pd.to_datetime(frame[col], unit='s', utc=True)

### convert goals to USD

In [41]:
# Converter with both fallback options enabled.
c = CurrencyConverter(fallback_on_missing_rate=True, fallback_on_wrong_date=True)


def _goal_to_usd(row):
    """Convert one row's goal from the row's currency to USD, dated at created_at."""
    return c.convert(row['goal'], row['currency'], 'USD', date=row['created_at'])


# Only these three columns are needed by the row-wise conversion.
_usd_inputs = ['goal', 'currency', 'created_at']
train['goal_usd'] = train[_usd_inputs].apply(_goal_to_usd, axis=1)
test['goal_usd'] = test[_usd_inputs].apply(_goal_to_usd, axis=1)


### create simple features

In [43]:
# Text columns to summarise, and the derived feature names for each of them.
cols_to_use = ['name', 'desc']
len_feats = [col + '_len' for col in cols_to_use]
count_feats = [col + '_count' for col in cols_to_use]

In [44]:
# Character length and space-separated piece count for each text column (train).
# Values go through str() first, so non-string entries are measured by their
# string form.
for src_col, len_col, count_col in zip(cols_to_use, len_feats, count_feats):
    as_text = train[src_col].apply(str)
    train[len_col] = as_text.apply(len)
    train[count_col] = as_text.apply(lambda text: len(text.split(' ')))

# Same two measures for the keywords column, which is split on '-'.
keywords_text = train['keywords'].apply(str)
train['keywords_len'] = keywords_text.apply(len)
train['keywords_count'] = keywords_text.apply(lambda text: len(text.split('-')))

In [45]:
# Character length and space-separated piece count for each text column (test).
# Values go through str() first, so non-string entries are measured by their
# string form.
for src_col, len_col, count_col in zip(cols_to_use, len_feats, count_feats):
    as_text = test[src_col].apply(str)
    test[len_col] = as_text.apply(len)
    test[count_col] = as_text.apply(lambda text: len(text.split(' ')))

# Same two measures for the keywords column, which is split on '-'.
keywords_text = test['keywords'].apply(str)
test['keywords_len'] = keywords_text.apply(len)
test['keywords_count'] = keywords_text.apply(lambda text: len(text.split('-')))

In [47]:
# capital letters in project name

train['name_capitals'] = train['name'].str.count('[A-Z]')
test['name_capitals'] = test['name'].str.count('[A-Z]')

# digits in project name

train['name_digits'] = train['name'].str.count('[0-9]')
test['name_digits'] = test['name'].str.count('[0-9]')

# 1 if the name contains at least one digit, else 0.
# A missing count compares False against 0, so it maps to 0 as well.
train['name_digits_any'] = (train['name_digits'] > 0).astype('int64')
test['name_digits_any'] = (test['name_digits'] > 0).astype('int64')

# Drop training rows with a missing capital-letter count; test is left unfiltered.
# .copy() detaches the filtered frame so later column assignments on `train`
# do not emit SettingWithCopyWarning.
train = train[pd.notnull(train['name_capitals'])].copy()

In [49]:
# number of digits in project description

train['desc_digits'] = train['desc'].str.count('[0-9]')
test['desc_digits'] = test['desc'].str.count('[0-9]')

# Drop training rows with a missing digit count; test is left unfiltered.
# .copy() detaches the filtered frame so later column assignments on `train`
# do not emit SettingWithCopyWarning.
train = train[pd.notnull(train['desc_digits'])].copy()

In [50]:
def _weekday_of(moment):
    """Return the weekday index of a datetime-like value (Monday is 0)."""
    return moment.weekday()


# (new feature column, source datetime column): campaign start and deadline.
_weekday_features = [
    ('launched_at_weekday', 'launched_at'),
    ('deadline_weekday', 'deadline'),
]

for frame in (train, test):
    for feature_col, stamp_col in _weekday_features:
        frame[feature_col] = frame[stamp_col].apply(_weekday_of)

In [51]:
# campaign length: days component of the span between launch and deadline

# The original used pd.tslib.Timedelta, but pd.tslib is no longer part of
# pandas, so the cell fails with AttributeError there. The .dt accessor gives
# the same days component without a per-row Python call.
# pd.to_timedelta guarantees a timedelta dtype even if the subtraction
# produced an object column.
for frame in (train, test):
    duration = pd.to_timedelta(frame['deadline'] - frame['launched_at'])
    frame['campaign_len'] = duration.dt.days

### encoding features

In [52]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. The encoder is fitted on
# the train and test values together so both frames share one mapping.
feat = ['disable_communication','country']
for cat_col in feat:
    train_values = list(train[cat_col])
    test_values = list(test[cat_col])
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[cat_col] = le.transform(train_values)
    test[cat_col] = le.transform(test_values)

### train/test split

In [66]:
# Features fed to the model. Candidates currently left out:
# name_len, desc_len, keywords_len, name_count, launched_at_weekday,
# deadline_weekday, name_digits, desc_digits.
cols_to_use = [
    'goal_usd',
    'campaign_len',
    'country',
    'desc_count',
    'keywords_count',
    'name_digits_any',
]

# 80% of the labelled rows go to the training split, the rest are held out.
# The seed is fixed; selecting the feature columns before splitting yields the
# same row partition as splitting first.
train_params, test_params, train_labels, test_labels = train_test_split(
    train[cols_to_use],
    train['final_status'],
    train_size=0.8,
    random_state=1,
)

### classify

In [67]:
# Fit AdaBoost on the training split and show its score on the held-out split.
# random_state is fixed so the reported score is reproducible across runs.
clf = AdaBoostClassifier(n_estimators=200, random_state=1)
clf.fit(train_params, train_labels)
clf.score(test_params, test_labels)

0.6910982658959538

### predict & write to csv

In [62]:
# Report (rows, columns) of the test frame after feature engineering.
print ('The test data has {} rows and {} columns'.format(*test.shape))

The test data has 63465 rows and 26 columns


In [63]:
# Refit on every labelled row, then predict the unlabelled test set.
# random_state is fixed so the submission is reproducible across runs.
clf = AdaBoostClassifier(n_estimators=200, random_state=1)
clf.fit(train[cols_to_use], train['final_status'])
ada_pred = clf.predict(test[cols_to_use])

# Submission frame: project id followed by the predicted status. `columns=`
# pins the column order regardless of how the dict is ordered.
adaSub = pd.DataFrame(
    {'project_id': test['project_id'], 'final_status': ada_pred},
    columns=['project_id', 'final_status'],
)
adaSub.to_csv("./output/adaBoost.csv",index = False)