In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
import datetime
from nltk.corpus import stopwords


In [3]:
# Read the training and test tables from the local Data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv'))
# Keep the test-set project ids aside; `test` is later reduced to feature
# columns, and the ids are needed again when the submission is assembled.
project_id = test.loc[:, 'project_id']

In [4]:
# Columns holding Unix epoch seconds.
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

# Render each epoch value as a 'YYYY-MM-DD HH:MM:SS' string.
# The conversion is done in UTC (pd.to_datetime with unit='s') rather than
# with datetime.fromtimestamp, which uses the local timezone of whatever
# machine runs the notebook: local time makes the strings host-dependent and
# lets DST transitions distort the durations computed from these columns.
for frame in (train, test):
    for col in unix_cols:
        # astype('int64') mirrors the original int(k) coercion.
        frame[col] = (
            pd.to_datetime(frame[col].astype('int64'), unit='s')
            .dt.strftime('%Y-%m-%d %H:%M:%S')
        )

In [5]:
# Text columns and the names of the features derived from them.
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

# Character length of each text column.
# Missing values are kept as NaN instead of being stringified to 'nan'
# (which would report a length of 3); this matches the `.str`-based
# word-count and keyword features, which also yield NaN for missing text.
for frame in (train, test):
    for src_col, len_col in zip(cols_to_use, len_feats):
        text = frame[src_col]
        frame[len_col] = text.astype(str).str.len().where(text.notna())

In [6]:
# Number of whitespace-separated tokens in the name and description,
# computed the same way for both tables.
for frame in (train, test):
    for text_col in ('name', 'desc'):
        frame[text_col + '_count'] = frame[text_col].str.split().str.len()

In [7]:
# Keyword features: character length of the keyword string and the number
# of '-'-separated parts it contains.
for frame in (train, test):
    keywords = frame['keywords']
    frame['keywords_len'] = keywords.str.len()
    frame['keywords_count'] = keywords.str.split('-').str.len()

In [8]:
unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _parse_timestamp(text):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime object."""
    return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S')


# Turn the formatted timestamp strings back into datetime values so that
# they can be subtracted from one another.
for frame in (train, test):
    for col in unix_cols:
        frame[col] = frame[col].map(_parse_timestamp)

In [9]:
# Durations in seconds for the training set:
#   time1 = launch time minus creation time
#   time3 = deadline minus launch time
# `.dt.total_seconds()` always yields float seconds. The previous
# `.astype('timedelta64[s]')` only did so on older pandas; on pandas 2.x it
# keeps a timedelta dtype, which the following np.log call cannot handle.
time1 = (train['launched_at'] - train['created_at']).dt.total_seconds()
time3 = (train['deadline'] - train['launched_at']).dt.total_seconds()

In [10]:
# Store the natural log of each training-set duration as a feature.
for feature_name, seconds in (('time1', time1), ('time3', time3)):
    train[feature_name] = np.log(seconds)

In [11]:
# Durations in seconds for the test set:
#   time5 = launch time minus creation time
#   time6 = deadline minus launch time
# `.dt.total_seconds()` always yields float seconds. The previous
# `.astype('timedelta64[s]')` only did so on older pandas; on pandas 2.x it
# keeps a timedelta dtype, which the following np.log call cannot handle.
time5 = (test['launched_at'] - test['created_at']).dt.total_seconds()
time6 = (test['deadline'] - test['launched_at']).dt.total_seconds()

In [12]:
# Store the natural log of each test-set duration under the same feature
# names used for the training set.
for feature_name, seconds in (('time1', time5), ('time3', time6)):
    test[feature_name] = np.log(seconds)

In [13]:
# Categorical columns to convert to integer codes.
feat = ['disable_communication','country']

# One encoder per column, fitted on the train and test values together so
# that both tables share the same code for each category.
for column in feat:
    encoder = LabelEncoder().fit(list(train[column].values) + list(test[column].values))
    train[column] = encoder.transform(list(train[column]))
    test[column] = encoder.transform(list(test[column]))

In [14]:
# Replace the goal column with log(1 + goal) in both tables.
for frame in (train, test):
    frame['goal'] = np.log1p(frame['goal'])

In [15]:
# All descriptions (train rows first, then test rows) as one string Series.
# Missing descriptions are replaced with '' before the string cast;
# otherwise NaN would become the literal text 'nan', survive the cleaning
# steps and be counted as a real word by the vectorizer.
kickdesc = (
    pd.Series(train['desc'].tolist() + test['desc'].tolist())
    .fillna('')
    .astype(str)
)

In [16]:
def desc_clean(word):
    """Normalise a piece of text for tokenisation.

    Every run of non-word characters (punctuation, whitespace, ...) and every
    run of digits is replaced by a single space, then the text is lower-cased.

    Args:
        word: the text to clean (a ``str``).

    Returns:
        The cleaned, lower-cased string. Adjacent replaced runs each produce
        their own space, so the result may contain consecutive spaces.
    """
    # Raw string: '\W' and '\d' are invalid escapes in a normal literal.
    # The original third alternative '(\s+)' could never match, because
    # whitespace is already consumed by '\W+', so it is dropped.
    p1 = re.sub(pattern=r'\W+|\d+', repl=' ', string=word)
    return p1.lower()

# Run the cleaner over every description.
kickdesc = kickdesc.apply(desc_clean)

In [17]:
# English stopword list from NLTK, as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Tokenise each description on whitespace and drop the stopwords.
kickdesc = [
    [token for token in document.split() if token not in stop]
    for document in kickdesc
]

In [18]:
# Reduce every token to its English Snowball stem.
stemmer = SnowballStemmer(language='english')
kickdesc = [
    [stemmer.stem(token) for token in tokens]
    for tokens in kickdesc
]

In [19]:
# Drop tokens of one or two characters, then rebuild each description as a
# single space-separated string.
kickdesc = [
    ' '.join(token for token in tokens if len(token) > 2)
    for tokens in kickdesc
]

In [20]:
# Bag-of-words vectorizer capped at 650 vocabulary entries.
max_desc_features = 650
cv = CountVectorizer(max_features=max_desc_features)

In [21]:
# Fit the vectorizer on all descriptions and materialise the counts as a
# dense array. `.toarray()` returns a regular ndarray, whereas `.todense()`
# returns the discouraged np.matrix type; the values are identical.
alldesc = cv.fit_transform(kickdesc).toarray()

In [22]:
# Wrap the dense count matrix in a DataFrame (default integer column labels).
combine = pd.DataFrame(data=alldesc)

In [23]:
# Give every column a string label of the form 'variable_<original label>'.
combine.columns = ['variable_' + str(label) for label in combine.columns]

In [24]:
# `combine` holds the training rows first and the test rows after them;
# cut it back into the two parts at the training-set row count.
n_train_rows = train.shape[0]
train_text = combine.iloc[:n_train_rows]
# Renumber the test part from 0 so its index starts over.
test_text = combine.iloc[n_train_rows:].reset_index(drop=True)

In [25]:
# Hand-crafted feature columns kept for modelling.
cols_to_use = [
    'name_len', 'desc_len', 'keywords_len',
    'name_count', 'desc_count', 'keywords_count',
    'time1', 'time3',
    'goal',
]

In [26]:
# Label column, taken before `train` is cut down to the feature columns.
target = train.loc[:, 'final_status']

In [27]:
# Keep only the selected feature columns in both tables.
train, test = (frame.loc[:, cols_to_use] for frame in (train, test))

In [28]:
# Place the bag-of-words columns next to the hand-crafted features
# (column-wise concatenation, rows aligned on the index).
X_train = pd.concat(objs=[train, train_text], axis='columns')
X_test = pd.concat(objs=[test, test_text], axis='columns')

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the training rows for validation; the fixed random_state
# makes the split repeatable.
train_X, val_X, train_Y, val_Y = train_test_split(
    X_train,
    target,
    test_size=0.40,
    random_state=2017,
)

In [31]:
import xgboost as xgb

# Feature names shared by every DMatrix below.
preds = X_train.columns


def _as_dmatrix(features, labels=None):
    """Build an xgboost DMatrix that carries the shared feature names."""
    return xgb.DMatrix(features, labels, feature_names=preds)


dtrain = _as_dmatrix(train_X, train_Y)      # training part of the split
dval = _as_dmatrix(val_X, val_Y)            # validation part of the split
dtrain_all = _as_dmatrix(X_train, target)   # all labelled rows
dtest = _as_dmatrix(X_test)                 # unlabelled test rows



In [53]:
# xgboost training parameters: binary logistic objective, evaluated with
# the classification error metric.
params = dict(
    objective='binary:logistic',
    eval_metric='error',
    eta=0.025,
    max_depth=7,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=5,
    seed=2017,
)

In [54]:
# Datasets reported during training, in (matrix, display name) form.
watchlist = [(dtrain, 'Training'), (dval, 'Validation')]
# Upper bound on boosting rounds; early stopping may end training sooner.
num_rounds = 2500
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=50,   # stop after 50 rounds without improvement
    verbose_eval=25,            # print the metrics every 25 rounds
)

[0]	Training-error:0.311112	Validation-error:0.318598
Multiple eval metrics have been passed: 'Validation-error' will be used for early stopping.

Will train until Validation-error hasn't improved in 50 rounds.
[25]	Training-error:0.30359	Validation-error:0.31076
[50]	Training-error:0.29818	Validation-error:0.308263
[75]	Training-error:0.293386	Validation-error:0.304841
[100]	Training-error:0.290504	Validation-error:0.302321
[125]	Training-error:0.287359	Validation-error:0.300587
[150]	Training-error:0.285355	Validation-error:0.299293
[175]	Training-error:0.283244	Validation-error:0.298229
[200]	Training-error:0.280993	Validation-error:0.296634
[225]	Training-error:0.278712	Validation-error:0.295824
[250]	Training-error:0.277217	Validation-error:0.29564
[275]	Training-error:0.276076	Validation-error:0.294992
[300]	Training-error:0.274304	Validation-error:0.294414
[325]	Training-error:0.272716	Validation-error:0.293929
[350]	Training-error:0.270697	Validation-error:0.293189
[375]	Traini

In [44]:
# Retrain on all labelled rows for a fixed number of rounds.
# NOTE(review): 681 is presumably the best iteration found by an earlier
# early-stopping run, and 0.95 a scale-up factor for the larger training
# set — neither is derived in the notebook; confirm before reuse.
final_num_rounds = int(681/0.95)
model = xgb.train(params, dtrain_all, num_boost_round=final_num_rounds)

In [45]:
# Score the test rows with the current model.
prediction = model.predict(data=dtest)

In [46]:
# Submission table: one row per test project id with its model output.
sub = pd.DataFrame({
    'project_id': project_id,
    'final_status': prediction,
})

In [47]:
# Convert the model output to a hard 0/1 label: 1 when strictly above 0.5.
sub['final_status'] = (sub['final_status'] > 0.5).astype(int)

In [48]:
# Write the submission without the index column.
# Original author's note: 0.70 (presumably the score of this submission — TODO confirm).
sub.to_csv(path_or_buf="subm.csv", index=False)