## Funding Successful Projects - Hackerearth Contest

Link: [Funding Successful Projects](https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-2/machine-learning/funding-successful-projects/)

**Author: Sethu Iyer**

In [1]:
#1. Import relevant libraries and load the data
import pandas as pd
import numpy as np
from datetime import datetime
# Read both competition splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# DataFrame.shape is a (rows, columns) pair, so it unpacks straight into the two placeholders.
print('The train data has {} rows and {} columns'.format(*train.shape))
print('The test data has {} rows and {} columns'.format(*test.shape))

# List the training columns.
print(train.columns)

The train data has 108129 rows and 14 columns
The test data has 63465 rows and 12 columns
Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'backers_count',
       'final_status'],
      dtype='object')


In [2]:
#2. Visualize train data first
# Peek at the first two rows, then at the first project description on its own.
print(train.head(2))
print(train['desc'].head(1))

# Gap between the baseline model's validation accuracy and its test accuracy.
# Later cells subtract `difference` from their cross-validation scores.
baseline_validation_accuracy = 0.663955515767
baseline_test_accuracy = 0.652
difference = baseline_validation_accuracy - baseline_test_accuracy

       project_id                                               name  \
0  kkst1451568084                                drawing for dollars   
1  kkst1474482071  Sponsor Dereck Blackburn (Lostwars) Artist in ...   

                                                desc   goal  \
0  I like drawing pictures. and then i color them...   20.0   
1  I, Dereck Blackburn will be taking upon an inc...  300.0   

                                            keywords  disable_communication  \
0                                drawing-for-dollars                  False   
1  sponsor-dereck-blackburn-lostwars-artist-in-re...                  False   

  country currency    deadline  state_changed_at  created_at  launched_at  \
0      US      USD  1241333999        1241334017  1240600507   1240602723   
1      US      USD  1242429000        1242432018  1240960224   1240975592   

   backers_count  final_status  
0              3             1  
1              2             0  
0    I like drawing pict

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation
from sklearn.pipeline import Pipeline

# Text-only model: predict `final_status` from the free-text project description.
#
# Missing descriptions are replaced with an empty string. The earlier
# `train.dropna(...)` / `test.dropna(...)` calls discarded their return value,
# so no rows were ever removed, and `astype('U')` then turned each NaN into the
# literal document "nan". Rows are deliberately NOT dropped: every test row
# needs a prediction for the submission.
X_train = train['desc'].fillna('').values.astype('U')
target = train['final_status'].values.astype(np.int32)
X_test = test['desc'].fillna('').values.astype('U')

# Two bag-of-words pipelines (token counts -> TF-IDF weighting) that differ
# only in the final classifier: a linear model trained with SGD (hinge loss)
# and a multinomial Naive Bayes.
# NOTE(review): `n_iter` and `sklearn.cross_validation` are legacy scikit-learn
# APIs — confirm the installed version before upgrading the environment.
text_clf_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
text_clf_nvb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])

# NOTE(review): with hard voting over two estimators and weights [1, 2], the
# Naive Bayes vote outweighs the SGD vote whenever the two disagree, so the
# ensemble output appears to equal the Naive Bayes output — confirm intended.
eclf1 = VotingClassifier(
    estimators=[('sgd', text_clf_sgd), ('nvb', text_clf_nvb)],
    voting='hard',
    weights=[1, 2],
)

# Mean 10-fold stratified CV score, shifted down by `difference` (defined in an
# earlier cell) as a rough estimate of the score on the test set.
kfold = cross_validation.StratifiedKFold(target, 10)
cv_scores = cross_validation.cross_val_score(eclf1, X_train, target, cv=kfold, n_jobs=-1)
print(np.mean(cv_scores) - difference)

# Refit on the full training set and predict the test descriptions.
eclf1.fit(X_train, target)
eclf_pred = eclf1.predict(X_test)



0.678285012489


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Metadata-only model: predict `final_status` from goal, disable_communication
# and country. `target`, `difference` and `cross_validation` come from earlier cells.

# Integer-encode the categorical columns in place. Each encoder is fitted on
# the combined train + test values so both frames share one mapping and the
# test transform never meets an unseen label.
feat = ['disable_communication', 'country']
for x in feat:
    le = LabelEncoder()
    le.fit(list(train[x].values) + list(test[x].values))
    train[x] = le.transform(list(train[x].values))
    test[x] = le.transform(list(test[x].values))

colns_to_use = ['goal', 'disable_communication', 'country']
X_train = train[colns_to_use]
X_test = test[colns_to_use]

# random_state is fixed so the CV score and the predictions are reproducible
# across re-runs; without it every run grows a different forest.
rfc = RandomForestClassifier(n_estimators=300, max_depth=8, max_features=2, random_state=42)

# Mean 10-fold stratified CV score, shifted down by `difference`.
kfold = cross_validation.StratifiedKFold(target, 10)
cv_scores = cross_validation.cross_val_score(rfc, X_train, target, cv=kfold, n_jobs=-1)
print(np.mean(cv_scores) - difference)

# Refit on the full training set and predict the test rows.
rfc.fit(X_train, target)
rfc_pred = rfc.predict(X_test)

0.666428678982


In [5]:
# Class balance of the label: more likely to get rejected (0) than selected (1).
status_counts = train['final_status'].value_counts()
print(status_counts)

0    73568
1    34561
Name: final_status, dtype: int64


In [6]:
# Combine the two models. The element-wise product of the 0/1 predictions is 1
# only when BOTH models predict 1 (a logical AND) and 0 otherwise.
# NOTE(review): the earlier comment here described an OR ("if any one of them
# becomes 1"), which is not what the multiplication computes — confirm that
# AND is the intended combination rule.
final_pred = rfc_pred * eclf_pred

# Two-column submission with project_id first, written without the index.
submission = pd.DataFrame(
    {'project_id': test['project_id'], 'final_status': final_pred},
    columns=['project_id', 'final_status'],
)
submission.to_csv("pipeline_plus_goal.csv", index=False)