# Analytics Vidhya Hackathon 3

Analytics Vidhya threw open a data hackathon for the 8th and 9th August weekend.
Details can be found [here](http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838)
This is my attempt to generate some benchmarks with no (or at most minimal) feature engineering.

In [2]:
#Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

In [3]:
#Read the train and test datasets
train = pd.read_csv("Data/Train.csv")
test = pd.read_csv("Data/Test.csv")

In [4]:
print train.shape, test.shape

(34397, 27) (22950, 27)


In [5]:
#Most of the columns are categorical variables.
#Convert them to numeric using sklearn's preprocessing module

In [6]:
frames = [train, test]
input = pd.concat(frames)

In [7]:
#Drop the following columns:
#ID, latitude, longtitude, city, zip

input.drop(input.columns[[0,1,2,4,6]], axis=1, inplace=True)

In [8]:
#sanity check
input.head()

Unnamed: 0,Var4,institute_state,Var8,institute_country,Var10,Var11,Var12,Var13,Var14,Var15,...,project_subject,subject_area,secondary_subject,secondary_area,Resource_Category,Resource_Sub_Category,Var23,Var24,Similar_Project_Valuation_other_institute,Project_Valuation
0,SA01,TX,HXYD,Harris,N,N,N,N,N,HAXXF,...,Environmental Science,Math & Science,,,Technology,O141,BB,D,253,202
1,SA03,IN,HXYD,Elkhart,N,N,N,N,N,HAXXC,...,Early Development,Applied Learning,Literacy,Literacy & Language,Supplies,E41,BB,D,246,0
2,SA02,NC,HXYC,Cabarrus,N,N,N,N,N,HAXXF,...,Literature & Writing,Literacy & Language,Literacy,Literacy & Language,Supplies,F51,BB,A,183,0
3,SA02,AL,HXYM,Cullman,N,N,N,N,N,HAXXF,...,Literature & Writing,Literacy & Language,Mathematics,Math & Science,Technology,N131,AA,A,226,916
4,SA01,SC,HXYF,Greenville,N,N,N,N,N,HAXXF,...,Literacy,Literacy & Language,ESL,Literacy & Language,Books,G61,AA,B,266,0


#Two approaches to solving this.
###*One approach*: Directly predict the project valuation
### *Another approach*: Since many of the project valuations are 0, first predict the probability that the project valuation is greater than 0 and then, for those projects, predict the valuation itself. This second approach therefore has two levels: classification first, then regression.


In [9]:
input['pv_code'] = 0
input.loc[input['Project_Valuation']>0, 'pv_code'] = 1

In [10]:
input.head()

Unnamed: 0,Var4,institute_state,Var8,institute_country,Var10,Var11,Var12,Var13,Var14,Var15,...,subject_area,secondary_subject,secondary_area,Resource_Category,Resource_Sub_Category,Var23,Var24,Similar_Project_Valuation_other_institute,Project_Valuation,pv_code
0,SA01,TX,HXYD,Harris,N,N,N,N,N,HAXXF,...,Math & Science,,,Technology,O141,BB,D,253,202,1
1,SA03,IN,HXYD,Elkhart,N,N,N,N,N,HAXXC,...,Applied Learning,Literacy,Literacy & Language,Supplies,E41,BB,D,246,0,0
2,SA02,NC,HXYC,Cabarrus,N,N,N,N,N,HAXXF,...,Literacy & Language,Literacy,Literacy & Language,Supplies,F51,BB,A,183,0,0
3,SA02,AL,HXYM,Cullman,N,N,N,N,N,HAXXF,...,Literacy & Language,Mathematics,Math & Science,Technology,N131,AA,A,226,916,1
4,SA01,SC,HXYF,Greenville,N,N,N,N,N,HAXXF,...,Literacy & Language,ESL,Literacy & Language,Books,G61,AA,B,266,0,0


In [11]:
#Replace missing values with NA
input.fillna("NA", inplace=True)

In [12]:
input = np.array(input)

In [13]:
#Convert categorical to numeric
for i in range(20):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(input[:,i]))
    input[:, i] = lbl.transform(input[:, i])

In [14]:
#Split the combined array back into train/test features and the two targets.
#Columns 0-20 are features, column 21 is read as the valuation target and
#column 22 as the binary flag target.
n_train = train.shape[0]

xgtrain = input[:n_train, 0:21].astype(float)
label_prob = input[:n_train, 22].astype(float)
label_pv = input[:n_train, 21].astype(float)
xgtest = input[n_train:, 0:21].astype(float)


# First Approach - Regression


##First model: xgboost

In [15]:
#xgboost settings shared by the xgboost regression models in this notebook.
#eval_metric is "rmse": the objective is a regression one, so a
#classification metric such as "auc" does not apply to it.
params = {
    "min_child_weight": 3,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 1,
    "silent": 0,
    "max_depth": 4,
    "nthread": 6,
    "gamma": 1,
    "objective": "reg:linear",
    "eta": 0.005,
    "base_score": 0.1,
    "eval_metric": "rmse",
    "seed": 123,
}

In [16]:
plst = list(params.items())
num_rounds = 600

In [17]:
xgtrain_pv = xgb.DMatrix(xgtrain, label=label_pv)
watchlist = [(xgtrain_pv, 'train')]
model_1_xgboost = xgb.train(plst, xgtrain_pv, num_rounds)
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(xgtest))

In [None]:
print max(model_1_predict), min(model_1_predict)
model_1_predict[model_1_predict<0] = 0

In [None]:
prediction_1 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_1_predict})
print prediction_1.head()
prediction_1.to_csv('Submission/model_1.csv', sep=",", index=False)

In [None]:
#rmse on leaderboard: 518

##Model 2: RandomForest

In [None]:
model_2_rf = RandomForestRegressor(n_estimators=400, max_depth=8, oob_score=True, n_jobs=6, random_state=123)
model_2_rf.fit(xgtrain, label_pv)
model_2_rf.oob_score_
model_2_predict = model_2_rf.predict(xgtest)

In [None]:
prediction_2 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_2_predict})
print prediction_2.head()
prediction_2.to_csv('Submission/model_2.csv', sep=",", index=False)

In [None]:
#rmse on leaderboard: 842

##Model 3: ExtraTrees

In [None]:
model_3_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_3_et.fit(xgtrain, label_pv)
model_3_predict = model_3_et.predict(xgtest)

In [None]:
prediction_3 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_3_predict})
print prediction_3.head()
prediction_3.to_csv('Submission/model_3.csv', sep=",", index=False)

In [None]:
#rmse on leaderboard: 497.2


# Creating dummies instead of numeric encoding of categorical variables



In [None]:
#Begin data prep
#Rebuild the combined frame from the raw train/test data so that the dummy
#encoding starts from the original (not label-encoded) columns.
frames = [train, test]
input2 = pd.concat(frames)

#Drop the following columns (by position):
#ID, latitude, longitude, city, zip
input2.drop(input2.columns[[0,1,2,4,6]], axis=1, inplace=True)

#Binary flag: 1 where Project_Valuation is positive, 0 otherwise
input2['pv_code'] = 0
input2.loc[input2['Project_Valuation']>0, 'pv_code'] = 1

#Replace missing values with the string "NA"
input2.fillna("NA", inplace=True)

#fillna also wrote "NA" into Project_Valuation wherever it was missing
#(presumably the test rows, which carry no target - confirm). Put 0 back so
#the column is numeric again and is not dummy-encoded below.
input2.loc[input2.Project_Valuation=="NA","Project_Valuation"] = 0
input2.Project_Valuation = input2.Project_Valuation.astype(float)

#One-hot encode every remaining non-numeric column
input2_dummy = pd.get_dummies(input2, dummy_na=True)

#Kept from the original cell: input2 is left as a plain array afterwards
input2 = np.array(input2)

#Features are all encoded columns except the target and the flag derived from
#it. They are removed by name, not by position, so the right two columns are
#dropped whatever order get_dummies puts the columns in.
features2 = input2_dummy.drop(['Project_Valuation', 'pv_code'], axis=1)
features2 = np.array(features2).astype(float)

#The first train.shape[0] rows are the training part, the rest the test part
xgtrain2 = features2[0:train.shape[0], :]
xgtest2 = features2[train.shape[0]:, :]

#End of data prep

## Repeating the above 3 models on xgtrain2

### Model 4: xgboost

In [None]:
xgtrain2_pv = xgb.DMatrix(xgtrain2, label=label_pv)
watchlist = [(xgtrain2_pv, 'train')]
model_4_xgboost = xgb.train(plst, xgtrain2_pv, num_rounds)

In [None]:
model_4_predict = model_4_xgboost.predict(xgb.DMatrix(xgtest2))
print max(model_4_predict), min(model_4_predict)
model_4_predict[model_4_predict<0] = 0

In [None]:
prediction_4 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_4_predict})
print prediction_4.head()
prediction_4.to_csv('Submission/model_4.csv', sep=",", index=False)

In [None]:
#rmse on leaderboard: 565

###Model 5: random forest

In [None]:
model_5_rf = RandomForestRegressor(n_estimators=400, max_depth=8, oob_score=True, n_jobs=6, random_state=123)
model_5_rf.fit(xgtrain2, label_pv)
print model_5_rf.oob_score_

In [None]:
model_5_predict = model_5_rf.predict(xgtest2)

prediction_5 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_5_predict})
print prediction_5.head()
prediction_5.to_csv('Submission/model_5.csv', sep=",", index=False)

In [None]:
#rmse on leaderboard: 509.58

##Model 6: extra trees

In [None]:
model_6_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_6_et.fit(xgtrain2, label_pv)
model_6_predict = model_6_et.predict(xgtest2)

In [None]:
prediction_6 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_6_predict})
print prediction_6.head()
prediction_6.to_csv('Submission/model_6.csv', sep=",", index=False)

In [None]:
#rmse 496

##Model 7: Ridge

In [None]:
model_7_ridge = linear_model.Ridge(alpha=0.01)
model_7_ridge.fit(xgtrain2, label_pv)
model_7_predict = model_7_ridge.predict(xgtest2)

In [None]:
prediction_7 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_7_predict})
print prediction_7.head()
prediction_7.to_csv('Submission/model_7.csv', sep=",", index=False)

In [None]:
#rmse of 505.99

##Model 8: Lasso

In [None]:
model_8_lasso = linear_model.Lasso()
model_8_lasso.fit(xgtrain2, label_pv)
model_8_predict = model_8_lasso.predict(xgtest2)

In [None]:
prediction_8 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_8_predict})
print prediction_8.head()
prediction_8.to_csv('Submission/model_8.csv', sep=",", index=False)

In [None]:
#rmse of 498

##Model 9: SGD Regressor

In [None]:
#SGD regression on the dummy-encoded features.
#Stochastic gradient descent is sensitive to feature scale, so the features
#are standardised first. The scaler is fitted on the training rows only and
#then applied unchanged to the test rows.
scaler_9 = preprocessing.StandardScaler().fit(xgtrain2)

#Fixed seed so repeated runs give the same model, as with the other seeded models
model_9_sgd = linear_model.SGDRegressor(random_state=123)
model_9_sgd.fit(scaler_9.transform(xgtrain2), label_pv)
model_9_predict = model_9_sgd.predict(scaler_9.transform(xgtest2))

#Write the submission file: one row per test ID
prediction_9 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_9_predict})
print(prediction_9.head())
prediction_9.to_csv('Submission/model_9.csv', sep=",", index=False)

In [None]:
#Omit this model. Horrible output

##Model 10: Perceptron

In [None]:
#Fit a Perceptron on the dummy-encoded features and write its test predictions.
#NOTE(review): linear_model.Perceptron is a classification estimator, whereas
#label_pv appears to hold the continuous Project_Valuation target (column 21 of
#the combined array) - confirm this model belongs in a regression benchmark.
model_10_perceptron = linear_model.Perceptron(penalty="l1", n_iter=250, random_state=123, n_jobs=6)
model_10_perceptron.fit(xgtrain2, label_pv)
model_10_predict = model_10_perceptron.predict(xgtest2)

#Write the submission file: one row per test ID
prediction_10 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_10_predict})
print prediction_10.head()
prediction_10.to_csv('Submission/model_10.csv', sep=",", index=False)

In [None]:
#Omit this model. Takes a long time to run

##Model 11: Kernel Ridge

In [None]:
#Kernel ridge regression with an RBF kernel on the dummy-encoded features
model_11_kr = KernelRidge(kernel='rbf', gamma=0.1, alpha=0.01).fit(xgtrain2, label_pv)
model_11_predict = model_11_kr.predict(xgtest2)

#Write the submission file: one row per test ID
prediction_11 = pd.DataFrame({'ID': test.ID})
prediction_11['Project_Valuation'] = model_11_predict
print(prediction_11.head())
prediction_11.to_csv('Submission/model_11.csv', sep=",", index=False)

In [None]:
#rmse: 753

##Model 12: SVR

In [None]:
#Support vector regression with an RBF kernel on the dummy-encoded features.
#Two fixes: the estimator is now SVR, as the section title says, instead of
#a second KernelRidge; and predictions come from model_12_svr, because
#model_12_kr was never defined and raised a NameError.
model_12_svr = SVR(kernel='rbf')
model_12_svr.fit(xgtrain2, label_pv)
model_12_predict = model_12_svr.predict(xgtest2)

#Write the submission file: one row per test ID
prediction_12 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_12_predict})
print(prediction_12.head())
prediction_12.to_csv('Submission/model_12.csv', sep=",", index=False)

In [None]:
#Not going to run this - due to time constraints. Kernel ridge itself took a long time to run.

#Stacking

Stacking the above 9 models to see if it produces better output

In [None]:
feature_1 = model_1_xgboost.predict(xgb.DMatrix(xgtrain))
feature_2 = model_2_rf.predict(xgtrain)
feature_3 = model_3_et.predict(xgtrain)
feature_4 = model_4_xgboost.predict(xgb.DMatrix(xgtrain2))
feature_5 = model_5_rf.predict(xgtrain2)
feature_6 = model_6_et.predict(xgtrain2)
feature_7 = model_7_ridge.predict(xgtrain2)
feature_8 = model_8_lasso.predict(xgtrain2)
feature_9 = model_11_kr.predict(xgtrain2)

In [None]:
#Train features
train_features = np.vstack((feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7, feature_8, feature_9))
#Need to transpose features
train_features = train_features.T

In [None]:
#Test features
test_features = np.vstack((model_1_predict, model_2_predict, model_3_predict, model_4_predict, model_5_predict, model_6_predict, model_7_predict, model_8_predict, model_11_predict ))
#Need to transpose features
test_features = test_features.T

##First stack model: Extra Trees

In [None]:
model_13_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_13_et.fit(train_features, label_pv)
model_13_predict = model_13_et.predict(test_features)

In [None]:
prediction_13 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_13_predict})
print prediction_13.head()
prediction_13.to_csv('Submission/model_13.csv', sep=",", index=False)

In [None]:
#rmse 696

##Second stack model: Ridge

In [None]:
#Stacked ridge regression on the base-model predictions; negatives are floored at 0 (in place)
model_14_ridge = linear_model.Ridge(alpha=0.01).fit(train_features, label_pv)
model_14_predict = model_14_ridge.predict(test_features)
np.maximum(model_14_predict, 0, out=model_14_predict)

#Write the submission file: one row per test ID
prediction_14 = pd.DataFrame({'ID': test.ID})
prediction_14['Project_Valuation'] = model_14_predict
print(prediction_14.head())
prediction_14.to_csv('Submission/model_14.csv', sep=",", index=False)

In [None]:
#rmse 740

##Third stack model: Model 3 (Extra Trees) + output features of all the 9 models

In [None]:
#Concatenating features + train/test dataset
xgtrain3 = np.hstack((xgtrain, train_features))
xgtest3  = np.hstack((xgtest, test_features))

print xgtrain3.shape, xgtest3.shape

In [None]:
model_15_et = ExtraTreesRegressor(n_estimators=750, max_depth=8, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
model_15_et.fit(xgtrain3, label_pv)
model_15_predict = model_15_et.predict(xgtest3)

prediction_15 = pd.DataFrame({'ID':test.ID, 'Project_Valuation':model_15_predict})
print prediction_15.head()
prediction_15.to_csv('Submission/model_15.csv', sep=",", index=False)

In [None]:
min(model_15_predict)

In [None]:
#rmse 693

##Fourth stack model: Ridge

In [None]:
#Ridge regression on original features plus base-model predictions; negatives are floored at 0 (in place)
model_16_ridge = linear_model.Ridge(alpha=0.01).fit(xgtrain3, label_pv)
model_16_predict = model_16_ridge.predict(xgtest3)
np.maximum(model_16_predict, 0, out=model_16_predict)

#Write the submission file: one row per test ID
prediction_16 = pd.DataFrame({'ID': test.ID})
prediction_16['Project_Valuation'] = model_16_predict
print(prediction_16.head())
prediction_16.to_csv('Submission/model_16.csv', sep=",", index=False)

In [None]:
#rmse 740