In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import os
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
%matplotlib inline

In [2]:
train = pd.read_csv('/Users/skylark/Desktop/upvotes/train.csv')
test = pd.read_csv('/Users/skylark/Desktop/upvotes/test.csv')

In [3]:
data_x = train.drop(['ID', 'Tag', 'Upvotes', 'Username'], axis=1)

In [6]:
data_x

Unnamed: 0,Reputation,Answers,Views
0,3942.0,2.0,7855.0
1,26046.0,12.0,55801.0
2,1358.0,4.0,8067.0
3,264.0,3.0,27064.0
4,4271.0,4.0,13986.0
...,...,...,...
330040,36.0,2.0,1063.0
330041,1649.0,2.0,23319.0
330042,6178.0,2.0,2453.0
330043,89.0,2.0,2107.0


In [4]:
data_test = test.drop(['ID', 'Tag', 'Username'], axis=1)

In [10]:
data_ind = pd.concat([data_x, data_test])

### Normalisation

In [5]:
def normalise(data):
    return (data-np.mean(data))/np.std(data)

In [8]:
def min_max(data):
    return (data - np.min(data))/(np.max(data)-np.min(data))

In [11]:
def log_reduction(data):
    d = data.copy()
    d[d>0] = np.log(d[d>0])
    return d

In [79]:
def reverse_log(data):
    d = data.copy()
    d[d>0] = np.exp(d[d>0])
    d[d<0] = 0
    return d

In [136]:
def normalise_target(data):
    return(data - np.mean(train.Upvotes))/np.std(train.Upvotes)

def rev_normal_target(data):
    return (data*np.std(train.Upvotes)) + np.mean(train.Upvotes)

### Feature Engineering

In [22]:
y = train.Upvotes

In [33]:
y = y+2e-7

In [40]:
y.mean()

337.50535855818964

In [44]:
data_normalise = normalise(data_ind)

In [45]:
data_min_max = min_max(data_ind)

In [46]:
data_log = log_reduction(data_ind)

### Result

In [132]:
def result_df(result):
    result_df = pd.DataFrame(columns=['ID', 'Upvotes'])
    result_df['ID'] = test.ID
    result_df['Upvotes'] = result
    return result_df.set_index('ID', drop=True)

## Models

In [51]:
from sklearn.metrics import r2_score, mean_squared_error
def rmse(true, predicted):
    return np.sqrt(mean_squared_error(true, predicted))

#### Linear Model

In [188]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [117]:
train_x_lin, test_x_lin, train_y_lin, test_y_lin = train_test_split(data_normalise[:330045], train.Upvotes, test_size=0.3, random_state=100)

In [118]:
lin_mod = LinearRegression()

In [137]:
lin_mod.fit(train_x_lin, normalise_target(train_y_lin))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [138]:
pred = lin_mod.predict(test_x_lin)

In [141]:
rev_normal_target(pred)

array([ 337.50535836,  599.25126436,  337.50535836, ...,  337.50535836,
       1284.34125989,  337.50535836])

In [139]:
pred[pred<0] = 0

In [142]:
rmse(test_y_lin, rev_normal_target(pred).astype('int32'))

2938.8910684959224

In [126]:
res = lin_mod.predict(data_normalise[330045:]).astype('int32')
res[res<0] = 0

In [134]:
result_df(res).to_csv('/Users/skylark/Desktop/upvotes/result.csv')

In [133]:
result_df(res)

Unnamed: 0_level_0,Upvotes
ID,Unnamed: 1_level_1
366953,356
71864,355
141692,19
316833,0
440445,571
...,...
47187,0
329126,162
282334,568
386629,0


#### RandomForest Reg

In [143]:
from sklearn.ensemble import RandomForestRegressor

In [144]:
    train_x_rdm, test_x_rdm, train_y_rdm, test_y_rdm = train_test_split(data_ind[:330045], train.Upvotes, test_size=0.3, random_state=100)

In [145]:
rdm_mod = RandomForestRegressor()

In [146]:
rdm_mod.fit(train_x_rdm, train_y_rdm)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [150]:
pred = rdm_mod.predict(test_x_rdm).astype('int32')

In [152]:
rmse(test_y_rdm, pred)

1182.1454505394156

In [157]:
result_df(rdm_mod.predict(data_ind[330045:]).astype('int32')).to_csv('/Users/skylark/Desktop/upvotes/result.csv')

In [160]:
params = {'n_estimators':range(10,200), 'max_depth': range(10, 200, 10), 'max_features':['auto', 'sqrt', 0.2, 0.5]}

In [161]:
rcv = RandomizedSearchCV(rdm_mod, param_distributions=params, cv=5, scoring='r2')
rcv.fit(train_x_rdm, train_y_rdm)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=10, n_jobs=None,
                                                   oob_score=False,
                                                   random_state=None,

In [162]:
rcv.best_params_

{'n_estimators': 25, 'max_features': 'auto', 'max_depth': 20}

In [163]:
rcv.best_score_

0.842093548057749

In [164]:
rdm_mod = RandomForestRegressor(n_estimators=25, max_features='auto', max_depth=20)

In [165]:
rdm_mod.fit(train_x_rdm, train_y_rdm)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=25,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [166]:
pred = rdm_mod.predict(test_x_rdm).astype('int32')

In [167]:
pred

array([  7, 218,  20, ...,   8,  56,  30], dtype=int32)

In [168]:
rmse(test_y_rdm, pred)

1152.3535233283626

In [169]:
result_df(rdm_mod.predict(data_ind[330045:]).astype('int32')).to_csv('/Users/skylark/Desktop/upvotes/result.csv')

### Boosting

In [170]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

In [171]:
ada_mod = AdaBoostRegressor(RandomForestRegressor())

In [172]:
ada_mod.fit(train_x_rdm, train_y_rdm)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                       criterion='mse',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators='warn',
                                                       n_jobs=None,
                                                       oob_score=False,
                    

In [174]:
pred = ada_mod.predict(test_x_rdm).astype('int32')

In [175]:
rmse(test_y_rdm, pred)

1189.72123284964

In [191]:
params = {'base_estimator':[RandomForestRegressor(), LinearRegression(), XGBRegressor()],'n_estimators': range(10, 200, 10), 'learning_rate':np.random.choice(np.array(range(10, 1000))/100, size=20)}

In [192]:
rdm_cv = RandomizedSearchCV(ada_mod, param_distributions=params, cv=5, scoring='r2')

In [194]:
rdm_cv.fit(train_x_rdm, train_y_rdm)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                                                    criterion='mse',
                                                                                    max_depth=None,
                                                                                    max_features='auto',
                                                                                    max_leaf_nodes=None,
                                                                                    min_impurity_decrease=0.0,
                                                                                    min_impurity_split=None,
                                                                                    min_samples_leaf=1,
                                                                                    min_samples_split=2,
  

In [195]:
rdm_cv.best_params_

{'n_estimators': 150,
 'learning_rate': 0.42,
 'base_estimator': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              importance_type='gain', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)}

In [196]:
rdm_cv.best_score_

0.8417485031779833

In [197]:
ada_mod = AdaBoostRegressor(XGBRegressor(), n_estimators=150, learning_rate=0.42)

In [198]:
ada_mod.fit(train_x_rdm, train_y_rdm)



AdaBoostRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0,
                                              importance_type='gain',
                                              learning_rate=0.1,
                                              max_delta_step=0, max_depth=3,
                                              min_child_weight=1, missing=None,
                                              n_estimators=100, n_jobs=1,
                                              nthread=None,
                                              objective='reg:linear',
                                              random_state=0, reg_alpha=0,
                                              reg_lambda=1, scale_pos_weight=1,
                                              seed=None, sile

In [199]:
pred = ada_mod.predict(test_x_rdm).astype('int32')

In [200]:
pred

array([  4, 101,   0, ...,   4, 428,  30], dtype=int32)

In [201]:
rmse(test_y_rdm, pred)

1212.16032754516

In [202]:
xgb_mod = XGBRegressor()

In [203]:
xgb_mod.fit(train_x_rdm, train_y_rdm)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [204]:
xgb_mod.predict(test_x_rdm)

array([-21.439371, 218.58356 , -21.532738, ..., -21.439371, 487.71167 ,
        13.930687], dtype=float32)

In [205]:
train_x_rdm

Unnamed: 0,Reputation,Answers,Views
277198,51300.0,2.0,1922.0
211769,1337.0,5.0,14354.0
287257,419.0,4.0,2312.0
57703,25.0,3.0,521.0
253787,289.0,4.0,97541.0
...,...,...,...
65615,708.0,3.0,9708.0
253799,696.0,12.0,235312.0
210755,859.0,3.0,11844.0
56088,88.0,2.0,21465.0


In [211]:
train.groupby('Tag')['Views'].sum()

Tag
a    1.007930e+09
c    1.938011e+09
h    6.981264e+08
i    5.121364e+08
j    3.127011e+09
o    1.964386e+08
p    1.440461e+09
r    1.497153e+08
s    6.018986e+08
x    1.124802e+08
Name: Views, dtype: float64

In [208]:
train

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0
...,...,...,...,...,...,...,...
330040,339800,c,36.0,2.0,84919,1063.0,0.0
330041,253800,c,1649.0,2.0,76730,23319.0,73.0
330042,210756,c,6178.0,2.0,91701,2453.0,15.0
330043,56089,j,89.0,2.0,80245,2107.0,3.0
