In [1]:
import pandas as pd
import numpy as np

In [2]:
def predict(x, w, w0):
    return np.dot(x, w) + w0

def predict_all(X, w, w0):
    return np.dot(X, w) + w0

In [3]:
def gradient_step(X, y, w, w0, k, eta = 1):
    y_pred = predict_all(X, w, w0)
    grad = 2 / y.size * np.dot(X.T, (y_pred - y))
    grad_0 = 2 / y.size * np.sum(y_pred - y)
    return  w - eta / k * grad, w0 - eta / k * grad_0

In [4]:
def mserror(y, y_pred):
    return np.sum((y_pred  - y) * (y_pred  - y)) / y.size

def rmserror(y, y_pred):
    return np.sqrt(mserror(y, y_pred))

In [5]:
def gradient_descent(X, y, eta=1, max_iter=1e4, min_weight_dist=1e-4):
    weight_dist = np.inf
    w = np.zeros(X.shape[1])
    w0 = 0
    iter_num = 0
    
    while (weight_dist > min_weight_dist) & (iter_num < max_iter):        
        w_next, w0_next = gradient_step(X, y, w, w0, iter_num+1, eta)
        weight_dist = np.sqrt(np.sum((w - w_next) ** 2) + (w0 - w0_next) ** 2)
        w = w_next
        w0 = w0_next
        iter_num += 1
    return w, w0

In [6]:
def tss(y):
    return np.sum((y - np.mean(y)) ** 2)

def rss(y, y_pred):
    return np.sum((y_pred  - y) ** 2)

def r2(y, y_pred):
    return 1 - rss(y, y_pred) / tss(y)

In [7]:
def cross_validation(X, n_samples=5):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    sample_size = X.shape[0] // n_samples
    samples = []
    for i in range(1, n_samples):
        test = np.zeros(X.shape[0], dtype=bool)
        test[indices[(i - 1) * sample_size : i * sample_size]] = True
        train = ~test
        samples.append([train, test])
    
    test = np.zeros(X.shape[0], dtype=bool)
    test[indices[(n_samples - 1) * sample_size :]] = True
    train = ~test
    samples.append([train, test])
    return samples

In [8]:
data = pd.read_csv('dataset.csv', index_col=0) # Facebook Comment Volume Dataset

In [9]:
data.head()

Unnamed: 0,Page Popularity,Page Checkins,Page talking about,Page Category,extra_0,extra_1,extra_2,extra_3,extra_4,extra_5,...,published_weekday_5,published_weekday_6,base_weekday_0,base_weekday_1,base_weekday_2,base_weekday_3,base_weekday_4,base_weekday_5,base_weekday_6,Target
0,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,0,1,0
1,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,0,0,1,0,0
2,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,0,0,0,0,0,1,0
3,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,1,0,0,1,0,0,0,0,0,0
4,634995,0,463,1,0.0,806.0,11.291045,1.0,70.495138,0.0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
data.describe()

Unnamed: 0,Page Popularity,Page Checkins,Page talking about,Page Category,extra_0,extra_1,extra_2,extra_3,extra_4,extra_5,...,published_weekday_5,published_weekday_6,base_weekday_0,base_weekday_1,base_weekday_2,base_weekday_3,base_weekday_4,base_weekday_5,base_weekday_6,Target
count,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,...,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0,40949.0
mean,1313814.0,4676.133752,44800.25,24.25478,1.586241,443.333854,55.720384,35.645535,67.464151,0.219468,...,0.146157,0.136926,0.14164,0.132506,0.137635,0.148599,0.150846,0.143886,0.144888,7.322889
std,6785752.0,20593.184863,110933.8,19.950583,20.753174,496.695198,86.933548,69.960232,81.568249,10.055146,...,0.353268,0.343774,0.348684,0.339045,0.34452,0.355698,0.357903,0.350979,0.351992,35.49455
min,36.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,36734.0,0.0,698.0,9.0,0.0,45.0,5.527273,2.0,8.278756,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,292911.0,0.0,7045.0,18.0,0.0,241.0,23.374101,12.0,35.06914,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1204214.0,99.0,50264.0,32.0,0.0,717.0,71.828829,42.0,102.554954,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,486972300.0,186370.0,6089942.0,106.0,2341.0,2341.0,2341.0,2341.0,731.394558,1923.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1305.0


In [11]:
X = data.loc[:, (data.columns != 'Target') & (data.columns != 'Post Promotion Status')].values

In [12]:
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
X[:10, :]

array([[-0.10003712, -0.22707468, -0.39967795, -1.16563332, -0.07643461,
         0.73016727, -0.51107866, -0.4952236 ,  0.03715936, -0.02182671,
         1.39092118, -0.39566576, -0.34454749,  0.53358809, -0.0121651 ,
        -0.58814396, -0.54818792, -0.37155907, -0.5944117 , -0.08000176,
         0.82689628, -0.51662119, -0.49948956,  0.09292668,  0.53598602,
         1.42126775,  0.13868517,  0.13728321,  0.18980545, -0.40679524,
        -0.28767753, -0.27645357, -0.41061216, -0.02693112,  1.41887081,
         0.00623912, -0.12195811,  0.12093249, -0.37336695, -0.40898659,
        -0.41966756,  2.31293463, -0.41020616, -0.41373429, -0.39830863,
        -0.40621645, -0.39082747, -0.39950129, -0.41777427, -0.42147708,
        -0.40996237,  2.42938358],
       [-0.10003712, -0.22707468, -0.39967795, -1.16563332, -0.07643461,
         0.73016727, -0.51107866, -0.4952236 ,  0.03715936, -0.02182671,
         1.39092118, -0.39566576, -0.34454749,  0.53358809, -0.0121651 ,
        -0.58814

In [13]:
y = data['Target'].values

In [14]:
weights = []
r2_train = []
r2_test = []
rmse_train = []
rmse_test = []

for train, test in cross_validation(X):
    w, w0 = gradient_descent(X[train], y[train])
    r2_train_i = r2(y[train], predict_all(X[train], w, w0))
    r2_test_i = r2(y[test], predict_all(X[test], w, w0))
    rmse_train_i = rmserror(y[train], predict_all(X[train], w, w0))
    rmse_test_i = rmserror(y[test], predict_all(X[test], w, w0))
    print('R2 train: %.2f, R2 test: %.2f' % (r2_train_i, r2_test_i))
    weights.append(w)
    r2_train.append(r2_train_i)
    r2_test.append(r2_test_i)
    rmse_train.append(rmse_train_i)
    rmse_test.append(rmse_test_i)

R2 train: 0.31, R2 test: 0.40
R2 train: 0.33, R2 test: 0.30
R2 train: 0.32, R2 test: 0.32
R2 train: 0.34, R2 test: 0.22
R2 train: 0.32, R2 test: 0.27


In [15]:
result_dictionary = {'R2 train': r2_train, 'R2 test': r2_test, 'RMSE train': rmse_train, 'RMSE test': rmse_test}
result = pd.DataFrame(result_dictionary).T

features = data.columns.values[(data.columns != 'Target') & (data.columns != 'Post Promotion Status')]
result = result.append(pd.DataFrame(data=weights).T.set_index(features))

In [16]:
result['E'] = result.mean(axis=1)
result['std'] = result.std(axis=1)

In [17]:
result

Unnamed: 0,0,1,2,3,4,E,std
R2 train,0.308666,0.326978,0.321895,0.338586,0.31974,0.323173,0.009756
R2 test,0.400284,0.301537,0.315456,0.22156,0.269206,0.301609,0.058968
RMSE train,30.698466,27.332251,29.126539,28.870758,29.905073,29.186617,1.126403
RMSE test,22.521836,36.029007,29.772677,31.297082,27.576078,29.439336,4.434177
Page Popularity,0.311013,0.617815,0.131054,0.247943,-1.503422,-0.039119,0.749627
Page Checkins,-0.438672,-0.468539,-0.572953,-0.394744,-0.709694,-0.516921,0.112864
Page talking about,-2.159469,-1.935588,-1.468953,-1.911263,-0.593169,-1.613688,0.557217
Page Category,0.001342,-0.039059,-0.152052,-0.055011,-0.14092,-0.07714,0.059631
extra_0,0.525278,0.274142,-0.139276,-0.253222,-0.464091,-0.011434,0.360366
extra_1,0.904722,0.554301,0.765538,1.114138,0.504263,0.768592,0.22527


In [18]:
result.to_csv('result_table.csv')

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [20]:
for train, test in cross_validation(X):
    w, w0 = gradient_descent(X[train], y[train])
    lr = LinearRegression()
    lr.fit(X[train], y[train])
    print('Моя:', r2(y[test], predict_all(X[test], w, w0)))
    print('sklearn:', r2_score(y[test], lr.predict(X[test])))

Моя: 0.34886733479631693
sklearn: 0.34476620821963533
Моя: 0.3325659598063865
sklearn: 0.327076901110409
Моя: 0.23994571573002033
sklearn: 0.2460189567756117
Моя: 0.18165422737079429
sklearn: 0.17412663760704994
Моя: 0.35834455376518515
sklearn: 0.36285833892400243
