In [46]:
import datetime, time
import pytz
import os
import json
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
pst_tz = pytz.timezone('America/Los_Angeles')

In [13]:
def read_data(file, min_time_stamp, max_time_stamp):
    posting_time = []
    num_retweets = []
    num_followers = []
    for line in file:
        data = json.loads(line)
        if(data['citation_date'] > min_time_stamp and data['citation_date'] < max_time_stamp):
            posting_time.append(data['citation_date'])
            num_retweets.append(data['metrics']['citations']['total'])
            num_followers.append(data['author']['followers'])
    file.close()
    return posting_time, num_retweets, num_followers

In [14]:
def extract_features(posting_time, num_retweets, num_followers, window):
    hours = int((max(posting_time)-min(posting_time))/window)+1
    tmp = np.zeros([hours, 5])
    start_time = min(posting_time)
    start_hour = (datetime.datetime.fromtimestamp(start_time, pst_tz)).hour

    for i in range(hours):
        tmp[i,4] = (start_hour+i)%24
        #print(i, tmp[i, 4])
    for i in range(len(posting_time)):
        tmp[int((posting_time[i]-start_time)/window), 0] += 1
        tmp[int((posting_time[i]-start_time)/window), 1] += num_retweets[i]
        tmp[int((posting_time[i]-start_time)/window), 2] += num_followers[i]
        if tmp[int((posting_time[i]-start_time)/window), 3] < num_followers[i]:
            tmp[int((posting_time[i]-start_time)/window), 3] = num_followers[i]
    return tmp

In [15]:
###############Question 6################

In [None]:
#Window 1
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 0, 1422806400)
    features = extract_features(posting_time, num_retweets, num_followers, 3600)
    data = features[:-1, :]
    target = features[1:, 0]
    cv = KFold(n_splits=5, shuffle = True)
    lr_1 = LinearRegression()
    param_grid = {}
    clf_lr_1 = GridSearchCV(lr_1, param_grid = param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
    #lr = clf_lr_1.best_estimator_
    pred = clf_lr_1.predict(data)
    print("For hashtag " + hashtag)
    print("The MSE is " + str(mean_squared_error(target,pred)))
    print("The R-squared error is " + str(r2_score(target, pred)))

In [75]:
#Window 1
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 0, 1422806400)
    features = extract_features(posting_time, num_retweets, num_followers, 3600)
    data = features[:-1, :]
    target = features[1:, 0]
    model = sm.OLS(target, data)
    results = model.fit()
    '''
    pred = results.predict(data)
    print("Hashtag " + hashtag)
    print("The average MSE is " + str(mean_squared_error(target,pred))) 
    print("The average r_sq is " + str(r2_score(target,pred)))
    '''
    cv = KFold(n_splits=5, shuffle = True)
    mse = []
    r_sq = []
    for train_index, test_index in cv.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        X_train_const = sm.add_constant(X_train)
        X_test_const = sm.add_constant(X_test)
        model = sm.OLS(y_train, X_train)
        results = model.fit()
        pred = results.predict(X_test)
        r_sq.append(results.rsquared)
        mse.append(mean_squared_error(y_test,pred))
    print("The average MSE is " + str(sum(mse) / float(len(mse)))) 
    print("The average r_sq is " + str(sum(r_sq) / float(len(r_sq))))
    

tweets_#gohawks.txt
The average MSE is 1222461.3954537995
The average r_sq is 0.4470421025482354
tweets_#gopatriots.txt
The average MSE is 1769.2889890067977
The average r_sq is 0.7904808682224791
tweets_#nfl.txt
The average MSE is 102205.65895885338
The average r_sq is 0.6952269677523303
tweets_#patriots.txt
The average MSE is 738578.5459148253
The average r_sq is 0.5770958169356039
tweets_#sb49.txt
The average MSE is 12127.6324240358
The average r_sq is 0.8599160615934954
tweets_#superbowl.txt
The average MSE is 741351.9443085588
The average r_sq is 0.4677123084424011


In [39]:
#Window 2
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 1422806399, 1422849601)
    features = extract_features(posting_time, num_retweets, num_followers, 300)
    data = features[:-1, :]
    target = features[1:, 0]
    model = sm.OLS(target, data)
    results = model.fit()
    pred = results.predict(data)
    print("Hashtag " + hashtag)
    print("The average MSE is " + str(mean_squared_error(target,pred))) 
    print("The average r_sq is " + str(r2_score(target,pred)))

tweets_#gohawks.txt
Hashtag tweets_#gohawks.txt
The average MSE is 76893.90670096333
The average r_sq is 0.45798292756327574
tweets_#gopatriots.txt
Hashtag tweets_#gopatriots.txt
The average MSE is 16518.191143341515
The average r_sq is 0.37564028396937577
tweets_#nfl.txt
Hashtag tweets_#nfl.txt
The average MSE is 21008.23343375031
The average r_sq is 0.8181423247625373
tweets_#patriots.txt
Hashtag tweets_#patriots.txt
The average MSE is 710942.3256348752
The average r_sq is 0.6940793976341859
tweets_#sb49.txt
Hashtag tweets_#sb49.txt
The average MSE is 1284338.8877511474
The average r_sq is 0.8663416724509064
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 6551187.356034464
The average r_sq is 0.8950879874426892


In [76]:
#Window 2
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 1422806399, 1422849601)
    features = extract_features(posting_time, num_retweets, num_followers, 300)
    data = features[:-1, :]
    target = features[1:, 0]
    cv = KFold(n_splits=5, shuffle = True)
    mse = []
    r_sq = []
    for train_index, test_index in cv.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        X_train_const = sm.add_constant(X_train)
        X_test_const = sm.add_constant(X_test)
        model = sm.OLS(y_train, X_train)
        results = model.fit()
        pred = results.predict(X_test)
        r_sq.append(results.rsquared)
        mse.append(mean_squared_error(y_test,pred))
    print("The average MSE is " + str(sum(mse) / float(len(mse)))) 
    print("The average r_sq is " + str(sum(r_sq) / float(len(r_sq))))

tweets_#gohawks.txt
The average MSE is 86092.92134734092
The average r_sq is 0.7485078494336763
tweets_#gopatriots.txt
The average MSE is 22915.354972052173
The average r_sq is 0.602384649703351
tweets_#nfl.txt
The average MSE is 24723.276282011713
The average r_sq is 0.9081517135430399
tweets_#patriots.txt
The average MSE is 828986.331600382
The average r_sq is 0.8871820136130785
tweets_#sb49.txt
The average MSE is 1466469.8749535275
The average r_sq is 0.9578258754082981
tweets_#superbowl.txt
The average MSE is 8348869.50812765
The average r_sq is 0.9384380783373739


In [40]:
#Window 3
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 1422849600, np.inf)
    features = extract_features(posting_time, num_retweets, num_followers, 3600)
    data = features[:-1, :]
    target = features[1:, 0]
    model = sm.OLS(target, data)
    results = model.fit()
    pred = results.predict(data)
    print("Hashtag " + hashtag)
    print("The average MSE is " + str(mean_squared_error(target,pred))) 
    print("The average r_sq is " + str(r2_score(target,pred)))

tweets_#gohawks.txt
Hashtag tweets_#gohawks.txt
The average MSE is 1931.2492904761975
The average r_sq is 0.8255437490846315
tweets_#gopatriots.txt
Hashtag tweets_#gopatriots.txt
The average MSE is 40.25106119864701
The average r_sq is 0.7723620771590896
tweets_#nfl.txt
Hashtag tweets_#nfl.txt
The average MSE is 17579.98010850649
The average r_sq is 0.7943049051257776
tweets_#patriots.txt
Hashtag tweets_#patriots.txt
The average MSE is 11049.653825899864
The average r_sq is 0.877137541700638
tweets_#sb49.txt
Hashtag tweets_#sb49.txt
The average MSE is 76561.14574733021
The average r_sq is 0.7912446290033053
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 114645.74922223277
The average r_sq is 0.8388723707007578


In [77]:
#Window 3
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time, num_retweets, num_followers = read_data(file, 1422849600, np.inf)
    features = extract_features(posting_time, num_retweets, num_followers, 3600)
    data = features[:-1, :]
    target = features[1:, 0]
    cv = KFold(n_splits=5, shuffle = True)
    mse = []
    r_sq = []
    for train_index, test_index in cv.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        X_train_const = sm.add_constant(X_train)
        X_test_const = sm.add_constant(X_test)
        model = sm.OLS(y_train, X_train)
        results = model.fit()
        pred = results.predict(X_test)
        r_sq.append(results.rsquared)
        mse.append(mean_squared_error(y_test,pred))
    print("The average MSE is " + str(sum(mse) / float(len(mse)))) 
    print("The average r_sq is " + str(sum(r_sq) / float(len(r_sq))))

tweets_#gohawks.txt
The average MSE is 18878.18903854661
The average r_sq is 0.885773618534998
tweets_#gopatriots.txt
The average MSE is 288.553480646758
The average r_sq is 0.8592542196630854
tweets_#nfl.txt
The average MSE is 20412.064151826253
The average r_sq is 0.9449551980350748
tweets_#patriots.txt
The average MSE is 21721.77820021574
The average r_sq is 0.9096556879514134
tweets_#sb49.txt
The average MSE is 229612.44607800903
The average r_sq is 0.8691657423465771
tweets_#superbowl.txt
The average MSE is 216720.54908045
The average r_sq is 0.9125016403588585


In [29]:
############# Question 7 ##################

In [42]:
#Window 1
posting_time = []
num_retweets = []
num_followers = []
pt = []
nrt = []
nf = []
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time_tmp, num_retweets_tmp, num_followers_tmp = read_data(file, 0, 1422806400)
    pt.append(posting_time_tmp)
    nrt.append(num_retweets_tmp)
    nf.append(num_followers_tmp)

posting_time = [item for sublist in pt for item in sublist]
num_retweets = [item for sublist in nrt for item in sublist]
num_followers = [item for sublist in nf for item in sublist]
features = extract_features(posting_time, num_retweets, num_followers, 3600)
data = features[:-1, :]
target = features[1:, 0]
model = sm.OLS(target, data)
results = model.fit()
pred = results.predict(data)
print("Hashtag " + hashtag)
print("The average MSE is " + str(mean_squared_error(target,pred))) 
print("The average r_sq is " + str(r2_score(target,pred)))

tweets_#gohawks.txt
tweets_#gopatriots.txt
tweets_#nfl.txt
tweets_#patriots.txt
tweets_#sb49.txt
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 4488914.282077933
The average r_sq is 0.4015683630172112


In [None]:
#Window 1
posting_time = []
num_
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time_t, num_retweets_t, num_followers_t = read_data(file, 0, 1422806400)
    
    features = extract_features(posting_time, num_retweets, num_followers, 3600)
    data = features[:-1, :]
    target = features[1:, 0]
    cv = KFold(n_splits=5, shuffle = True)
    mse = []
    r_sq = []
    for train_index, test_index in cv.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        X_train_const = sm.add_constant(X_train)
        X_test_const = sm.add_constant(X_test)
        model = sm.OLS(y_train, X_train)
        results = model.fit()
        pred = results.predict(X_test)
        r_sq.append(results.rsquared)
        mse.append(mean_squared_error(y_test,pred))
    print("The average MSE is " + str(sum(mse) / float(len(mse)))) 
    print("The average r_sq is " + str(sum(r_sq) / float(len(r_sq))))

In [43]:
#Window 2
posting_time = []
num_retweets = []
num_followers = []
pt = []
nrt = []
nf = []
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time_tmp, num_retweets_tmp, num_followers_tmp = read_data(file, 1422806399, 1422849601)
    pt.append(posting_time_tmp)
    nrt.append(num_retweets_tmp)
    nf.append(num_followers_tmp)

posting_time = [item for sublist in pt for item in sublist]
num_retweets = [item for sublist in nrt for item in sublist]
num_followers = [item for sublist in nf for item in sublist]
features = extract_features(posting_time, num_retweets, num_followers, 300)
data2 = features[:-1, :]
target2 = features[1:, 0]
model = sm.OLS(target2, data2)
results = model.fit()
pred = results.predict(data2)
print("Hashtag " + hashtag)
print("The average MSE is " + str(mean_squared_error(target2,pred))) 
print("The average r_sq is " + str(r2_score(target2,pred)))

tweets_#gohawks.txt
tweets_#gopatriots.txt
tweets_#nfl.txt
tweets_#patriots.txt
tweets_#sb49.txt
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 16540671.67774716
The average r_sq is 0.8547765789340108


In [44]:
#Window 3
posting_time = []
num_retweets = []
num_followers = []
pt = []
nrt = []
nf = []
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time_tmp, num_retweets_tmp, num_followers_tmp = read_data(file, 1422849600, np.inf)
    pt.append(posting_time_tmp)
    nrt.append(num_retweets_tmp)
    nf.append(num_followers_tmp)

posting_time = [item for sublist in pt for item in sublist]
num_retweets = [item for sublist in nrt for item in sublist]
num_followers = [item for sublist in nf for item in sublist]
features = extract_features(posting_time, num_retweets, num_followers, 3600)
data3 = features[:-1, :]
target3 = features[1:, 0]
model = sm.OLS(target3, data3)
results = model.fit()
pred = results.predict(data3)
print("Hashtag " + hashtag)
print("The average MSE is " + str(mean_squared_error(target3,pred))) 
print("The average r_sq is " + str(r2_score(target3,pred)))

tweets_#gohawks.txt
tweets_#gopatriots.txt
tweets_#nfl.txt
tweets_#patriots.txt
tweets_#sb49.txt
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 467502.89361684927
The average r_sq is 0.8546931178598383


In [45]:
################### Question 8 ####################

In [47]:
cv = KFold(5, shuffle=True)
rf_param_grid = {'max_depth': [10, 20, 40, 60, 80, 100, 200, None],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [200, 400, 600, 800, 1000,
1200, 1400, 1600, 1800, 2000]}

In [48]:
# all Window
posting_time = []
num_retweets = []
num_followers = []
pt = []
nrt = []
nf = []
for hashtag in os.listdir('./ECE219_tweet_data'):
    print(hashtag)
    file = open('./ECE219_tweet_data/'+hashtag, encoding = 'utf8')
    posting_time_tmp, num_retweets_tmp, num_followers_tmp = read_data(file, 0, np.inf)
    pt.append(posting_time_tmp)
    nrt.append(num_retweets_tmp)
    nf.append(num_followers_tmp)

posting_time = [item for sublist in pt for item in sublist]
num_retweets = [item for sublist in nrt for item in sublist]
num_followers = [item for sublist in nf for item in sublist]
features = extract_features(posting_time, num_retweets, num_followers, 3600)
data_agg = features[:-1, :]
target_agg = features[1:, 0]
model = sm.OLS(target_agg, data_agg)
results = model.fit()
pred = results.predict(data_agg)
print("Hashtag " + hashtag)
print("The average MSE is " + str(mean_squared_error(target_agg,pred))) 
print("The average r_sq is " + str(r2_score(target_agg,pred)))

tweets_#gohawks.txt
tweets_#gopatriots.txt
tweets_#nfl.txt
tweets_#patriots.txt
tweets_#sb49.txt
tweets_#superbowl.txt
Hashtag tweets_#superbowl.txt
The average MSE is 140984153.0801042
The average r_sq is 0.8228192296375999


In [49]:
regr_agg = RandomForestRegressor()
clf_agg = GridSearchCV(regr_agg, rf_param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
clf_agg.fit(data_agg[:-1, :], data_agg[1:, 0])

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 39.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 49.3min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 61.5min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 73.9min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 86.6min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 86.7min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [50]:
clf_agg.best_params_

{'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 400}

In [72]:
clf_agg.best_score_

-256077064.27515984

In [51]:
gb_agg = GradientBoostingRegressor()
clf_gb_agg = GridSearchCV(gb_agg, rf_param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
clf_gb_agg.fit(data_agg[:-1, :], data_agg[1:, 0])

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 33.5min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 40.2min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 40.3min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [52]:
clf_gb_agg.best_params_

{'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 200}

In [71]:
clf_gb_agg.best_score_

-4084466.547193123

In [53]:
################## Question 9 ####################

In [57]:
model = sm.OLS(data_agg[1:, 0], data_agg[:-1, :])
results = model.fit()
pred = results.predict(data_agg[:-1, :])
print("Hashtag " + hashtag)
print("The average MSE is " + str(mean_squared_error(data_agg[1:, 0],pred))) 
#print("The average r_sq is " + str(r2_score(target2,pred)))

Hashtag tweets_#superbowl.txt
The average MSE is 141225135.3167885


In [58]:
############### Question 10 ##################

In [61]:
gb_agg_1 = GradientBoostingRegressor()
clf_gb_agg_1 = GridSearchCV(gb_agg_1, rf_param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
clf_gb_agg_1.fit(data, target)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 25.6min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [62]:
clf_gb_agg_1.best_params_

{'max_depth': 80,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 200}

In [70]:
clf_gb_agg_1.best_score_

-3996040.460529092

In [63]:
gb_agg_2 = GradientBoostingRegressor()
clf_gb_agg_2 = GridSearchCV(gb_agg_2, rf_param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
clf_gb_agg_2.fit(data2, target2)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 11.5min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [66]:
clf_gb_agg_2.best_params_

{'max_depth': 80,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 600}

In [69]:
clf_gb_agg_2.best_score_

-26686643.837133154

In [64]:
gb_agg_3 = GradientBoostingRegressor()
clf_gb_agg_3 = GridSearchCV(gb_agg_3, rf_param_grid, cv = cv, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
clf_gb_agg_3.fit(data3, target3)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 11.2min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 40, 60, 80, 100, 200, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [65]:
clf_gb_agg_3.best_params_

{'max_depth': 100,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 800}

In [67]:
clf_gb_agg_3.cv_results_



{'mean_fit_time': array([0.11359973, 0.14479918, 0.1895998 , ..., 0.55839982, 0.61519895,
        0.64244761]),
 'std_fit_time': array([0.01091051, 0.00159993, 0.00407945, ..., 0.01091075, 0.01462067,
        0.0433008 ]),
 'mean_score_time': array([0.00159984, 0.00079999, 0.00079994, ..., 0.00560322, 0.00480061,
        0.00480037]),
 'std_score_time': array([0.00195939, 0.00159998, 0.00159988, ..., 0.00195757, 0.00160072,
        0.00159976]),
 'param_max_depth': masked_array(data=[10, 10, 10, ..., None, None, None],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['auto', 'auto', 'auto', ..., 'sqrt', 'sqrt', 'sqrt'],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[1, 1, 1, ..., 4, 4, 4],
              mask=[False, False, False, ..., False, False, False],
 

In [68]:
clf_gb_agg_3.best_score_

-383874.0672352446