In [2]:
import json
import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor

In [3]:
# Directory holding the raw tweet dumps (one JSON record per line).
path = "tweet_data/"

# One training file per hashtag; `topics` lists the same hashtags in the same order.
files = [
    "tweets_#gohawks.txt",
    "tweets_#gopatriots.txt",
    "tweets_#nfl.txt",
    "tweets_#patriots.txt",
    "tweets_#sb49.txt",
    "tweets_#superbowl.txt",
]
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]

# Test files: three samples, each split into three periods.
test_files = [
    "sample0_period1.txt", "sample0_period2.txt", "sample0_period3.txt",
    "sample1_period1.txt", "sample1_period2.txt", "sample1_period3.txt",
    "sample2_period1.txt", "sample2_period2.txt", "sample2_period3.txt",
]

# Show which position in `files` corresponds to which hashtag file.
for i, fl in enumerate(files):
    print("files[" + str(i) + "] => " + fl)

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [4]:
def parse_dataset_ten_features(file):
    """
    Parse one tweet file into hourly features and next-hour targets.

    Every non-blank line of ``path + file`` is decoded as one JSON tweet record.
    Tweets are bucketed by hour (America/Los_Angeles local time) and aggregated
    into one row per hour with these ten columns:

        tweets           : number of tweets in the hour
        retweets         : sum of metrics.citations.total
        followers sum    : sum of the authors' follower counts
        followers max    : largest follower count among the authors
        mentioned        : total number of user mentions (@) in the tweets
        media            : total number of media entities attached to the tweets
        active           : mean of statuses_count / (2015 - account creation year + 1)
        author           : number of distinct author names
        favourites_count : sum of the users' favourites_count
        title            : mean length of the tweet title

    Hours in which no tweet was posted are filled with zeros.

    Parameters
    ----------
    file : str
        File name; it is appended to the module-level ``path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame (without the hour-index column) and the target series.
        Row k of the target is the tweet count of the hour that follows row k of
        the features. Hour 0 and the last observed hour are therefore not part
        of the returned features.
    """

    # extract raw features
    pst_tz = pytz.timezone('America/Los_Angeles')
    data_raw = []
    # The context manager guarantees the file is closed, even if a record fails to parse.
    with open(path + file, 'r', encoding="utf-8") as tweet_file:
        for line in tweet_file:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            a = json.loads(line)
            user = a['tweet']['user']

            time = a['citation_date']
            retweet = a['metrics']['citations']['total']
            foll = a['author']['followers']
            ment = len(a['tweet']['entities']['user_mentions'])
            medi = len(a['tweet']['extended_entities']['media']) if 'extended_entities' in a['tweet'] else 0
            # activity = tweets posted per year of account age, counted up to and including 2015;
            # the creation year is the last four characters of 'created_at'
            hist_tw = user["statuses_count"]
            hist_yr = user['created_at'][-4:]
            acti = hist_tw/(2015-float(hist_yr)+1)
            auth = a['author']['name']
            favo = user['favourites_count']
            titl = len(a['title'])

            data_raw.append([time, retweet, foll, ment, medi, acti, auth, favo, titl])

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw,columns=['time','retweets','followers','mentioned','media',\
                                                'active','author','favourites_count','title'])
    pddata_raw = pddata_raw.sort_values(by = 'time')
    pddata_raw = pddata_raw.reset_index(drop=True)
    pddata_raw['tweets'] = 1   # each record counts as one tweet when summed per hour

    # reset time to hour index: hours elapsed since 00:00 on the 14th of month 1 (local time),
    # with every month counted as 31 days
    hour_accu = []
    for stamp in pddata_raw["time"]:
        p = datetime.datetime.fromtimestamp(stamp, pst_tz)
        hour_accu.append(((p.month-1)*31+p.day-14)*24+p.hour)
    pddata_raw["time"] = hour_accu

    # create a new dataframe with desired form: one row per hour from 0 up to the last observed hour
    df = pd.DataFrame([],columns=['hour index','tweets','retweets','followers sum','followers max',\
                                  'mentioned','media','active','author','favourites_count','title'])
    last_hour = pddata_raw["time"].iloc[-1]   # rows are time-sorted, so the last row holds the largest hour
    df['hour index'] = range(last_hour+1)

    # The per-hour aggregates are indexed by hour and aligned on df's 0..last_hour index;
    # hours without tweets become NaN here and are zero-filled below.
    hourly = pddata_raw.groupby("time")
    df['tweets'] = hourly['tweets'].sum()
    df['retweets'] = hourly['retweets'].sum()
    df['followers sum'] = hourly['followers'].sum()
    df['followers max'] = hourly["followers"].max()
    df['mentioned'] = hourly['mentioned'].sum()
    df['media'] = hourly['media'].sum()
    df['active'] = hourly['active'].mean()
    df['author'] = hourly['author'].nunique() # count number of not-repeating authors
    df['favourites_count'] = hourly['favourites_count'].sum()
    df['title'] = hourly['title'].mean()

    # drop the hour-0 row, zero-fill empty hours and renumber the rows
    df = df.drop([0]).fillna(0).reset_index(drop=True)

    # assign number of tweets of the next hour to be the target value
    df_y = df.iloc[1:,1].reset_index(drop=True)   # column 1 is 'tweets', shifted up by one hour
    df = df[:len(df_y)]                           # the last hour has no following hour, so it is dropped

    return df.iloc[:,1:],df_y

In [5]:
def plot_recipe(df_y, pred_y):
    """
    Draw a scatter plot of fitted values against true values.

    A dashed black diagonal from (min, min) to (max, max) of the true values
    marks where perfect predictions would lie.

    df_y   : true target values (must provide .min() and .max())
    pred_y : fitted values, in the same order as df_y
    """
    marker_area = np.pi * (4)**2/4          # marker size passed to scatter (s=)
    lowest, highest = df_y.min(), df_y.max()
    diagonal = [lowest, highest]

    plt.figure()
    plt.scatter(df_y, pred_y, s = marker_area)
    plt.plot(diagonal, diagonal, 'k--', lw = 1)
    plt.xlabel('true values')
    plt.ylabel('fitted values')
    plt.show()

In [6]:

def ols_regression(df, df_y):
    """
    Fit an ordinary-least-squares model with an intercept and print its summary.

    df   : feature table; a constant column is added by statsmodels before fitting
    df_y : target values, one per row of df

    Prints the statsmodels summary followed by the list of feature names
    (the column labels of df). Returns nothing.
    """
    X2 = sm.add_constant(df)
    # np.asarray replaces Series.as_matrix(), which was removed in pandas 1.0
    y = np.asarray(df_y)
    lm = sm.OLS(y, X2).fit()
    print(lm.summary())
    print(list(df))

In [7]:
# Aggregate all hashtag files into one feature table (df) and one target vector (df_y).
# pd.concat is used because DataFrame.append / Series.append were removed in pandas 2.0.
df, df_y = parse_dataset_ten_features(files[0])
for i in range(1,len(files)):
    df_temp,df_temp_y=parse_dataset_ten_features(files[i])
    df=pd.concat([df,df_temp],ignore_index=True)
    df_y=pd.concat([df_y,df_temp_y],ignore_index=True)
    # shape of the file just parsed, then shape of the running total
    print(df_temp.shape,df.shape)

(574, 10) (1151, 10)
(585, 10) (1736, 10)
(585, 10) (2321, 10)
(585, 10) (2906, 10)
(585, 10) (3491, 10)


In [8]:

# Shared 5-fold splitter; shuffling with a fixed seed makes every kf.split(...) call produce the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Neural network

In [9]:
# Candidate hidden-layer widths tried by the grid searches.
hidden_units = [50, 100, 200, 300, 500, 600]

In [10]:

# Grid search over the width of a single hidden layer, scored by K-fold cross-validation.
# The result arrays are sized from hidden_units so the grid can be edited without touching this cell.
avg_RMSE2_train=np.zeros(len(hidden_units))
avg_RMSE2_test=np.zeros(len(hidden_units))
for n1,hidden_layer_sizes in enumerate(hidden_units):
    # Despite the names, these lists hold each fold's SUM of squared errors (MSE * fold size),
    # so that the RMSE below is pooled over all samples rather than averaged per fold.
    MSE_train=[]
    MSE_test=[]
    total_train=0
    total_test=0
    for train_index, test_index in kf.split(df):
        X_train= df.iloc[train_index]
        y_train= df_y.iloc[train_index]
        X_test= df.iloc[test_index]
        y_test= df_y.iloc[test_index]
        reg = MLPRegressor(hidden_layer_sizes=(hidden_layer_sizes,),activation='relu', solver='adam', alpha=1e-5, random_state=42)
        reg.fit(X_train,y_train)
        pred_train = reg.predict(X_train)
        pred_test = reg.predict(X_test)
        MSE_train.append(mean_squared_error(y_train, pred_train)*len(train_index))
        MSE_test.append(mean_squared_error(y_test, pred_test)*len(test_index))
        total_train=total_train+len(train_index)
        total_test=total_test+len(test_index)
    # pooled RMSE = sqrt(total squared error / total number of predictions)
    avg_RMSE2_test[n1]=np.sqrt(sum(MSE_test)/total_test)
    avg_RMSE2_train[n1]=np.sqrt(sum(MSE_train)/total_train)
    print("hidden layer sizes=",hidden_layer_sizes)
    print("RMSE_train=",avg_RMSE2_train[n1],"RMSE_test=",avg_RMSE2_test[n1])
    print("")

hidden layer sizes= 50
RMSE_train= 93092.01952988436 RMSE_test= 109192.2180396753

hidden layer sizes= 100
RMSE_train= 98454.99446626086 RMSE_test= 109470.50550361461

hidden layer sizes= 200
RMSE_train= 135745.87355179316 RMSE_test= 147830.46142238282

hidden layer sizes= 300
RMSE_train= 796876.5735459172 RMSE_test= 956341.3975176685

hidden layer sizes= 500
RMSE_train= 1245878.9194948194 RMSE_test= 1405805.2422495356

hidden layer sizes= 600
RMSE_train= 1454415.435327531 RMSE_test= 1314767.5455997884



In [11]:
# Report the lowest cross-validated test RMSE of the one-layer search and the width that achieved it.
best_idx = np.argmin(avg_RMSE2_test)
print("min RMSE in testset=",avg_RMSE2_test[best_idx])
print("hidden layer sizes=",hidden_units[best_idx])

min RMSE in testset= 109192.2180396753
hidden layer sizes= 50


In [12]:
# Refit the one-layer network on the full data set using the width with the lowest
# cross-validated test RMSE, then report the in-sample (training) MSE.
best_width = hidden_units[np.argmin(avg_RMSE2_test)]
reg = MLPRegressor(
    hidden_layer_sizes=(best_width,),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    random_state=42,
)
reg.fit(df,df_y)
pred = reg.predict(df)
print("MSE=",mean_squared_error(df_y, pred))

MSE= 1493991680.3029842


In [13]:

# Grid search over the width of a SECOND hidden layer, scored by K-fold cross-validation.
# The first layer is fixed at 50 units, the width selected by the one-layer search.
# The result arrays are sized from hidden_units so the grid can be edited without touching this cell.
avg_RMSE3_train=np.zeros(len(hidden_units))
avg_RMSE3_test=np.zeros(len(hidden_units))
for n1,hidden_layer_sizes in enumerate(hidden_units):
    # Despite the names, these lists hold each fold's SUM of squared errors (MSE * fold size),
    # so that the RMSE below is pooled over all samples rather than averaged per fold.
    MSE_train=[]
    MSE_test=[]
    total_train=0
    total_test=0
    for train_index, test_index in kf.split(df):
        X_train= df.iloc[train_index]
        y_train= df_y.iloc[train_index]
        X_test= df.iloc[test_index]
        y_test= df_y.iloc[test_index]
        reg = MLPRegressor(hidden_layer_sizes=(50,hidden_layer_sizes,),activation='relu', solver='adam', alpha=1e-5, random_state=42)
        reg.fit(X_train,y_train)
        pred_train = reg.predict(X_train)
        pred_test = reg.predict(X_test)
        MSE_train.append(mean_squared_error(y_train, pred_train)*len(train_index))
        MSE_test.append(mean_squared_error(y_test, pred_test)*len(test_index))
        total_train=total_train+len(train_index)
        total_test=total_test+len(test_index)
    # pooled RMSE = sqrt(total squared error / total number of predictions)
    avg_RMSE3_test[n1]=np.sqrt(sum(MSE_test)/total_test)
    avg_RMSE3_train[n1]=np.sqrt(sum(MSE_train)/total_train)
    print("hidden layer sizes=",hidden_layer_sizes)
    print("RMSE_train=",avg_RMSE3_train[n1],"RMSE_test=",avg_RMSE3_test[n1])
    print("")

hidden layer sizes= 50
RMSE_train= 870742.3002021678 RMSE_test= 918191.8446974328

hidden layer sizes= 100
RMSE_train= 759347.6447368438 RMSE_test= 908047.6200404303

hidden layer sizes= 200
RMSE_train= 1902596.6122279984 RMSE_test= 1746900.2677409204

hidden layer sizes= 300
RMSE_train= 735796.1332734381 RMSE_test= 858461.2409409373

hidden layer sizes= 500
RMSE_train= 1907991.7108592603 RMSE_test= 1647432.8412819405

hidden layer sizes= 600
RMSE_train= 152839.14278786935 RMSE_test= 201978.7064490191



In [14]:
# Report the lowest cross-validated test RMSE of the two-layer search and the second-layer width that achieved it.
best_idx = np.argmin(avg_RMSE3_test)
print("min RMSE in testset=",avg_RMSE3_test[best_idx])
print("hidden layer sizes=",hidden_units[best_idx])

min RMSE in testset= 201978.7064490191
hidden layer sizes= 600


In [16]:
# Refit the two-layer network (first layer fixed at 50 units) on the full data set using the
# second-layer width with the lowest cross-validated test RMSE, then report the in-sample MSE.
best_width = hidden_units[np.argmin(avg_RMSE3_test)]
reg = MLPRegressor(
    hidden_layer_sizes=(50, best_width,),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    random_state=42,
)
reg.fit(df,df_y)
pred = reg.predict(df)
print("MSE=",mean_squared_error(df_y, pred))

MSE= 1435417557274.456


In [17]:

# Grid search over the width of a THIRD hidden layer, scored by K-fold cross-validation.
# The first two layers are fixed at 50 and 600 units, the widths selected by the earlier searches.
# The result arrays are sized from hidden_units so the grid can be edited without touching this cell.
avg_RMSE4_train=np.zeros(len(hidden_units))
avg_RMSE4_test=np.zeros(len(hidden_units))
for n1,hidden_layer_sizes in enumerate(hidden_units):
    # Despite the names, these lists hold each fold's SUM of squared errors (MSE * fold size),
    # so that the RMSE below is pooled over all samples rather than averaged per fold.
    MSE_train=[]
    MSE_test=[]
    total_train=0
    total_test=0
    for train_index, test_index in kf.split(df):
        X_train= df.iloc[train_index]
        y_train= df_y.iloc[train_index]
        X_test= df.iloc[test_index]
        y_test= df_y.iloc[test_index]
        reg = MLPRegressor(hidden_layer_sizes=(50,600,hidden_layer_sizes,),activation='relu', solver='adam', alpha=1e-5, random_state=42)
        reg.fit(X_train,y_train)
        pred_train = reg.predict(X_train)
        pred_test = reg.predict(X_test)
        MSE_train.append(mean_squared_error(y_train, pred_train)*len(train_index))
        MSE_test.append(mean_squared_error(y_test, pred_test)*len(test_index))
        total_train=total_train+len(train_index)
        total_test=total_test+len(test_index)
    # pooled RMSE = sqrt(total squared error / total number of predictions)
    avg_RMSE4_test[n1]=np.sqrt(sum(MSE_test)/total_test)
    avg_RMSE4_train[n1]=np.sqrt(sum(MSE_train)/total_train)
    print("hidden layer sizes=",hidden_layer_sizes)
    print("RMSE_train=",avg_RMSE4_train[n1],"RMSE_test=",avg_RMSE4_test[n1])
    print("")

hidden layer sizes= 50
RMSE_train= 605009.2552429812 RMSE_test= 1028205.4110832777

hidden layer sizes= 100
RMSE_train= 581915.8177173582 RMSE_test= 851870.3980124992

hidden layer sizes= 200
RMSE_train= 370703.05367547716 RMSE_test= 328678.5981202987

hidden layer sizes= 300
RMSE_train= 199592.2255822874 RMSE_test= 225363.95533437002

hidden layer sizes= 500
RMSE_train= 524338.6640544101 RMSE_test= 247449.4941913326

hidden layer sizes= 600
RMSE_train= 748144.2736463654 RMSE_test= 635336.5516964127



In [18]:
# Report the lowest cross-validated test RMSE of the three-layer search and the third-layer width that achieved it.
best_idx = np.argmin(avg_RMSE4_test)
print("min RMSE in testset=",avg_RMSE4_test[best_idx])
print("hidden layer sizes=",hidden_units[best_idx])

min RMSE in testset= 225363.95533437002
hidden layer sizes= 300


In [19]:
# Refit the three-layer network (first two layers fixed at 50 and 600 units) on the full data set
# using the third-layer width with the lowest cross-validated test RMSE, then report the in-sample MSE.
best_width = hidden_units[np.argmin(avg_RMSE4_test)]
reg = MLPRegressor(
    hidden_layer_sizes=(50, 600, best_width,),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    random_state=42,
)
reg.fit(df,df_y)
pred = reg.predict(df)
print("MSE=",mean_squared_error(df_y, pred))

MSE= 357390845.83744156


In [None]:
# Neural network with StandardScaler

In [21]:
# Standardise the features, then train the (50, 600, 300) network on the scaled data
# and report the in-sample (training) MSE.
scaler = StandardScaler()
df_trans = scaler.fit_transform(df)   # fit the scaler on df and transform df in one step
reg = MLPRegressor(
    hidden_layer_sizes=(50, 600, 300,),
    activation='relu',
    solver='adam',
    alpha=1e-5,
    random_state=42,
)
reg.fit(df_trans,df_y)
pred = reg.predict(df_trans)
print("MSE=",mean_squared_error(df_y, pred))

MSE= 9277604.124366865




In [None]:
# Fit an OLS model on the aggregated, unscaled features and print its summary.
ols_regression(df=df, df_y=df_y)