In [1]:
import json
import datetime,time
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statistics import mean
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Location of the hashtag dumps and the files that make up the training set.
path = "tweet_data/"

files = [
    "tweets_#gohawks.txt",
    "tweets_#gopatriots.txt",
    "tweets_#nfl.txt",
    "tweets_#patriots.txt",
    "tweets_#sb49.txt",
    "tweets_#superbowl.txt",
]
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]

# Echo the index -> file mapping so later cells can refer to files by position.
for i, fl in enumerate(files):
    print(f"files[{i}] => {fl}")

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [3]:
def generate_df(pddata_raw):
    """
    Aggregate per-tweet rows into one row per time slot.

    `pddata_raw` must be sorted by its 'time' column and carry the columns
    'time', 'tweets', 'retweets', 'followers' and 'hr_min'. The result has one
    row for every integer label 0 .. (last time - first time), with columns
    'time unit', 'tweets', 'retweets', 'followers sum', 'followers max' and
    'hr_min'. Slots whose label has no tweets are filled with 0 in every
    aggregated column.
    """
    times = pddata_raw['time']
    n_slots = int(times.iloc[len(times) - 1] - times.iloc[0] + 1)
    slots = pd.RangeIndex(n_slots)

    # One groupby object reused for every aggregate; reindexing on the slot
    # labels leaves NaN for slots without tweets, which fillna turns into 0.
    by_time = pddata_raw.groupby("time")
    per_slot = pd.DataFrame(
        {
            'time unit': list(range(n_slots)),
            'tweets': by_time['tweets'].sum().reindex(slots),
            'retweets': by_time['retweets'].sum().reindex(slots),
            'followers sum': by_time['followers'].sum().reindex(slots),
            'followers max': by_time['followers'].max().reindex(slots),
            'hr_min': by_time['hr_min'].mean().reindex(slots),
        },
        index=slots,
    )

    return per_slot.fillna(0).reset_index(drop=True)

In [4]:

def _features_and_target(rows, time_type):
    """Turn one period's raw rows into (features, next-slot tweet counts)."""
    if not rows:
        raise ValueError("no tweets fall into one of the three periods")

    pddata_raw = transfer_time(rows, time_type)
    # Re-base the slot index so the first slot of the period is 0; generate_df
    # aligns its aggregates on the labels 0 .. n-1.
    pddata_raw['time'] = pddata_raw['time'] - pddata_raw.loc[0, 'time']

    df = generate_df(pddata_raw)
    # Target for slot t is the tweet count (column 1) of slot t+1, so the last
    # slot has no target and is dropped from the features.
    df_y = df.iloc[1:, 1].reset_index(drop=True)
    df = df[:len(df_y)]
    # Column 0 is the 'time unit' counter, which is not used as a feature.
    return df.iloc[:, 1:], df_y


def parse_dataset(path,files):
    """
    Parse tweet files into per-period feature frames and next-slot targets.

    Every non-blank line of every file is decoded as one JSON tweet, from which
    `citation_date`, `metrics.citations.total` (retweets) and
    `author.followers` are read. Tweets are split by `citation_date` into three
    periods around 2015-02-01:
        1. before 08:00          -> aggregated in 1-hour slots
        2. 08:00 up to 20:00     -> aggregated in 5-minute slots
        3. 20:00 and later       -> aggregated in 1-hour slots
    The boundaries are taken in America/Los_Angeles, the same zone
    `transfer_time` uses for bucketing, so the split does not depend on the
    time zone of the machine running the notebook.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with each file name.
    files : iterable of str
        File names to read; all of them are pooled before splitting.

    Returns
    -------
    tuple
        Three `(X, y)` pairs, one per period. `X` has the columns 'tweets',
        'retweets', 'followers sum', 'followers max' and 'hr_min', one row per
        slot (the last slot is dropped); `y` is the number of tweets in the
        following slot.

    Raises
    ------
    ValueError
        If one of the three periods contains no tweets.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')
    start_time = pst_tz.localize(datetime.datetime(2015, 2, 1, 8, 0, 0)).timestamp()
    end_time = pst_tz.localize(datetime.datetime(2015, 2, 1, 20, 0, 0)).timestamp()

    # extract raw features, one bucket per period
    data_raw = [[], [], []]
    for file in files:
        with open(path + file, 'r', encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                a = json.loads(line)
                citation_date = a['citation_date']
                retweet = a['metrics']['citations']['total']
                foll = a['author']['followers']
                # Each row counts as one tweet; the trailing 1 is a placeholder
                # for 'hr_min', which transfer_time overwrites.
                row_tmp = [citation_date, 1, retweet, foll, 1]

                # assign to 3 periods
                if citation_date < start_time:
                    data_raw[0].append(row_tmp)
                elif citation_date < end_time:
                    data_raw[1].append(row_tmp)
                else:
                    data_raw[2].append(row_tmp)

    return (
        _features_and_target(data_raw[0], 'hour'),
        _features_and_target(data_raw[1], 'minute'),
        _features_and_target(data_raw[2], 'hour'),
    )

In [5]:
def transfer_time(data_raw,time_type):
    """
    Build a time-sorted frame of raw tweets and replace timestamps by slot ids.

    Parameters
    ----------
    data_raw : list of rows
        Each row is `[time, tweets, retweets, followers, hr_min]`, where `time`
        is a POSIX timestamp in seconds.
    time_type : {'hour', 'minute'}
        'hour'   -> 'time' becomes ((month-1)*31 + day - 14)*24 + hour and
                    'hr_min' becomes the hour of day;
        'minute' -> 'time' becomes that hour index * 12 + minute // 5 and
                    'hr_min' becomes minute // 5 (the 5-minute slot in the hour).
        All calendar fields are taken in America/Los_Angeles.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by timestamp with a fresh 0..n-1 index and the columns
        'time', 'tweets', 'retweets', 'followers', 'hr_min'.

    Raises
    ------
    ValueError
        If `time_type` is neither 'hour' nor 'minute'. (Previously this only
        printed a message and returned raw timestamps as slot ids.)
    """
    if time_type not in ('hour', 'minute'):
        raise ValueError("Invalid time type")

    # sort according to time
    pddata_raw = pd.DataFrame(data_raw,columns=['time','tweets','retweets','followers','hr_min'])
    pddata_raw = pddata_raw.sort_values(by = 'time').reset_index(drop=True)

    # Convert the whole column at once instead of one datetime per row.
    local = pd.to_datetime(pddata_raw['time'], unit='s', utc=True).dt.tz_convert('America/Los_Angeles')
    hour_of_day = local.dt.hour.astype('int64')
    # (month-1)*31 treats every earlier month as 31 days long.
    # NOTE(review): that is only exact while the data stays within Jan/Feb — confirm.
    hour_accu = ((local.dt.month.astype('int64') - 1) * 31 + local.dt.day.astype('int64') - 14) * 24 + hour_of_day

    if time_type == 'hour':
        pddata_raw['time'] = hour_accu
        pddata_raw['hr_min'] = hour_of_day
    else:
        five_min_slot = local.dt.minute.astype('int64') // 5
        pddata_raw['time'] = hour_accu * 12 + five_min_slot
        pddata_raw['hr_min'] = five_min_slot

    return pddata_raw

In [6]:
def six_times_window(df,df_y,window=5):
    """
    Collapse consecutive slots into sliding-window features.

    For every start row i, the first five columns of `df` are summed over rows
    i .. i+window-1, except column 3 ('followers max'), which takes the maximum
    over the window. The matching target is `df_y` at the window's last row.

    Note that, despite the function name, the default window is 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-slot features; the first five columns are used by position.
    df_y : pandas.Series
        Per-slot targets, at least as long as `df`.
    window : int, default 5
        Number of consecutive rows combined into one output row.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Float features with columns 'tweets', 'retweets', 'followers sum',
        'followers max', 'hr_min' and a single-column (label 0) target frame,
        both with `len(df) - window + 1` rows. Both are empty when `df` has
        fewer than `window` rows.

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    IndexError
        If `df_y` is too short to supply a target for every window.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    feature_names = ['tweets','retweets','followers sum','followers max','hr_min']
    m = df.shape[0]
    n_out = max(m - window + 1, 0)

    values = df.iloc[:, :5].to_numpy(dtype=float)
    # shifted[s][i] is row i+s, so stacking the shifts lines up each window.
    shifted = [values[s:s + n_out] for s in range(window)]

    # Accumulate left to right, the same order the per-cell sums used.
    summed = shifted[0].copy()
    for part in shifted[1:]:
        summed = summed + part
    # 'followers max' is a maximum, so combine it with max rather than a sum.
    summed[:, 3] = np.max([part[:, 3] for part in shifted], axis=0)

    last = window - 1 + n_out
    if n_out and len(df_y) < last:
        raise IndexError("df_y is shorter than the windows built from df")
    targets = np.asarray(df_y.iloc[window - 1:last], dtype=float).reshape(-1, 1)

    df_new = pd.DataFrame(summed, columns=feature_names)
    df_y_new = pd.DataFrame(targets)

    return df_new, df_y_new

In [7]:
def plot_recipe(df_y, pred_y):
    """
    Scatter the fitted values against the true values.

    A dashed black identity line spanning the range of `df_y` is drawn as a
    reference: points on the line are perfect fits.
    """
    marker_area = 4 * np.pi
    lowest, highest = df_y.min(), df_y.max()

    _, ax = plt.subplots()
    ax.scatter(df_y, pred_y, s=marker_area)
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=1)
    ax.set_xlabel('true values')
    ax.set_ylabel('fitted values')
    plt.show()

In [8]:
# Per-slot features and next-slot targets of the training files, one pair per period.
(
    (dfo, dfo_y),
    (dfo_2, dfo_y_2),
    (dfo_3, dfo_y_3),
) = parse_dataset(path, files)

In [9]:
# Sliding-window features / targets for each training period.
df, df_y = six_times_window(dfo, dfo_y)
df_2, df_y_2 = six_times_window(dfo_2, dfo_y_2)
df_3, df_y_3 = six_times_window(dfo_3, dfo_y_3)

In [10]:
# Each test sample ships as one file per period.
test_files_1 = [f"sample0_period{period}.txt" for period in (1, 2, 3)]
test_files_2 = [f"sample1_period{period}.txt" for period in (1, 2, 3)]
test_files_3 = [f"sample2_period{period}.txt" for period in (1, 2, 3)]

In [11]:
# Parse every test sample the same way as the training files.
(
    (df_test1, df_y_test1),
    (df_2_test1, df_y_2_test1),
    (df_3_test1, df_y_3_test1),
) = parse_dataset(path, test_files_1)
(
    (df_test2, df_y_test2),
    (df_2_test2, df_y_2_test2),
    (df_3_test2, df_y_3_test2),
) = parse_dataset(path, test_files_2)
(
    (df_test3, df_y_test3),
    (df_2_test3, df_y_2_test3),
    (df_3_test3, df_y_3_test3),
) = parse_dataset(path, test_files_3)

In [23]:
# Display the period-1 feature frame of test sample 0.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# neighbours, and a later cell reassigns df_test1 to its windowed version, so
# what is shown depends on the order the cells were run in — confirm.
df_test1

Unnamed: 0,tweets,retweets,followers sum,followers max,hr_min
0,52,109,424498.0,168371.0,5
1,79,761,2975692.0,2034387.0,6
2,94,226,860594.0,328882.0,7
3,101,258,2349147.0,368626.0,8
4,122,483,1369748.0,291130.0,9


In [12]:
# Sliding-window features / targets for every test sample and period.
# Each name is overwritten with its windowed version, so this cell is not
# idempotent: running it a second time windows the already-windowed frames.

# test sample 0
df_test1, df_y_test1 = six_times_window(df_test1, df_y_test1)
df_2_test1, df_y_2_test1 = six_times_window(df_2_test1, df_y_2_test1)
df_3_test1, df_y_3_test1 = six_times_window(df_3_test1, df_y_3_test1)

# test sample 1
df_test2, df_y_test2 = six_times_window(df_test2, df_y_test2)
df_2_test2, df_y_2_test2 = six_times_window(df_2_test2, df_y_2_test2)
df_3_test2, df_y_3_test2 = six_times_window(df_3_test2, df_y_3_test2)

# test sample 2
df_test3, df_y_test3 = six_times_window(df_test3, df_y_test3)
df_2_test3, df_y_2_test3 = six_times_window(df_2_test3, df_y_2_test3)
df_3_test3, df_y_3_test3 = six_times_window(df_3_test3, df_y_3_test3)

# RandomForest

In [14]:
# Random forest for period 1 (hourly slots before the 08:00 boundary).
reg = RandomForestRegressor(oob_score=True, n_estimators=200, max_depth=20, min_samples_leaf=2,
                            min_samples_split=10, bootstrap=True, max_features='sqrt', random_state=42)
# df_y is a single-column frame; scikit-learn expects a 1-D target, so flatten
# it instead of relying on the implicit conversion (which emits a warning).
reg.fit(df, df_y.to_numpy().ravel())
pred = reg.predict(df)  # in-sample predictions
pred1 = reg.predict(df_test1)
pred2 = reg.predict(df_test2)
pred3 = reg.predict(df_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_test1,df_y_test2,df_y_test3)
print(pred1,pred2,pred3)

       0
0  120.0        0
0  846.0       0
0  61.0
[340.77519001] [638.33634891] [202.79927484]


  


In [15]:
# Random forest for period 2 (5-minute slots between the 08:00 and 20:00 boundaries).
reg = RandomForestRegressor(oob_score=True, n_estimators=1200, max_depth=20, min_samples_leaf=4,
                            min_samples_split=2, bootstrap=True, max_features='sqrt', random_state=42)
# df_y_2 is a single-column frame; scikit-learn expects a 1-D target, so flatten
# it instead of relying on the implicit conversion (which emits a warning).
reg.fit(df_2, df_y_2.to_numpy().ravel())
pred = reg.predict(df_2)  # in-sample predictions
pred1 = reg.predict(df_2_test1)
pred2 = reg.predict(df_2_test2)
pred3 = reg.predict(df_2_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_2_test1,df_y_2_test2,df_y_2_test3)
print(pred1,pred2,pred3)

  


        0
0  1123.0        0
0  903.0       0
0  28.0
[5062.22864342] [1085.21591104] [1042.27265207]


In [16]:
# Random forest for period 3 (hourly slots from the 20:00 boundary on).
reg = RandomForestRegressor(oob_score=True, n_estimators=200, max_depth=20, min_samples_leaf=4,
                            min_samples_split=2, bootstrap=True, max_features='sqrt', random_state=42)
# df_y_3 is a single-column frame; scikit-learn expects a 1-D target, so flatten
# it instead of relying on the implicit conversion (which emits a warning).
reg.fit(df_3, df_y_3.to_numpy().ravel())
pred = reg.predict(df_3)  # in-sample predictions
pred1 = reg.predict(df_3_test1)
pred2 = reg.predict(df_3_test2)
pred3 = reg.predict(df_3_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_3_test1,df_y_3_test2,df_y_3_test3)
print(pred1,pred2,pred3)

      0
0  87.0       0
0  46.0       0
0  43.0
[103.95607076] [199.11456449] [203.31185616]


  


# GradientBoostingRegressor

In [17]:
# Gradient boosting for period 1 (hourly slots before the 08:00 boundary).
reg = GradientBoostingRegressor(n_estimators=200, max_depth=20, min_samples_leaf=2,
                                min_samples_split=10, max_features='sqrt', random_state=42)
# df_y is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df, df_y.to_numpy().ravel())
pred = reg.predict(df)  # in-sample predictions
pred1 = reg.predict(df_test1)
pred2 = reg.predict(df_test2)
pred3 = reg.predict(df_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_test1,df_y_test2,df_y_test3)
print(pred1,pred2,pred3)

  y = column_or_1d(y, warn=True)


       0
0  120.0        0
0  846.0       0
0  61.0
[298.04358475] [573.50261936] [177.73571437]


In [18]:
# Gradient boosting for period 2 (5-minute slots between the 08:00 and 20:00 boundaries).
reg = GradientBoostingRegressor(n_estimators=1200, max_depth=20, min_samples_leaf=4,
                                min_samples_split=2, max_features='sqrt', random_state=42)
# df_y_2 is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df_2, df_y_2.to_numpy().ravel())
pred = reg.predict(df_2)  # in-sample predictions
pred1 = reg.predict(df_2_test1)
pred2 = reg.predict(df_2_test2)
pred3 = reg.predict(df_2_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_2_test1,df_y_2_test2,df_y_2_test3)
print(pred1,pred2,pred3)

  y = column_or_1d(y, warn=True)


        0
0  1123.0        0
0  903.0       0
0  28.0
[6816.75807019] [1016.80809268] [965.41975729]


In [19]:
# Gradient boosting for period 3 (hourly slots from the 20:00 boundary on).
reg = GradientBoostingRegressor(n_estimators=200, max_depth=20, min_samples_leaf=4,
                                min_samples_split=2, max_features='sqrt', random_state=42)
# df_y_3 is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df_3, df_y_3.to_numpy().ravel())
pred = reg.predict(df_3)  # in-sample predictions
pred1 = reg.predict(df_3_test1)
pred2 = reg.predict(df_3_test2)
pred3 = reg.predict(df_3_test3)
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_3_test1,df_y_3_test2,df_y_3_test3)
print(pred1,pred2,pred3)

      0
0  87.0       0
0  46.0       0
0  43.0
[30.03838049] [11.88626886] [30.80890984]


  y = column_or_1d(y, warn=True)


# Neural Network

In [20]:
# MLP for period 1. Features are standardised with statistics of the training
# frame only; the same fitted scaler is applied to the test samples.
scaler = StandardScaler()
df_trans = scaler.fit_transform(df)
reg = MLPRegressor(hidden_layer_sizes=(50,600,300,), activation='relu', solver='adam', alpha=1e-5, random_state=42)
# df_y is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df_trans, df_y.to_numpy().ravel())
pred = reg.predict(df_trans)
# training-set (in-sample) error
print("MSE=",mean_squared_error(df_y, pred))
pred1 = reg.predict(scaler.transform(df_test1))
pred2 = reg.predict(scaler.transform(df_test2))
pred3 = reg.predict(scaler.transform(df_test3))
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_test1,df_y_test2,df_y_test3)
print(pred1,pred2,pred3)

  y = column_or_1d(y, warn=True)


MSE= 3988798.5055075316
       0
0  120.0        0
0  846.0       0
0  61.0
[138.64449719] [597.74290231] [294.78384788]




In [21]:
# MLP for period 2. Features are standardised with statistics of the training
# frame only; the same fitted scaler is applied to the test samples.
scaler = StandardScaler()
df_2_trans = scaler.fit_transform(df_2)
reg = MLPRegressor(hidden_layer_sizes=(600,600,600,), activation='relu', solver='adam', alpha=1e-5, random_state=42)
# df_y_2 is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df_2_trans, df_y_2.to_numpy().ravel())
pred = reg.predict(df_2_trans)
# training-set (in-sample) error
print("MSE=",mean_squared_error(df_y_2, pred))
pred1 = reg.predict(scaler.transform(df_2_test1))
pred2 = reg.predict(scaler.transform(df_2_test2))
pred3 = reg.predict(scaler.transform(df_2_test3))
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_2_test1,df_y_2_test2,df_y_2_test3)
print(pred1,pred2,pred3)

  y = column_or_1d(y, warn=True)


MSE= 33264948.38992299
        0
0  1123.0        0
0  903.0       0
0  28.0
[6774.20236825] [6051.60427666] [6946.44812688]




In [23]:
# MLP for period 3. Features are standardised with statistics of the training
# frame only; the same fitted scaler is applied to the test samples.
scaler = StandardScaler()
df_3_trans = scaler.fit_transform(df_3)
reg = MLPRegressor(hidden_layer_sizes=(600,600,600,), activation='relu', solver='adam', alpha=1e-5, random_state=42)
# df_y_3 is a single-column frame; passing it as-is triggers the
# `column_or_1d` conversion warning, so hand over a flat 1-D target.
reg.fit(df_3_trans, df_y_3.to_numpy().ravel())
pred = reg.predict(df_3_trans)
# training-set (in-sample) error
print("MSE=",mean_squared_error(df_y_3, pred))
pred1 = reg.predict(scaler.transform(df_3_test1))
pred2 = reg.predict(scaler.transform(df_3_test2))
pred3 = reg.predict(scaler.transform(df_3_test3))
# true next-slot tweet counts of the three test samples, then the predictions
print(df_y_3_test1,df_y_3_test2,df_y_3_test3)
print(pred1,pred2,pred3)

  y = column_or_1d(y, warn=True)


MSE= 183911.8928080028
      0
0  87.0       0
0  46.0       0
0  43.0
[574.67856497] [362.31850104] [402.85187849]


