# Question 14

In [11]:
import json
import datetime,time
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statistics import mean
import time

In [12]:
train_path = "../../projects/project_5/ECE219_tweet_data/"

train_files = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt", \
        "tweets_#nfl.txt", "tweets_#patriots.txt", \
        "tweets_#sb49.txt", "tweets_#superbowl.txt"]
topics = ["gohawks", "gopatriots", "nfl", "patriots", "sb49", "superbowl"]

In [13]:
test_path = "../../projects/project_5/ECE219_tweet_test/"

test_files = [["sample0_period1.txt", "sample0_period2.txt", "sample0_period3.txt"], \
              ["sample1_period1.txt", "sample1_period2.txt", "sample1_period3.txt"], \
              ["sample2_period1.txt", "sample2_period2.txt", "sample2_period3.txt"]]

#### Prepare for data

In [14]:
def transfer_time(data_raw,time_type):
    """
    Given UNIX time stored in data_raw["time"], return either hour index or minute index
    and store in "time" depending on the period require.
    """
    pst_tz = pytz.timezone('America/Los_Angeles')
    
    # sort according to time
    pddata_raw = pd.DataFrame(data_raw,columns=['time','tweets','retweets','followers','mentioned',\
                                                'media','active','author','favourites_count','title'])
    pddata_raw = pddata_raw.sort_values(by = 'time')
    pddata_raw = pddata_raw.reset_index(drop=True)               

    # calculate hour index and minute index from time
    if time_type == 'hour':
        hour_accu = []
        for index, row in pddata_raw.iterrows():  
            p = datetime.datetime.fromtimestamp(row['time'], pst_tz)  
            hour_accu.append(((p.month-1)*31+p.day-14)*24+p.hour)                             
        pddata_raw['time'] = hour_accu
    elif time_type == 'minute':
        minu_accu = []
        for index, row in pddata_raw.iterrows():  
            p = datetime.datetime.fromtimestamp(row['time'], pst_tz)                    
            minu_accu.append((((p.month-1)*31+p.day-14)*24 + (p.hour-0))*12 + p.minute//5)             
        pddata_raw['time'] = minu_accu    
    else:
        print("Invalid time type")
        
    return pddata_raw

In [35]:
def six_times_window(df,df_y):
    m=df.shape[0]
    df_new= pd.DataFrame(np.zeros((m-4,10)),columns=['tweets','retweets','followers sum','followers max',\
                                  'mentioned','media','active','author','favourites_count','title'])
    df_y_new= pd.DataFrame(np.zeros((m-4)),columns=['num'])
    for i in range(m-4):
        
        for j in range(9):
            df_new.iloc[i,j]=df.iloc[i,j]+df.iloc[i+1,j]+df.iloc[i+2,j]+df.iloc[i+3,j]+df.iloc[i+4,j]
        j=3
        df_new.iloc[i,3]=np.max([df.iloc[i,j],df.iloc[i+1,j],df.iloc[i+2,j],df.iloc[i+3,j],df.iloc[i+4,j]])
        j=6
        df_new.iloc[i,j]=(df.iloc[i,j]*df.iloc[i,7]+df.iloc[i+1,j]*df.iloc[i+1,7]+df.iloc[i+2,j]*df.iloc[i+2,7]+df.iloc[i+3,j]*df.iloc[i+3,7]+df.iloc[i+4,j]*df.iloc[i+4,7])/(df.iloc[i,7]+df.iloc[i+1,7]+df.iloc[i+2,7]+df.iloc[i+3,7]+df.iloc[i+4,7])
        j=9
        df_new.iloc[i,j]=(df.iloc[i,j]*df.iloc[i,0]+df.iloc[i+1,j]*df.iloc[i+1,0]+df.iloc[i+2,j]*df.iloc[i+2,0]+df.iloc[i+3,j]*df.iloc[i+3,0]+df.iloc[i+4,j]*df.iloc[i+4,0])/(df.iloc[i,0]+df.iloc[i+1,0]+df.iloc[i+2,0]+df.iloc[i+3,0]+df.iloc[i+4,0])
        df_y_new.iloc[i,0]=df_y.iloc[i+4]
    
    return df_new, df_y_new

In [16]:
def generate_df(pddata_raw):
    """
    Create a new dataframe with desired form
    """
    df = pd.DataFrame([],columns=['time unit','tweets','retweets','followers sum','followers max',\
                                  'mentioned','media','active','author','favourites_count','title'])
    
    col = pddata_raw.columns.get_loc('time')
    df['time unit'] = range(int(pddata_raw.iloc[len(pddata_raw.index)-1,col] - pddata_raw.iloc[0,col]+1))

    df['tweets'] = pddata_raw.groupby("time")['tweets'].sum()
    df['retweets'] = pddata_raw.groupby("time")['retweets'].sum()
    df['followers sum'] = pddata_raw.groupby("time")['followers'].sum()
    df['followers max'] = pddata_raw.groupby("time")["followers"].max()
    df['mentioned'] = pddata_raw.groupby("time")['mentioned'].sum()
    df['media'] = pddata_raw.groupby("time")['media'].sum()
    df['active'] = pddata_raw.groupby("time")['active'].mean()  
    df['author'] = pddata_raw.groupby("time")['author'].nunique() # count number of not-repeating authors    
    df['favourites_count'] = pddata_raw.groupby("time")['favourites_count'].sum()
    df['title'] = pddata_raw.groupby("time")['title'].mean()
            
    # reset index of df
    df = df.fillna(0).reset_index(drop=True)
    
    return df

In [17]:
def parse_dataset(path,files):
    """
    Parse x conponent of the dataset into pandas DataFrame including columns of:
    tweets, retweets, total_followers, max_followers, mentioned, media, active, author, favourites_count, title
    with lines of hours,
    where mentioned: number of @ in tweets per hour
          media: number of url in tweets per hour
          active: a measure of active state of author
          author: number of unique authors post tweet per hour
          favourites_count: the total number of likes by this user
          title: length of this tweet's title
    Parse y of dataset as number of tweets in the next hour.
    """
    start_time = time.mktime(time.strptime("2015-02-01 08:00:00",'%Y-%m-%d %H:%M:%S'))
    end_time = time.mktime(time.strptime("2015-02-01 20:00:00",'%Y-%m-%d %H:%M:%S'))   

    start_hour_idx = ((2-1)*31+1-14)*24+8
    end_hour_idx = ((2-1)*31+1-14)*24+20
    start_minute_idx = (((2-1)*31+1-14)*24 + (8-0))*12 + 0//5    
        
    # extract raw features
    data_raw = [[],[],[]]
    for file in files:
        for line in open(path + file, 'r') :
            row_tmp = []
            a = json.loads(line)
            citation_date = a['citation_date']
            tweet = 1
            retweet = a['metrics']['citations']['total']
            foll = a['author']['followers']             
            ment = len(a['tweet']['entities']['user_mentions'])        
            medi = len(a['tweet']['extended_entities']['media']) if 'extended_entities' in a['tweet'] else 0
            hist_tw = a['tweet']['user']["statuses_count"]
            hist_yr = a['tweet']['user']['created_at'][-4:]
            acti = hist_tw/(2015-float(hist_yr)+1) 
            auth = a['author']['name']
            favo = a['tweet']['user']['favourites_count']
            titl = len(a['title'])

            # append to row_tmp
            row_tmp.append(citation_date)        
            row_tmp.append(tweet)        
            row_tmp.append(retweet)
            row_tmp.append(foll)    
            row_tmp.append(ment) 
            row_tmp.append(medi) 
            row_tmp.append(acti)  
            row_tmp.append(auth)
            row_tmp.append(favo)
            row_tmp.append(titl)

            # assign to 3 periods
            if citation_date < start_time:
                data_raw[0].append(row_tmp)
            elif citation_date < end_time:
                data_raw[1].append(row_tmp)
            else:
                data_raw[2].append(row_tmp)            

    # generate raw pandas dataframe
    pddata_raw_1 = transfer_time(data_raw[0],'hour')
    pddata_raw_1['time'] = pddata_raw_1['time'] - pddata_raw_1.loc[0,'time']
    
    pddata_raw_2 = transfer_time(data_raw[1],'minute')
#     pddata_raw_2['time'] = pddata_raw_2['time'] - start_minute_idx
    pddata_raw_2['time'] = pddata_raw_2['time'] - pddata_raw_2.loc[0,'time']

    pddata_raw_3 = transfer_time(data_raw[2],'hour')
#     pddata_raw_3['time'] = pddata_raw_3['time'] - end_hour_idx - 1    
    pddata_raw_3['time'] = pddata_raw_3['time'] - pddata_raw_3.loc[0,'time']
    print('1',pddata_raw_1)
    
    # generate df and df_y for each time slot
    df_1 = generate_df(pddata_raw_1)  
    df_y_1 = df_1.iloc[1:,1].reset_index(drop=True)
    df_1 = df_1[:len(df_y_1)]
    print('2',df_1)
    
    df_2 = generate_df(pddata_raw_2)
    df_y_2 = df_2.iloc[1:,1].reset_index(drop=True)
    df_2 = df_2[:len(df_y_2)]
   
    df_3 = generate_df(pddata_raw_3)
    df_y_3 = df_3.iloc[1:,1].reset_index(drop=True)
    df_3 = df_3[:len(df_y_3)]
    
    return (df_1.iloc[:,1:],df_y_1), (df_2.iloc[:,1:],df_y_2), (df_3.iloc[:,1:],df_y_3)

#### Train model with aggregated data, test model with test files

In [18]:
def train_lr(train_path, train_files):
    """
    This function gets train data and train using a linear regression model
    """    
    (df_1,df_y_1),(df_2,df_y_2),(df_3,df_y_3) = parse_dataset(train_path, train_files)

    df_1, df_y_1  =six_times_window(df_1,df_y_1)
    df_2, df_y_2  =six_times_window(df_2,df_y_2)
    df_3, df_y_3  =six_times_window(df_3,df_y_3)

    reg_1 = LinearRegression().fit(df_1, df_y_1)
    reg_2 = LinearRegression().fit(df_2, df_y_2)
    reg_3 = LinearRegression().fit(df_3, df_y_3)
    
    return (reg_1, reg_2, reg_3)

In [19]:
def test_lr(test_path, test_file, regs):
    """
    This function gets test data and test with a linear regression model
    """    
    reg_1, reg_2, reg_3 = regs
    
    (df_1,df_y_1),(df_2,df_y_2),(df_3,df_y_3) = parse_dataset(test_path, test_file)
    print('3 ',df_1)
    
    df_1, df_y_1  =six_times_window(df_1,df_y_1)
    df_2, df_y_2  =six_times_window(df_2,df_y_2)
    df_3, df_y_3  =six_times_window(df_3,df_y_3)
    
    pred_y_1 = reg_1.predict(df_1)     
    real_y_1 = df_y_1.iloc[-1]
    print("time period 1 true tweet: ",real_y_1, " predicted tweet: ",pred_y_1)

    pred_y_2 = reg_2.predict(df_2)     
    real_y_2 = df_y_2.iloc[-1]
    print("time period 2 true tweet: ",real_y_2, " predicted tweet: ",pred_y_2)

    pred_y_3 = reg_3.predict(df_3)     
    real_y_3 = df_y_3.iloc[-1]
    print("time period 3 true tweet: ",real_y_3, " predicted tweet: ",pred_y_3)

## Results

In [38]:
df_y_1

0    180
1    202
2    294
3    555
4    846
Name: tweets, dtype: int64

In [39]:
df_1

Unnamed: 0,tweets,retweets,followers sum,followers max,mentioned,media,active,author,favourites_count,title
0,203,526,509311.0,104627.0,131,47,7574.478421,188,416661,101.295567
1,180,269,1676912.0,959176.0,69,33,9592.529702,169,336201,101.105556
2,202,382,3434565.0,1813823.0,88,52,7200.795914,196,389427,99.059406
3,294,549,2182170.0,372800.0,119,64,3730.383688,286,495565,97.564626
4,555,906,4242856.0,945273.0,241,118,5234.146475,514,1004865,96.052252


In [40]:
df_1, df_y_1  =six_times_window(df_1,df_y_1)
df_1

Unnamed: 0,tweets,retweets,followers sum,followers max,mentioned,media,active,author,favourites_count,title
0,1434.0,2632.0,12045814.0,1813823.0,648.0,314.0,6070.758673,1353.0,2642719.0,98.162483


In [37]:
(df_1,df_y_1),(df_2,df_y_2),(df_3,df_y_3) = parse_dataset(test_path, test_files[1])

1       time  tweets  retweets  followers  mentioned  media         active  \
0        0       1         1      387.0          0      0     145.000000   
1        0       1         1       96.0          0      1    7488.000000   
2        0       1         1     1768.0          0      0     924.000000   
3        0       1         1        0.0          0      0    1158.000000   
4        0       1         2    54469.0          1      1   14011.833333   
5        0       1         1      359.0          1      0     511.750000   
6        0       1         1      751.0          0      0     122.600000   
7        0       1         1        0.0          0      0    1159.000000   
8        0       1         1      572.0          0      0      78.333333   
9        0       1         1     4064.0          0      0    6412.000000   
10       0       1         1        9.0          0      0      13.000000   
11       0       1         1     3010.0          0      0    1691.000000   
12       0

In [41]:
regs = train_lr(train_path, train_files)
for file in test_files:
    test_lr(test_path, file, regs)

1         time  tweets  retweets  followers  mentioned  media        active  \
0          0       1         1      145.0          0      0  39730.500000   
1          0       1         1     3050.0          0      0  15845.800000   
2          0       1         3     3457.0          0      0   3888.200000   
3          0       1         2    10658.0          0      0  44001.666667   
4          0       1         1      570.0          3      0   5590.285714   
5          0       1         1      833.0          0      1  64033.333333   
6          0       1         2       85.0          2      1     40.500000   
7          0       1         3      120.0          0      1    515.750000   
8          0       1         1      331.0          0      0  34966.500000   
9          0       1         1      166.0          0      0  21115.500000   
10         0       1         1      785.0          0      0  12732.500000   
11         0       1         1     2794.0          0      0  17870.750000 

2      time unit  tweets  retweets  followers sum  followers max  mentioned  \
0            0     111       714       223405.0        41818.0         45   
1            1      89       663       187317.0        19558.0         25   
2            2     110       640      1657090.0      1362401.0         30   
3            3     100       267       165256.0        19558.0         25   
4            4     137       468       444858.0        65150.0         77   
5            5     169       346       683834.0       104412.0         58   
6            6     215       580       993090.0       105750.0        120   
7            7     353      1054      1694989.0       174922.0        148   
8            8     569      3224      5685647.0      1300862.0        273   
9            9     533      1982      1838262.0       358231.0        253   
10          10     529      1127      4111731.0       811823.0        243   
11          11     545      1928      3703322.0      1608967.0        256 

1      time  tweets  retweets  followers  mentioned  media         active  \
0       0       1         1    98191.0          0      1    2715.428571   
1       0       1         1      158.0          0      0    2385.250000   
2       0       1         1      370.0          0      2     302.400000   
3       0       1         2     6549.0          3      0    1987.142857   
4       0       1         1      187.0          1      0    1324.000000   
5       0       1         1     1352.0          0      0    2804.500000   
6       0       1         1    18722.0          0      0     633.666667   
7       0       1         2     1427.0          1      0    2031.428571   
8       0       1         1      617.0          0      1     229.000000   
9       0       1         3      213.0          1      1     608.200000   
10      0       1         2      213.0          1      1     608.200000   
11      0       1         1     3331.0          1      0    3434.125000   
12      0       1      

1       time  tweets  retweets  followers  mentioned  media         active  \
0        0       1         1      387.0          0      0     145.000000   
1        0       1         1       96.0          0      1    7488.000000   
2        0       1         1     1768.0          0      0     924.000000   
3        0       1         1        0.0          0      0    1158.000000   
4        0       1         2    54469.0          1      1   14011.833333   
5        0       1         1      359.0          1      0     511.750000   
6        0       1         1      751.0          0      0     122.600000   
7        0       1         1        0.0          0      0    1159.000000   
8        0       1         1      572.0          0      0      78.333333   
9        0       1         1     4064.0          0      0    6412.000000   
10       0       1         1        9.0          0      0      13.000000   
11       0       1         1     3010.0          0      0    1691.000000   
12       0

1      time  tweets  retweets  followers  mentioned  media         active  \
0       0       1         1      160.0          0      0     126.000000   
1       0       1         1      135.0          0      0     404.750000   
2       0       1         1      547.0          1      0     335.500000   
3       0       1         1      123.0          1      0     168.750000   
4       0       1         1       15.0          0      0      14.000000   
5       0       1         1     1400.0          0      0    8654.000000   
6       0       1         1      249.0          0      0     704.000000   
7       0       1         1        2.0          0      0       4.000000   
8       0       1         1      175.0          0      0     470.714286   
9       0       1         1      381.0          2      0    2865.000000   
10      0       1         1       33.0          0      0      72.000000   
11      0       1         1        2.0          0      0       7.500000   
12      0       1      