# **Internet Article Traffic Prediction**

## Import Packages

In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import jieba
from datetime import datetime
from textblob import TextBlob
import emoji
import re
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

## Read Data

In [39]:
# Load the two labelled splits from the local data directory.
# Later cells read the 'title', 'created_at', hourly like/comment count and
# 'like_count_24h' columns from both frames.
train = pd.read_csv('./data/train_dataset.csv')
test = pd.read_csv('./data/test_dataset.csv')

## Feature Extraction

1. 標題hashtag分類  

In [86]:
def preprocessor(text):
    """Extract the first hashtag name from a title.

    ASCII digits are stripped from the title first. The hashtag name is the
    text between the first '#' and the next space (or the end of the string).
    Returns '' when there is no '#', or when the name is longer than five
    characters.
    """
    without_digits = text.translate(str.maketrans('', '', '1234567890'))
    _, marker, tail = without_digits.partition('#')
    if not marker:   # No hashtag in the title
        return ''
    tag = tail.split(' ', 1)[0]
    # Only hashtag names of at most five characters are kept
    return tag if len(tag) <= 5 else ''

def text_count_vectorizer(data):
    """Fit and return a CountVectorizer on `data`, using `preprocessor` to reduce each title to its hashtag name."""
    # fit() returns the fitted estimator itself, so the call can be chained.
    return CountVectorizer(preprocessor=preprocessor).fit(data)

# Fit the hashtag vocabulary on the training titles only; the same fitted
# vectorizer is reused for every other split.
bag = train['title']
vectorizer = text_count_vectorizer(bag)

# Sample title kept for the (commented-out) debug prints below.
test_case_1 = '#上榜心得 123'
test_case_1_vectorized = vectorizer.transform([test_case_1])

def title_into_hash(title, vectorizer):
    """Transform one title with `vectorizer` and return the hash of its count vector.

    The vectorizer output for the single title is densified, its only row is
    converted to a tuple, and Python's built-in hash() of that tuple is returned.
    """
    dense_counts = vectorizer.transform([title]).toarray()
    first_row = tuple(dense_counts[0])
    return hash(first_row)
# Add the hashed hashtag vector as a column on both splits.
for frame in (train, test):
    frame['title_hash'] = frame['title'].map(lambda title: title_into_hash(title, vectorizer))

# Debug messages
# print("The first 50 hashtag name:\n", vectorizer.get_feature_names_out()[:50], '\n')
# print('Result of title vectorization of test case 1 \'#上榜心得 123\':\n', test_case_1_vectorized, '\n')
# print('Vector size after title vectorization:\n', test_case_1_vectorized.shape, '\n')
# print('The first 5 hash values of training set:\n',train['title_hash'][:5], '\n')
# print('The first 5 hash values of testing set:\n', test['title_hash'][:5])

2. 星期幾以及時間區段

In [41]:
def weekday(time):
    """Return the day of week of a timestamp string: 0 for Monday through 6 for Sunday.

    The last four characters of `time` are discarded before parsing the rest
    as '%Y-%m-%d %H:%M:%S'.
    """
    stamp = time[:-4]
    return datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').weekday()

def time_interval(time):
    """Return the hour of day (0-23) of a timestamp string, i.e. which of 24 one-hour slots it falls in.

    The last four characters of `time` are discarded before parsing the rest
    as '%Y-%m-%d %H:%M:%S'.
    """
    stamp = time[:-4]
    parsed = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    return int(parsed.hour)

# Derive the weekday and hour-of-day columns for both splits
# (per-frame column order: weekday, then time_interval).
for frame in (train, test):
    frame['weekday'] = frame['created_at'].map(weekday)
    frame['time_interval'] = frame['created_at'].map(time_interval)

3. 標題長度

In [42]:
def len_of_text(text):
    """Return the length of `text` as reported by len()."""
    length = len(text)
    return length

# Title length feature for both splits.
for frame in (train, test):
    frame['title_len'] = frame['title'].map(len_of_text)

4. 標題情感

In [43]:
def sub(title):
    """Return the TextBlob subjectivity score of the title."""
    # NOTE(review): TextBlob's default sentiment lexicon is presumably
    # English-only; confirm it yields a useful signal for non-English titles.
    return TextBlob(title).subjectivity

def pol(title):
    """Return the TextBlob polarity score of the title."""
    # NOTE(review): TextBlob's default sentiment lexicon is presumably
    # English-only; confirm it yields a useful signal for non-English titles.
    return TextBlob(title).polarity

# Sentiment features for both splits (per-frame column order: sub, then pol).
for frame in (train, test):
    frame['sub'] = frame['title'].map(sub)
    frame['pol'] = frame['title'].map(pol)

5. 標題是否包含emoji

In [44]:
def has_emoji(text):
    """Return 1 if any single character of `text` is a key of emoji.EMOJI_DATA, otherwise 0."""
    contains_emoji = any(ch in emoji.EMOJI_DATA for ch in text)
    return int(contains_emoji)

# Emoji indicator for both splits.
for frame in (train, test):
    frame['title_has_emoji'] = frame['title'].map(has_emoji)

6. 標題是否包含數字

In [45]:
def has_number(text):
    """Return 1 if `text` contains at least one character matching the regex \\d, otherwise 0."""
    return 1 if re.search(r'\d', text) is not None else 0

# Digit indicator for both splits.
for frame in (train, test):
    frame['title_has_number'] = frame['title'].map(has_number)

## Feature Interaction

1. 按讚數與留言數：後續每一小時對於第一小時的斜率

In [46]:
# Slope of every later hour relative to the first hour:
#   <metric>_diff_k = (<metric>_count_{k+1}h - <metric>_count_1h) / k,  k = 1..5
# Per-frame column order is like_diff_1..5 followed by comment_diff_1..5.
for frame in (train, test):
    for metric in ('like', 'comment'):
        first_hour = frame[f'{metric}_count_1h']
        for step in range(1, 6):
            later_hour = frame[f'{metric}_count_{step + 1}h']
            frame[f'{metric}_diff_{step}'] = (later_hour - first_hour) / step

2. 按讚數與留言數：後一小時減掉前一小時

In [47]:
# Hour-over-hour increments:
#   <metric>_minus_k = <metric>_count_{k+1}h - <metric>_count_kh,  k = 1..5
# Per-frame column order is like_minus_1..5 followed by comment_minus_1..5.
for frame in (train, test):
    for metric in ('like', 'comment'):
        for step in range(1, 6):
            current_hour = frame[f'{metric}_count_{step + 1}h']
            previous_hour = frame[f'{metric}_count_{step}h']
            frame[f'{metric}_minus_{step}'] = current_hour - previous_hour

3. 按讚數：截至這一小時的平均

In [48]:
# Average of the hourly like counts up to and including each hour:
#   like_avg_k = (like_count_1h + ... + like_count_kh) / k,  k = 1..6
for frame in (train, test):
    running_total = frame['like_count_1h']
    frame['like_avg_1'] = running_total   # k = 1: the first-hour count itself, no division
    for hour in range(2, 7):
        running_total = running_total + frame[f'like_count_{hour}h']
        frame[f'like_avg_{hour}'] = running_total / hour

4. 按讚數：三小時移動平均（第一小時和第二小時採用截至這一小時平均）

In [49]:
# Three-hour trailing window over the hourly like counts:
#   like_move_avg_k = (sum of like_count_{max(1, k-2)}h .. like_count_kh) / 3,  k = 1..6
# NOTE(review): the divisor is 3 even for hours 1 and 2, where the window holds
# only one or two counts, whereas the section heading says those hours use the
# to-date average. Any change here must be mirrored in the private-test cell,
# because the scaler is fitted on these columns.
for frame in (train, test):
    for hour in range(1, 7):
        window = [frame[f'like_count_{h}h'] for h in range(max(1, hour - 2), hour + 1)]
        frame[f'like_move_avg_{hour}'] = sum(window[1:], window[0]) / 3

## Generate Training Set and Testing Set for the Model

In [87]:
# Feature matrices: every column except the raw text, the raw timestamp and the target.
# Column order matters from here on, because the frames are converted to
# positional NumPy arrays.
x_train = np.array(train.drop(['title', 'created_at', 'like_count_24h'], axis=1))
y_train = np.array(train['like_count_24h'])
x_test = np.array(test.drop(['title', 'created_at', 'like_count_24h'], axis=1))
y_test = np.array(test['like_count_24h'])

# Standardize all the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

x_train shape: (50000, 55)
x_test shape: (10000, 55)


## DNN Model

In [77]:
def MAPE(y_true, y_pred):
    """Keras metric: mean of |y_true - y_pred| / y_true, with y_true cast to float32.

    The function name is significant: the callbacks in this cell monitor the
    metric as 'val_MAPE'.
    """
    target = tf.cast(y_true, tf.float32)
    relative_error = tf.abs((target - y_pred) / target)
    return tf.reduce_mean(relative_error)

# Widths of the fully connected ReLU hidden layers, in order.
hidden_units = [75, 100, 180, 250, 500, 350, 200, 100, 48, 16, 4]

model = Sequential()

# Only the first layer needs the input shape (the number of feature columns).
model.add(Dense(hidden_units[0], activation="relu", input_shape=x_train.shape[1:]))
for units in hidden_units[1:]:
    model.add(Dense(units, activation="relu"))
model.add(Dense(1))   # Single linear output unit for the regression target

# Optimise mean absolute error; track the custom MAPE metric alongside it.
model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001), metrics=[MAPE])

model.summary()

checkpoint = ModelCheckpoint(filepath='best_model.h5',  # File that stores the best model
                             monitor='val_MAPE',  # Metric to monitor
                             save_best_only=True,  # Keep only the best model seen so far
                             save_weights_only=False,  # Save the full model, not just the weights
                             mode='min',  # Lower val_MAPE is better
                             verbose=1)  # Log a message whenever the model is saved

# mode='min' is spelled out to match the checkpoint: 'val_MAPE' comes from a
# custom metric function, so the callback should not be left to guess whether
# it is to be minimised or maximised.
early_stopping = EarlyStopping(monitor='val_MAPE', mode='min', patience=60)

# NOTE(review): (x_test, y_test) is used here for early stopping and checkpoint
# selection, and the evaluation cell reports a score on the same split, so that
# score is optimistically biased. Consider holding out part of the training
# data for validation instead.
model.fit(x_train, y_train, epochs=600, batch_size=128, shuffle=True, validation_data=(x_test, y_test), callbacks=[early_stopping, checkpoint])

# Restore the weights of the checkpoint with the lowest val_MAPE.
model.load_weights('best_model.h5')

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_219 (Dense)           (None, 75)                4200      
                                                                 
 dense_220 (Dense)           (None, 100)               7600      
                                                                 
 dense_221 (Dense)           (None, 180)               18180     
                                                                 
 dense_222 (Dense)           (None, 250)               45250     
                                                                 
 dense_223 (Dense)           (None, 500)               125500    
                                                                 
 dense_224 (Dense)           (None, 350)               175350    
                                                                 
 dense_225 (Dense)           (None, 200)             

## Evaluation

In [89]:
def MAPE(predict, real):
    """Mean absolute percentage error of `predict` against `real`, as a float.

    Both inputs are flattened to 1-D, so an (n, 1) prediction array (the shape
    used with `model.predict` in this notebook) and an (n,) target array are
    compared element by element. The result is a plain Python float rather than
    a one-element array.

    Note: this definition reuses the name of the Keras metric function defined
    in the model cell and replaces it in the notebook namespace once this cell
    has run.

    Args:
        predict: array-like of predicted values, n elements in total.
        real: array-like of true values, n elements in total. A zero entry
            makes the corresponding ratio (and hence the result) inf or nan.

    Returns:
        mean(|predict - real| / real) as a float.

    Raises:
        ValueError: if the two inputs do not contain the same number of elements.
    """
    # Work in float64 so float32 predictions and integer targets are combined
    # at one consistent precision.
    predicted = np.asarray(predict, dtype=float).reshape(-1)
    actual = np.asarray(real, dtype=float).reshape(-1)
    if predicted.size != actual.size:
        raise ValueError(
            f'predict and real must have the same number of elements, '
            f'got {predicted.size} and {actual.size}'
        )
    # Vectorised replacement for the per-row Python loop.
    return float(np.mean(np.abs(predicted - actual) / actual))

# Predict on the standardized feature matrices of both splits.
pre_train = model.predict(x_train)
pre_test = model.predict(x_test)

# Report MAPE on the training set and on the test set.
print('訓練集: ',MAPE(pre_train, y_train))
print('測試集: ',MAPE(pre_test, y_test))

訓練集:  [0.28179666]
測試集:  [0.315641]


## Predict

In [91]:
private_test = pd.read_csv('./data/intern_homework_private_test_dataset.csv')

# Title / timestamp features, created in the same column order as for the
# train and test frames: (new column, source column, extractor).
derived_columns = [
    ('title_hash', 'title', lambda title: title_into_hash(title, vectorizer)),
    ('weekday', 'created_at', weekday),
    ('time_interval', 'created_at', time_interval),
    ('title_len', 'title', len_of_text),
    ('sub', 'title', sub),
    ('pol', 'title', pol),
    ('title_has_emoji', 'title', has_emoji),
    ('title_has_number', 'title', has_number),
]
for new_col, source_col, extractor in derived_columns:
    private_test[new_col] = private_test[source_col].map(extractor)

# Slope of each later hour relative to the first hour (likes, then comments).
for metric in ('like', 'comment'):
    first_hour = private_test[f'{metric}_count_1h']
    for step in range(1, 6):
        later_hour = private_test[f'{metric}_count_{step + 1}h']
        private_test[f'{metric}_diff_{step}'] = (later_hour - first_hour) / step

# Hour-over-hour increments (likes, then comments).
for metric in ('like', 'comment'):
    for step in range(1, 6):
        current_hour = private_test[f'{metric}_count_{step + 1}h']
        previous_hour = private_test[f'{metric}_count_{step}h']
        private_test[f'{metric}_minus_{step}'] = current_hour - previous_hour

# Average like count up to and including each hour.
running_total = private_test['like_count_1h']
private_test['like_avg_1'] = running_total
for hour in range(2, 7):
    running_total = running_total + private_test[f'like_count_{hour}h']
    private_test[f'like_avg_{hour}'] = running_total / hour

# Three-hour trailing window of like counts; the divisor is always 3, exactly
# as in the train/test feature cell.
for hour in range(1, 7):
    window = [private_test[f'like_count_{h}h'] for h in range(max(1, hour - 2), hour + 1)]
    private_test[f'like_move_avg_{hour}'] = sum(window[1:], window[0]) / 3

# From here on `private_test` is rebound from a DataFrame to the standardized
# NumPy feature matrix (scaled with the scaler fitted on the training split).
private_test = scaler.transform(np.array(private_test.drop(['title', 'created_at'], axis=1)))
print('private_test shape:', private_test.shape)

# Predict, round to whole like counts and write the submission file.
predict_private_test = model.predict(private_test)
result = pd.DataFrame(predict_private_test, columns=['like_count_24h'])
result['like_count_24h'] = result['like_count_24h'].map(round)
result.to_csv('data/result.csv', index = False)

private_test shape: (10000, 55)
