In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from datetime import datetime
from string import punctuation 
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, concatenate,LayerNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from tensorflow import keras
from sklearn import metrics

In [2]:
seed = 71351825
dataset = pd.read_csv('data/youtube_dislike_dataset.csv')
dataset.shape

(37422, 12)

In [3]:
#select data to use
file_US_ids = open("data/unique_ids_US.txt", "r")
US_ids = file_US_ids.read().splitlines()
dataset = dataset[dataset['video_id'].isin(US_ids)]
dataset = dataset[['view_count', 'published_at','likes', 'comment_count','tags','description','dislikes']]
dataset.head()

Unnamed: 0,view_count,published_at,likes,comment_count,tags,description,dislikes
1,15352638,2021-06-10 16:00:00,359277,18729,Migos Avalanche Quality Control Music/Motown R...,"Watch the the official video for Migos - ""Aval...",7479
2,925281,2021-09-20 01:03:32,11212,831,,Hannah Waddingham wins the Emmy for Supporting...,401
4,715724,2021-12-07 13:00:00,32887,1067,retaining wall New Jersey highway Direct Conne...,One of the most important (and innocuous) part...,367
5,36124750,2021-12-01 09:00:03,965069,59657,Kpop girl group 1theK Starshiptv starship MV...,IVE Twitter\n: https://twitter.com/IVEstarship...,16618
8,535044,2021-08-06 12:10:25,9207,1900,the breakfast club breakfast club power1051 ce...,Subscribe NOW to The Breakfast Club: http://ih...,384


In [4]:
#remove null values
dataset.replace(" ", np.nan, inplace=True)
dataset = dataset.dropna()
dataset.isna().sum()

view_count       0
published_at     0
likes            0
comment_count    0
tags             0
description      0
dislikes         0
dtype: int64

In [5]:
#time between video posted and data extracted.
def calTime(time):
  start = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
  end =   datetime.strptime('13/12/2021 00:00:00', '%d/%m/%Y %H:%M:%S') # assuming this is the date that this dataset was extracted
  return np.round((end - start).total_seconds() / 60, 2)

dataset['timesec'] = dataset['published_at'].apply(calTime)

In [6]:
#text cleaning init
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')

contraction_map={
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd've": "how did have",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "might have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "shall'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "will't've": "will not have",
    "would've": "would have",
    "would't": "would not",
    "would't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you have all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\laush\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\laush\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [7]:
#text cleaning function
lemmatizer = WordNetLemmatizer()
in_words = set(nltk.corpus.words.words())
def clean_text(text):
    text = str(text)
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub('"','', text)
    text = ' '.join([contraction_map[t] if t in contraction_map else t for t in text.split(" ")])
    text = re.sub(r"'s\b","",text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in in_words or not w.isalpha())

    text = [word for word in text.split( ) if word not in stopwords.words('english')]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    text = text.lower()
    return text

In [8]:
%time dataset['clean_description'] = dataset['description'].apply(clean_text)
%time dataset['clean_tags'] = dataset['tags'].apply(clean_text)

  text = BeautifulSoup(text, "lxml").text
  text = BeautifulSoup(text, "lxml").text


CPU times: total: 3min 18s
Wall time: 3min 18s


  text = BeautifulSoup(text, "lxml").text


CPU times: total: 45.2 s
Wall time: 45.2 s


In [9]:
#data splitting
dataset_train = dataset.sample(frac = 0.9,random_state = seed)
dataset_test = dataset.drop(dataset_train.index)

X_train = dataset_train.loc[:, dataset.columns != 'dislikes']
Y_train = dataset_train['dislikes'].values
X_test = dataset_test.loc[:, dataset.columns != 'dislikes']
Y_test = dataset_test['dislikes'].values
X_train_numaric = X_train[['view_count', 'likes', 'comment_count', 'timesec']].values
X_train_tags = X_train['clean_tags'].values
X_train_desc = X_train['clean_description'].values
X_test_numaric = X_test[['view_count', 'likes', 'comment_count', 'timesec']].values
X_test_tags = X_test['clean_tags'].values
X_test_desc = X_test['clean_description'].values

In [10]:
#tokenize text data(split data into smaller chunks for machine learning)
def Tokenizer_func(train,test, max_words_length=0, max_seq_len=100):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train)
    
    max_words = 0
    if max_words_length > 0:
        max_words = max_words_length
    else:
        max_words = len(tokenizer.word_counts.items())

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)

    train_sequences = tokenizer.texts_to_sequences(train)
    test_sequences = tokenizer.texts_to_sequences(test)
    
    train = pad_sequences(train_sequences,maxlen=max_seq_len, padding='post')
    test = pad_sequences(test_sequences,maxlen=max_seq_len, padding='post')
    
    voc = tokenizer.num_words +1
    return {'train': train, 'test': test, 'voc': voc, 'max_words':max_words, 'tokenizer': tokenizer}
tags_later = X_train_tags.copy()
desc_later = X_train_desc.copy()

X_tags_processed = Tokenizer_func(X_train_tags,X_test_tags)
X_train_tags,X_test_tags,x_tags_voc,x_tags_max_words,x_tag_tok = X_tags_processed['train'], X_tags_processed['test'],X_tags_processed['voc'],X_tags_processed['max_words'],X_tags_processed['tokenizer']
X_desc_processed = Tokenizer_func(X_train_desc,X_test_desc)
X_train_desc, X_test_desc,x_desc_voc,x_desc_max_words,x_desc_tok = X_desc_processed['train'], X_desc_processed['test'],X_desc_processed['voc'],X_desc_processed['max_words'],X_desc_processed['tokenizer']
X_train_tags = np.reshape(X_train_tags,(X_train_tags.shape[0],X_train_tags.shape[1],1))
X_train_desc = np.reshape(X_train_desc,(X_train_desc.shape[0],X_train_desc.shape[1],1))
X_train_tags.shape, X_train_desc.shape, X_train_numaric.shape

((12182, 100, 1), (12182, 100, 1), (12182, 4))

In [11]:
#normalize numerical data
Sc = StandardScaler()
X_train_numaric = Sc.fit_transform(X_train_numaric)
X_test_numaric = Sc.transform(X_test_numaric)

In [12]:
#define model
tagsInput = Input(shape=(None,), name='tags')
descInput = Input(shape=(None,), name='desc')
numaricInput = Input(shape=(4,), name='numaric')

tags = Embedding(input_dim=x_tags_voc,output_dim=8,input_length=x_tags_max_words)(tagsInput)
tags = LSTM(100,dropout=0.2, return_sequences=True)(tags)
tags = LayerNormalization()(tags)
tags = LSTM(100,dropout=0.4, return_sequences=False)(tags)
tags = LayerNormalization()(tags)

desc = Embedding(input_dim=x_desc_voc,output_dim=8,input_length=x_desc_max_words)(descInput)
desc = LSTM(100,dropout=0.2, return_sequences=True)(desc)
desc = LayerNormalization()(desc)
desc = LSTM(100,dropout=0.4, return_sequences=False)(desc)
desc = LayerNormalization()(desc)

combined = concatenate([tags, desc,numaricInput])
x = Dense(256,activation='relu')(combined)
x = Dense(128,activation='relu')(x)
x = Dense(32,activation='relu')(x)
x = Dense(1,use_bias=True,activation='linear')(x)
model = Model([tagsInput, descInput,numaricInput], x)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001, decay=0.001 / 20), metrics=['mae'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 tags (InputLayer)              [(None, None)]       0           []                               
                                                                                                  
 desc (InputLayer)              [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 8)      70968       ['tags[0][0]']                   
                                                                                                  
 embedding_1 (Embedding)        (None, None, 8)      128728      ['desc[0][0]']                   
                                                                                              

In [13]:
#early stopping to avoid overfitting to training data and train model
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
def train_and_save():
    history = model.fit(
                        x=[X_train_tags, X_train_desc,X_train_numaric],
                        y=Y_train,
                        epochs=500, 
                        batch_size=25,
                        validation_split=0.2,
                        verbose=1,
                        callbacks=[es])
    model.save('model/model')
#train_and_save()
model = tf.keras.models.load_model('model/model')

In [14]:
#prediction on test data
prediction = model.predict([X_test_tags, X_test_desc, X_test_numaric])

print("Mean Absolute Error (MAE) - Test data : ", metrics.mean_absolute_error(Y_test, prediction))
print("Mean Squared Error (MSE) - Test data : ", metrics.mean_squared_error(Y_test, prediction))
print("Root Mean Squared Error (RMSE) - Test data : ", np.sqrt(metrics.mean_squared_error(Y_test, prediction)))
print("Co-efficient of determination (R2 Score): ", metrics.r2_score(Y_test, prediction))

Mean Absolute Error (MAE) - Test data :  3029.742593649747
Mean Squared Error (MSE) - Test data :  171496561.4450279
Root Mean Squared Error (RMSE) - Test data :  13095.669568411839
Co-efficient of determination (R2 Score):  0.5509276035537167


In [15]:
#prediction on additional dataset
dataset = pd.read_csv('data/USvideos.csv')
dataset = dataset[['views', 'publish_time','likes', 'comment_count','tags','description','dislikes']]
dataset.rename(columns = {'views':'view_count', 'publish_time':'published_at'}, inplace = True)

dataset.head()

Unnamed: 0,view_count,published_at,likes,comment_count,tags,description,dislikes
0,748374,2017-11-13T17:13:01.000Z,57527,15954,SHANtell martin,SHANTELL'S CHANNEL - https://www.youtube.com/s...,2966
1,2418783,2017-11-13T07:30:00.000Z,97185,12703,"last week tonight trump presidency|""last week ...","One year after the presidential election, John...",6146
2,3191434,2017-11-12T19:05:24.000Z,146033,8181,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,5339
3,343168,2017-11-13T11:00:04.000Z,10172,2146,"rhett and link|""gmm""|""good mythical morning""|""...",Today we find out if Link is a Nickelback amat...,666
4,2095731,2017-11-12T18:01:41.000Z,132235,17518,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",I know it's been a while since we did this sho...,1989


In [16]:
#remove null values
dataset.replace(" ", np.nan, inplace=True)
dataset = dataset.dropna()
dataset.isna().sum()


view_count       0
published_at     0
likes            0
comment_count    0
tags             0
description      0
dislikes         0
dtype: int64

In [17]:
#Redefine time as this dataset was collected at a different time
def calTime(time):
  start = datetime.strptime(time, '%Y-%m-%dT%H:%M:%S.000Z')
  end =   datetime.strptime('13/06/2019 00:00:00', '%d/%m/%Y %H:%M:%S') # assuming this is the date that this dataset was extracted
  return np.round((end - start).total_seconds() / 60, 2)

dataset['timesec'] = dataset['published_at'].apply(calTime)

In [18]:
%time dataset['clean_description'] = dataset['description'].apply(clean_text)
%time dataset['clean_tags'] = dataset['tags'].apply(clean_text)

  text = BeautifulSoup(text, "lxml").text
  text = BeautifulSoup(text, "lxml").text


CPU times: total: 9min 17s
Wall time: 9min 17s


  text = BeautifulSoup(text, "lxml").text
  text = BeautifulSoup(text, "lxml").text


CPU times: total: 2min 19s
Wall time: 2min 19s


In [19]:
#data splitting

X = dataset.loc[:, dataset.columns != 'dislikes']
Y = dataset['dislikes'].values

X_numaric = X[['view_count', 'likes', 'comment_count', 'timesec']].values
X_tags = X['clean_tags'].values
X_desc = X['clean_description'].values

In [20]:
#tokenize

X_tags_processed = Tokenizer_func(tags_later,X_tags)
X_train_tags,X_tags,x_tags_voc,x_tags_max_words,x_tag_tok = X_tags_processed['train'], X_tags_processed['test'],X_tags_processed['voc'],X_tags_processed['max_words'],X_tags_processed['tokenizer']
X_desc_processed = Tokenizer_func(desc_later,X_desc)
X_train_desc, X_desc,x_desc_voc,x_desc_max_words,x_desc_tok = X_desc_processed['train'], X_desc_processed['test'],X_desc_processed['voc'],X_desc_processed['max_words'],X_desc_processed['tokenizer']

In [21]:
#normalize numerical data
X_numaric = Sc.transform(X_numaric)

In [22]:
#prediction on data
prediction = model.predict([X_tags, X_desc, X_numaric])

print("Mean Absolute Error (MAE) - Test data : ", metrics.mean_absolute_error(Y, prediction))
print("Mean Squared Error (MSE) - Test data : ", metrics.mean_squared_error(Y, prediction))
print("Root Mean Squared Error (RMSE) - Test data : ", np.sqrt(metrics.mean_squared_error(Y, prediction)))
print("Co-efficient of determination (R2 Score): ", metrics.r2_score(Y, prediction))

Mean Absolute Error (MAE) - Test data :  3141.394176743855
Mean Squared Error (MSE) - Test data :  432500179.6998947
Root Mean Squared Error (RMSE) - Test data :  20796.638663493068
Co-efficient of determination (R2 Score):  0.19484647235245056


Overall Deep learning can get much better results as compared to linear regression in this problem and with more fine tuning of the model and more data, it has the potential to be even better.