Importing the clickbait and non-clickbait sets:

In [39]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [40]:
!pip install argparse
!pip install re
!pip install emoji
!pip install pickle
!pip install gensim

Collecting re
[31m  Could not find a version that satisfies the requirement re (from versions: )[0m
[31mNo matching distribution found for re[0m
Collecting pickle
[31m  Could not find a version that satisfies the requirement pickle (from versions: )[0m
[31mNo matching distribution found for pickle[0m


In [0]:
import pickle


youtube_click_bait_df = pickle.load(open("/content/drive/My Drive/youtube/data/clickbait-df", "rb"))
youtube_nonbait_data_df = pickle.load(open("/content/drive/My Drive/youtube/data/non-clickbait-df", "rb"))

In [42]:
youtube_click_bait_df.shape

(18317, 11)

In [43]:
youtube_nonbait_data_df.shape

(19080, 11)

Defining a custom tokenizer:

In [0]:
import re
import emoji
from gensim.parsing.preprocessing import *


def tokenize(string):

    """ Tokenizes a string.
    """
    stop_words = [
        "about", "an", "are", "as", "at", "be", "by", "com", "for", "from", "in", "is", "it", "of", "on", "or", "that",
        "the", "this", "to", "was", "what", "when", "where", "who", "with", "the", "www"
    ]

    string = strip_short(
        strip_multiple_whitespaces(
            strip_punctuation(
                split_alphanum(string))),
        minsize=2)
    
    # Parse emojis:
    emojis = [ c for c in string if c in emoji.UNICODE_EMOJI ]
    
    # Remove every non-word character and stem each word:
    string = stem_text(re.sub(r"[^\w\s,]", "", string))
    
    # List of stems and emojis:
    tokens = string.split() + emojis
    
    for stop_word in stop_words:
        try:
            tokens.remove(stop_word)
        except:
            pass

    return tokens

Add a column containing the tokenized titles:

In [0]:
youtube_click_bait_df["video_title_tokenized"] = youtube_click_bait_df["video_title"].apply(tokenize)
youtube_nonbait_data_df["video_title_tokenized"] = youtube_nonbait_data_df["video_title"].apply(tokenize)

Splitting the data into train and test set:

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split


youtube_click_bait_df["label"] = 1
youtube_nonbait_data_df["label"] = 0

# Consider the same number of samples (18000) for each class:
youtube_click_bait_df = youtube_click_bait_df.sample(frac=1).sample(frac=1).sample(n=18000)
youtube_nonbait_data_df = youtube_nonbait_data_df.sample(frac=1).sample(frac=1).sample(n=18000)

# Build the dataframe:
dataframe = pd.concat([ youtube_click_bait_df, youtube_nonbait_data_df ]).sample(frac=1).sample(frac=1)

x_youtube_train, X_test, y_train, youtube_y_testing = train_test_split(
    dataframe.loc[:, dataframe.columns != "label"], 
    dataframe["label"], 
    test_size=0.2, 
    random_state=42)

# Export them:
pickle.dump(x_youtube_train, open("/content/drive/My Drive/youtube/data/x-train", "wb"))
pickle.dump(y_train, open("/content/drive/My Drive/youtube/data/y-train", "wb"))
pickle.dump(X_test, open("/content/drive/My Drive/youtube/data/x-test", "wb"))
pickle.dump(youtube_y_testing, open("/content/drive/My Drive/youtube/data/y-test", "wb"))

In [47]:
x_youtube_train.shape

(28800, 12)

In [48]:
X_test.shape

(7200, 12)

In [49]:
X_test.head

<bound method NDFrame.head of                      channel_id               channel_name  \
7818   UCpprBWvibvmOlI8yJOEAAjA           Cooking with Dog   
9322   UCIsbLox_y9dCIMLd8tdC6qg                Vanity Fair   
15456  UCXIJgqnII2ZOINSWNOGFThA                   Fox News   
22874  UC4eR_m8Fl0bVB7P_BEzX3fw                     ApexTV   
7047   UCRijo3ddMTht_IHyNSNXpNQ               Dude Perfect   
15409  UC6H07z6zAwbHRl4Lbl0GSsw                   TechZone   
1747   UCqFzWxSCi39LnW1JKFR3efg        Saturday Night Live   
8499   UCPsil91i8gN0XLIbwl3vqsw              Lonely Planet   
10976  UCJx5KP-pCUmL9eZUv-mIcNw               GameTrailers   
19436  UCw7SNYrYei7F5ttQO3o-rpA              disneychannel   
6426   UCDVYQ4Zhbm3S2dlz7P1GBDg                        NFL   
23410  UCTTQAOiR_0DuyQPZ6Dg-LHA                  Talltanic   
6699   UCfM-x2VfpYypdekWVJKHN9g                 Funny Life   
1886   UC1WihDhMU5xLAdQ7-on3u0w          NEWS & CONSPIRACY   
7071   UCRijo3ddMTht_IHyNSNXpNQ         

Defining an embedding function:

In [0]:
import numpy as np


def average_embedding(tokens, word2vec, na_vector=None):

    vectors = list()

    for token in tokens:
        if token in word2vec:
            vectors.append(word2vec[token])

    if len(vectors) == 0 and na_vector is not None:
        vectors.append(na_vector)

    return np.mean(np.array(vectors), axis=0)

Training a Word2Vec model onto the train set:

In [0]:
import gensim


documents = x_youtube_train["video_title_tokenized"]
word2vec = gensim.models.Word2Vec(
    documents,
    size=25,
    window=20,
    min_count=1,
    workers=2
)
word2vec.train(documents, total_examples=len(documents), epochs=30)

# Export it:
pickle.dump(word2vec, open("/content/drive/My Drive/youtube/data/word2vec", "wb"))

Get the titles embeddings:

In [52]:
titles_embeddings = x_youtube_train["video_title_tokenized"].apply(average_embedding, word2vec=word2vec)
youtube_training_set = pd.concat(
    [
        x_youtube_train[["video_views", "video_likes", "video_dislikes", "video_comments"]],
        titles_embeddings.apply(pd.Series)
    ], axis=1)
# Add the label column:
youtube_training_set["label"] = y_train
# Drop rows with missing values:
youtube_training_set = youtube_training_set.dropna()

# Compute the average vector representation on the train set, and export it:
mean_title_embedding = titles_embeddings.dropna().mean(axis=0)
pickle.dump(mean_title_embedding, open("mean-title-embedding", "wb"))

# For the test set use the mean title embedding computed on the train set:
titles_embeddings = X_test["video_title_tokenized"].apply(average_embedding, word2vec=word2vec, na_vector=mean_title_embedding)
youtube_test_data_set = pd.concat(
    [
        X_test[["video_views", "video_likes", "video_dislikes", "video_comments"]],
        titles_embeddings.apply(pd.Series)
    ], axis=1)
youtube_test_data_set["label"] = youtube_y_testing

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  out=out, **kwargs)


In [53]:
youtube_training_set.shape

(28798, 30)

In [54]:
youtube_test_data_set.shape

(7200, 30)

Considering the logarithm of the video metadata (views, likes, dislikes, comments) to have a normal distribution of values:

In [0]:
# Compute the logarithm of the video metadata (likes, dislikes, comments, views)
youtube_training_set[["video_views", "video_likes", "video_dislikes", "video_comments"]] = youtube_training_set[["video_views", "video_likes", "video_dislikes", "video_comments"]].apply(np.log)
youtube_test_data_set[["video_views", "video_likes", "video_dislikes", "video_comments"]] = youtube_test_data_set[["video_views", "video_likes", "video_dislikes", "video_comments"]].apply(np.log)

# Replace any -Inf value with 0:
youtube_training_set = youtube_training_set.replace(-np.inf, 0)
youtube_test_data_set = youtube_test_data_set.replace(-np.inf, 0)

In [0]:
# Remove the label columns:
train_labels = youtube_training_set["label"]
test_labels = youtube_test_data_set["label"]

youtube_training_set = youtube_training_set.drop(columns=["label"])
youtube_test_data_set = youtube_test_data_set.drop(columns=["label"])

# Export the mean values of the metadata in the train set:
pickle.dump(youtube_training_set["video_views"].mean(), open("/content/drive/My Drive/youtube/data/mean-log-video-views", "wb"))
pickle.dump(youtube_training_set["video_likes"].mean(), open("/content/drive/My Drive/youtube/data/mean-log-video-likes", "wb"))
pickle.dump(youtube_training_set["video_dislikes"].mean(), open("/content/drive/My Drive/youtube/data/mean-log-video-dislikes", "wb"))
pickle.dump(youtube_training_set["video_comments"].mean(), open("/content/drive/My Drive/youtube/data/mean-log-video-comments", "wb"))

Train a min-max scaler onto the train set and apply it onto both sets:

In [0]:
from sklearn import preprocessing


min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(youtube_training_set)
youtube_training_set = pd.DataFrame(min_max_scaler.transform(youtube_training_set), columns=youtube_training_set.columns)
youtube_test_data_set = pd.DataFrame(min_max_scaler.transform(youtube_test_data_set), columns=youtube_test_data_set.columns)

# Export it:
pickle.dump(min_max_scaler, open("/content/drive/My Drive/youtube/data/min-max-scaler", "wb"))

Train a SVM model:

In [0]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score







In [0]:
svm_params = [
    { "C": np.linspace(1, 25, 10), "gamma": np.linspace(1, 5, 10) },
]

grid_search_cv = GridSearchCV(estimator=SVC(kernel="rbf"), param_grid=svm_params, n_jobs=2, scoring="f1", verbose=3)
grid_search_cv.fit(youtube_training_set, train_labels)

predictions = grid_search_cv.predict(youtube_test_data_set)

print("Best SVM with:")
print("\tC:", grid_search_cv.best_params_["C"])
print("\tgamma:", grid_search_cv.best_params_["gamma"])
print("\tBest Score (F1):", grid_search_cv.best_score_)
print("Performance on the test set (%d samples):" % len(youtube_test_data_set))
print("\tAccuracy Score:", accuracy_score(test_labels, predictions))
print("\tArea under ROC curve:", roc_auc_score(test_labels, predictions))
print("\tClassification report (on the test set):")
print(classification_report(test_labels, predictions))

# Export the best estimator:
pickle.dump(grid_search_cv.best_estimator_, open("/content/drive/My Drive/youtube/data/svm", "wb"))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] C=1.0, gamma=1.0 ................................................
[CV] C=1.0, gamma=1.0 ................................................
