In [1]:
import os
import json_lines
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [2]:
# Download the clickbait training dataset unless it is already unpacked locally.
# First command: create the data/ directory when it is missing.
# Second command: when data/clickbait17-validation-170630 is missing, fetch the
# archive to data/temp.zip, unzip it quietly into data/, delete the archive and
# remove the extracted media/ subdirectory.
# NOTE(review): the archive is named "clickbait17-train-170630" while the guard
# and cleanup paths use "clickbait17-validation-170630" - presumably the archive
# unpacks to that directory name; confirm.
# NOTE(review): `[[ ... ]]` is not POSIX sh syntax - assumes the `!` commands run
# under bash or a compatible shell; confirm.
! [[ ! -d "data" ]] && mkdir data
! [[ ! -d "data/clickbait17-validation-170630" ]] && wget "https://zenodo.org/record/3346491/files/clickbait17-train-170630.zip?download=1" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm -r data/clickbait17-validation-170630/media

In [3]:
def load_data(path):
    """Load the instance and truth tables of a dataset directory.

    Reads ``instances.jsonl`` and ``truth.jsonl`` from ``path`` with
    ``json_lines.reader`` and turns the records of each file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains ``instances.jsonl`` and ``truth.jsonl``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_instance, df_truth)``, one row per record of the respective file.

    Raises
    ------
    OSError
        If either file cannot be opened (e.g. ``FileNotFoundError``).
    """
    def _read_frame(filename):
        # The context manager closes the handle even when reading or parsing
        # raises, so a failed load never leaks an open file.
        with open(os.path.join(path, filename), 'rb') as handle:
            return pd.DataFrame(list(json_lines.reader(handle)))

    df_instance = _read_frame('instances.jsonl')
    df_truth = _read_frame('truth.jsonl')
    return df_instance, df_truth

In [4]:
# Read the instance and truth tables from the downloaded dataset directory
dataset_dir = 'data/clickbait17-validation-170630/'
df, df_truth = load_data(dataset_dir)

In [5]:
# Drop the post media columns as we do not use them in the project.
# `columns=` is used because passing the axis positionally (`df.drop(name, 1)`)
# is deprecated since pandas 1.3 and rejected with a TypeError by pandas 2.x.
df = df.drop(columns=['postMedia', 'targetCaptions'])

In [6]:
# English stop words from NLTK, held in a set for fast membership tests in clean_column.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def _as_text(value):
    """Flatten one cell to a plain string; list-like cells are joined with spaces."""
    if isinstance(value, (list, tuple)):
        return ' '.join(str(item) for item in value)
    return str(value)


def clean_column(column):
    """Normalise every entry of a text column.

    Each entry is flattened to a string, whitespace characters (including the
    non-breaking space ``\\xa0``) are turned into plain spaces, every character
    other than ASCII letters, digits and spaces is removed, the text is
    lower-cased, tokenised with ``word_tokenize`` and stripped of the tokens
    found in ``stop_words``.

    Parameters
    ----------
    column : iterable
        Cell values; each one is either list-like (its items are joined with
        spaces) or anything else accepted by ``str()``.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input entry, in input order.
    """
    cleaned = []
    for value in column:
        # Whitespace must become a space *before* the character filter runs;
        # otherwise a non-breaking space or newline is simply deleted and the
        # two neighbouring words are glued together.
        text = re.sub(r'\s', ' ', _as_text(value))
        text = re.sub('[^a-zA-Z0-9 ]', '', text).lower()
        kept = [word for word in word_tokenize(text) if word not in stop_words]
        cleaned.append(' '.join(kept))

    return cleaned

In [7]:
# Remove unnecessary tokens from the free-text columns of the dataframe
for text_column in ('postText', 'targetParagraphs', 'targetTitle',
                    'targetDescription', 'targetKeywords'):
    df[text_column] = clean_column(df[text_column])

# Slice the day (first three characters) and the hour (characters 11-12, as int)
# out of every timestamp string
timestamps = df['postTimestamp']
df['Day'] = [stamp[:3] for stamp in timestamps]
df['Hour'] = [int(stamp[11:13]) for stamp in timestamps]

In [8]:
def time_class(time):
    """Map an hour value to a part-of-day label.

    Returns 'Morning' for 4 <= time < 12, 'Afternoon' for 12 <= time < 17,
    'Evening' for 17 <= time < 20 and 'Night' for every other value.
    """
    if 4 <= time < 12:
        return 'Morning'
    if 12 <= time < 17:
        return 'Afternoon'
    if 17 <= time < 20:
        return 'Evening'
    return 'Night'

In [9]:
# Label each post with the part of day derived from its hour
df['TimeClass'] = list(map(time_class, df['Hour']))

In [10]:
# Attach the truth scores to the instance rows by joining on the shared 'id' column
df = df.merge(df_truth, how='inner', left_on='id', right_on='id')

In [11]:
# Write the cleaned dataframe to disk as CSV, without the index column
output_csv = 'data/cleaned_clickbait.csv'
df.to_csv(output_csv, index=False)