In [150]:
import os
import json_lines
import pandas as pd
import re
import gensim.parsing.preprocessing

In [151]:
# If it does not exist, download the clickbait training dataset
! [[ ! -d "data" ]] && mkdir data
! [[ ! -d "data/clickbait17-validation-170630" ]] && wget "https://zenodo.org/record/3346491/files/clickbait17-train-170630.zip?download=1" -O data/temp.zip && unzip -q data/temp.zip -d data && rm data/temp.zip && rm -r data/clickbait17-validation-170630/media

In [152]:
def load_data(path):
    instance_file = open(os.path.join(path, 'instances.jsonl'),'rb')
    truth_file = open(os.path.join(path, 'truth.jsonl'),'rb')
    j_instance = []
    j_truth = []
    for item in json_lines.reader(instance_file):
        j_instance.append(item)
    
    for item in json_lines.reader(truth_file):
        j_truth.append(item)
    
    df_instance = pd.DataFrame(j_instance)
    df_truth = pd.DataFrame(j_truth)
    instance_file.close()
    truth_file.close()
    return df_instance, df_truth

In [153]:
# Load data from downloaded file
df, df_truth = load_data('data/clickbait17-validation-170630/')

In [154]:
# Drop the post media columns as we do not use this in the project
df = df.drop('postMedia',1)
df = df.drop('targetCaptions', 1)

In [155]:
def clean_column(column):
    column = [str(x) for x in column]
    column = gensim.parsing.preprocessing.preprocess_documents(column)
    column = [" ".join(x) for x in column]
    
    return column

In [159]:
# Remove unecessary tokens from the columns of the dataframe
df['postText'] = clean_column(df['postText'])
df['targetParagraphs'] = clean_column(df['targetParagraphs'])
df['targetTitle'] = clean_column(df['targetTitle'])
df['targetDescription'] = clean_column(df['targetDescription'])
df['Day'] = [x[:3] for x in df['postTimestamp']]
df['Hour'] = [int(x[11:13]) for x in df['postTimestamp']]

# df['postText'] = [re.sub('[^a-zA-Z0-9 ]','',str(x)) for x in df['postText']]
# df['postText'] = [re.sub('xa0',' ',str(x)) for x in df['postText']]

# df['targetParagraphs'] = [re.sub('[^a-zA-Z0-9 ]','',str(x)) for x in df['targetParagraphs']]
# df['targetParagraphs'] = [re.sub('xa0',' ',str(x)) for x in df['targetParagraphs']]

# df['targetTitle'] = [re.sub('[^a-zA-Z0-9 ]',' ',str(x)) for x in df['targetTitle']]
# df['targetTitle'] = [re.sub('xa0',' ',str(x)) for x in df['targetTitle']]

# df['targetDescription'] = [re.sub('[^a-zA-Z0-9 ]',' ',str(x)) for x in df['targetDescription']]
# df['targetDescription'] = [re.sub('xa0',' ',str(x)) for x in df['targetDescription']]
df

Unnamed: 0,postText,id,targetParagraphs,targetTitle,postTimestamp,targetKeywords,targetDescription,Day,Hour
0,uk’ respon modern slaveri leav victim destitut...,858462320779026433,thousand modern slaveri victim xanot come forw...,‘inexcusable’ failur uk’ respon modern slaveri...,Sat Apr 29 23:25:41 +0000 2017,"modern slavery, Department For Work And Pensio...",“inexcusable” failur uk’ deal modern slaveri l...,Sat,23
1,good,858421020331560960,presid donald trump appoint xapro life advoc p...,donald trump appoint pro life advoc assist sec...,Sat Apr 29 20:41:34 +0000 2017,"Americans United for Life, Dr. Charmaine Yoest...",presid donald trump appoint pro life advoc pre...,Sat,20
2,forgotten trump roast reliv brutal thrash new ...,858368123753435136,xawhit hou correspondents’ dinner xamost enter...,‘forgotten’ trump roast reliv brutal thrash ne...,Sat Apr 29 17:11:23 +0000 2017,"trump whcd, whcd, white house correspondents d...",presid trump won year white hou correspond din...,Sat,17
3,meet happiest dog world,858323428260139008,ador probabl understat ador huski goe maru sha...,meet happiest dog world maru huski look like p...,Sat Apr 29 14:13:46 +0000 2017,"Maru, husky, dogs, pandas, furball, instagram",articl maru huski dog uncanni resembl panda,Sat,14
4,tokyo subwai shut amid fear immin north korean...,858283602626347008,tokyo major subwai sai shut line minut receiv ...,tokyo subwai shut amid fear immin north korean...,Sat Apr 29 11:35:31 +0000 2017,"Tokyo,subway,shut,fears,North,Korean,attack",temporari suspen minut affect peopl servic hal...,Sat,11
...,...,...,...,...,...,...,...,...,...
19533,brazil soccer team pilot final interview plane...,804250183642976256,watch live xajo biden honor senat floor brief ...,nbc new video brazil soccer team pilot’ final ...,Thu Dec 01 09:06:00 +0000 2016,,nbc new,Thu,9
19534,😱😱😱😱😱😱😱😱😱😱😱😱😱😱,804156272086020096,novemb politico report eric trump kill deer re...,politico scoop eric trump kill deer,Thu Dec 01 02:52:50 +0000 2016,Politico Scoop: Eric Trump Killed Two Deer,politico scoop eric trump kill deer,Thu,2
19535,french forest high school wai high rise build ...,804149798651588608,forest high school sydnei northern beach wai h...,french forest high school reloc wai high rise ...,Thu Dec 01 02:27:07 +0000 2016,"frenchs forest, northern beaches, sydney, rede...",forest high school sydnei northern beach wai h...,Thu,2
19536,jeff… bruh,804134698729385984,nfl coach lot inform rememb understand confu a...,angel ram jeff fisher think danni woodhead pla...,Thu Dec 01 01:27:06 +0000 2016,"Humor, Football, NFL, NFC West, Los Angeles Ra...",angel ram new rumor score schedul predict pick...,Thu,1


In [160]:
def time_class(time):
    if time >= 4 and time <12:
        return 'Morning'
    elif time >= 12 and time <17:
        return 'Afternoon'
    elif time >= 17 and time <20:
        return 'Evening'
    else:
        return 'Night'

In [161]:
# Add classification for the time of day each post
df['TimeClass'] = [time_class(hour) for hour in df['Hour']]

In [162]:
# Merge the instance data with the truth scores
df = pd.merge(df,df_truth,left_on='id',right_on='id')

In [163]:
df.to_csv('data/cleaned_clickbait.csv',index=False)