<a href="https://colab.research.google.com/github/rahul94jh/MSC-Research/blob/main/stop_clicbait_save_as_tfrec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The dataset is avaialable at https://github.com/bhargaviparanjape/clickbait 
This dataset was released as part of the research [Stop Clickbait: Detecting and Preventing Clickbaits in Online News Media](https://arxiv.org/abs/1610.09786)

Please check the tutorial for using the dataset at https://amitness.com/2020/02/tensorflow-hub-for-transfer-learning/ , we will use the URL provided at this website to download the dataset, http://bit.ly/clickbait-data

#Import libraries

In [102]:
import os, math
import numpy as np
import pandas as pd
import requests
import shutil
import re
from pathlib import Path

from matplotlib import pyplot as plt
import tensorflow as tf
print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE # used in tf.data.Dataset API


Tensorflow version 2.5.0


#Configurations

In [103]:
tfrec_root_path = '/content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data'
tfrec_output_path = os.path.join(tfrec_root_path,'stop_clickbait_text')

In [104]:
CLASSES = [b'nonclickbaits', b'clickbaits'] 
SHARDS = 64

#Scripts

In [105]:
#@title "Utilities [RUN ME]"
def preprocess_string(text):
    text_cleaned = re.sub(r"http\S+", "", text, flags=re.IGNORECASE|re.MULTILINE)  # remove any url in the text
    text_cleaned = re.sub(r"\w+@\w{1,}\.\w{1,}","", text_cleaned, flags=re.IGNORECASE|re.MULTILINE) # remove emails from text
    text_cleaned = re.sub(r'[^a-z\s]+',' ',text_cleaned,flags=re.IGNORECASE|re.MULTILINE) # remove non alphabetics char
    text_cleaned = re.sub(r'(\s+)',' ',text_cleaned, flags=re.IGNORECASE|re.MULTILINE)  # remove spaces

    text_cleaned = text_cleaned.lstrip()
    text_cleaned = text_cleaned.rstrip()
    text_cleaned = text_cleaned.lower() 

    return text_cleaned 

In [106]:
#@title "Utilities [RUN ME]"
# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1

def _bytestring_feature(list_of_bytestrings):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))

def _int_feature(list_of_ints): # int64
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))

def _float_feature(list_of_floats): # float32
  return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))
  

def to_tfrecord(tfrec_filewriter, text, label):  
  class_num = label 

  feature = {
      "class": _int_feature([class_num]),        # one class in the list
      "text":  _bytestring_feature([text]),       # text feature   
      "label":         _bytestring_feature([CLASSES[label]]),          # fixed length (1) list of strings, the text label

  }
  return tf.train.Example(features=tf.train.Features(feature=feature))
  


In [107]:
#@title "Utilities [RUN ME]"
def WriteAsTFRecord(dataset, tfrec_root_path, tfrec_output_path):
  print("Writing TFRecords")

  if not os.path.exists(tfrec_root_path):
    os.mkdir(tfrec_root_path)

  for shard, (text, label) in enumerate(dataset):
    # batch size used as shard size here
    shard_size = text.numpy().shape[0]
    # good practice to have the number of records in the filename
    filename = tfrec_output_path + "{:02d}-{}.tfrec".format(shard, shard_size)
  
    with tf.io.TFRecordWriter(filename) as out_file:
      for i in range(shard_size):
        example = to_tfrecord(out_file,
                            text.numpy()[i],
                            label.numpy()[i])
        out_file.write(example.SerializeToString())
      print("Wrote file {} containing {} records".format(filename, shard_size))

In [127]:
#@title "Utilities [RUN ME]"
def read_tfrecord(example):
    features = {
        "class": tf.io.FixedLenFeature([], tf.int64),   # shape [] means scalar
        "text": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.string)  # one bytestring
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)
    
    
    class_num = example['class']
    text = example['text']
    label  = example['label']
    return text, class_num, label


#Load data

In [109]:
import pandas as pd
df = pd.read_csv('http://bit.ly/clickbait-data')

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31986 entries, 0 to 31985
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   31986 non-null  object
 1   label   31986 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 499.9+ KB


In [111]:
df.sample(10)

Unnamed: 0,title,label
1453,GFs Shaved Their BFs' Faces... And Blood Was E...,1
1724,"Bit by Careful Bit, Obama Toughens Stance on Iran",0
2595,27 Of The Most Awesome Trans Moments In 2015,1
21168,Second-Half Defensive Stand Helps No. 2 Oklaho...,0
29273,Cheap Vs. Expensive Vodka Taste Test,1
21469,"Justin Bieber Called Bette Midler ""Britt Meddl...",1
8668,32 Things Karl Pilkington Has 100% Actually Said,1
29191,"If You Aren't Already Terrified Of Pigeons, Th...",1
9843,Kim Kardashian Just Revealed She Uses Tons Of ...,1
4740,"UConn Nears Perfection, but Has Room to Improve",0


In [112]:
# We have equal distribution for clickbait and non-clickbait news headlines, thats good for us.
df.label.value_counts(True)

0    0.500219
1    0.499781
Name: label, dtype: float64

In [113]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/stop_clickbait.csv', index=False)

#Process dataset

In [114]:
df.shape, df.columns

((31986, 2), Index(['title', 'label'], dtype='object'))

In [115]:
df = df.rename(columns={"title": "text", "label": "is_clickbait"})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31986 entries, 0 to 31985
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          31986 non-null  object
 1   is_clickbait  31986 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 499.9+ KB


In [116]:
df['text'] = df['text'].apply(lambda t: preprocess_string(t))

In [117]:
df = df.sample(frac=1).reset_index()

In [118]:
Y = df.pop('is_clickbait')
X_TEXT = df['text']

In [119]:
Y = np.array(Y)
X_TEXT = np.array(X_TEXT)
Y.shape, X_TEXT.shape

((31986,), (31986,))

#Create TF Dataset

In [120]:
dataset = tf.data.Dataset.from_tensor_slices((X_TEXT, Y))

In [121]:
for i, (text, label) in enumerate(dataset.take(9)):
  print(f'text: {text.numpy()} : is_clickbait : {CLASSES[label.numpy()]}')
  

text: b'internet posting says al qaeda plans dirty bomb attack in new york city' : is_clickbait : b'nonclickbaits'
text: b'leisure taiwan launched in taipei world trade center' : is_clickbait : b'nonclickbaits'
text: b'voluntary student unionism bill passes australian house of representatives enters senate' : is_clickbait : b'nonclickbaits'
text: b'the one thing you never noticed about home alone' : is_clickbait : b'clickbaits'
text: b'print version of blender to cease publication' : is_clickbait : b'nonclickbaits'
text: b'take a look back at the biggest black friday toys of the past years' : is_clickbait : b'clickbaits'
text: b'puff pastry recipes that will make every meal a party' : is_clickbait : b'clickbaits'
text: b'this guy pranked his girlfriend every time he took her photo by filming a video instead' : is_clickbait : b'clickbaits'
text: b'tornado touches down in joplin missouri' : is_clickbait : b'nonclickbaits'


#Write data to TFRecord files

In [122]:
nb_texts = len(dataset)
shard_size = math.ceil(1.0 * nb_texts / SHARDS)
print("Pattern matches {} Texts which will be rewritten as {} .tfrec files containing {} texts each.".format(nb_texts, SHARDS, shard_size))

Pattern matches 31986 Texts which will be rewritten as 64 .tfrec files containing 500 texts each.


In [123]:
dataset = dataset.batch(shard_size)

In [125]:
WriteAsTFRecord(dataset, tfrec_root_path, tfrec_output_path)

Writing TFRecords
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text00-500.tfrec containing 500 records
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text01-500.tfrec containing 500 records
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text02-500.tfrec containing 500 records
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text03-500.tfrec containing 500 records
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text04-500.tfrec containing 500 records
Wrote file /content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text05-500.tfrec containing 500 records
Wrote file /content/drive/My

#Read TFRecord files into TF Dataset 

In [136]:
# read from TFRecords. For optimal performance, read from multiple
# TFRecord files at once and set the option experimental_deterministic = False
# to allow order-altering optimizations.

option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

filenames = tf.io.gfile.glob(tfrec_output_path + "*.tfrec")
type(filenames)
#len(filenames)
filenames[0]

'/content/drive/MyDrive/Colab Notebooks/clcikbait_detection/dataset/Stop_clickbait/tfrec_data/stop_clickbait_text00-500.tfrec'

In [131]:
dataset_r = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
dataset_r = dataset_r.with_options(option_no_order)
dataset_r = dataset_r.map(read_tfrecord, num_parallel_calls=AUTO)
# For true randomness, we set the shuffle buffer to the full dataset size.
dataset_r = dataset_r.shuffle(nb_texts)

In [133]:
for i, (text, class_num, label) in enumerate(dataset_r.take(10)):
  print(f'text: {text.numpy()} : is_clickbait : {bool(class_num.numpy)}')
  

text: b'moldovan wines win three medals at contest in bordeaux' : is_clickbait : True
text: b'two pilots dead in richmond plane crash' : is_clickbait : True
text: b'will smith appeared on good morning america to talk about the oscars' : is_clickbait : True
text: b'zac efron and the rock are looking extra fine on the set of baywatch' : is_clickbait : True
text: b'this hunger games interpretation will change the way you see the mockingjay' : is_clickbait : True
text: b'tour de taiwan stage european american cyclists rise up' : is_clickbait : True
text: b'someone at the all india bakchod office has lost their damn mind' : is_clickbait : True
text: b'canadian cops are giving out the most dad winter driving advice on facebook' : is_clickbait : True
text: b'how adulty are you' : is_clickbait : True
text: b'stevie wonder took james corden on the best carpool karaoke ride ever' : is_clickbait : True
