# ClickBait

In [1]:
import math
import os, sys
import random
import shutil
import string
import zipfile

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

# Paths and Variables

In [2]:
# Base name shared by every output artifact (it prefixes the CSV and zip file names).
dataset_name = "clickbait"

In [3]:
# Raw class files are read from input_dir; every generated file goes to output_dir.
input_dir, output_dir = './raw', './processed'

# Plain-CSV output paths: full dataset, train split, test split, and test labels (key).
outp_fname, outp_train_fname, outp_test_fname, outp_test_key_fname = (
    os.path.join(output_dir, f'{dataset_name}{tail}')
    for tail in ('.csv', '_train.csv', '_test.csv', '_test_key.csv')
)

In [4]:
# Column names used throughout the notebook: row identifier, label, and headline text.
id_col, target_col, text_col = "id", "class", "text"

In [5]:
# Each entry pairs a class label with the name of the raw file (inside input_dir)
# whose non-empty lines are loaded as examples of that class.
classes_and_files = [
    ['non_clickbait', 'non_clickbait_data'],
    ['clickbait', 'clickbait_data'],
]

# Read data into a DataFrame

In [7]:
# Settings kept from the original cell. data_retention_perc, test_perc and testing_key
# are not read anywhere in this cell.
data_retention_perc = 0.33

test_perc = 0.1

# Upper bound on the number of lines consumed from each class file.
num_lines_to_read = 1500000
testing_key = []

# Collect [label, text] records in a list and build the DataFrame once at the end.
all_data = []
for class_, f in classes_and_files:
    inputf_full_path = os.path.join(input_dir, f)

    with open(inputf_full_path, encoding="utf-8") as inp_f:
        # Iterate the file handle so the loop stops at end-of-file. The previous
        # range(num_lines_to_read) + readline() loop kept spinning on empty
        # strings for the full 1.5M iterations after the file was exhausted.
        for line_num, raw_line in enumerate(inp_f):
            if line_num >= num_lines_to_read:
                break
            line = raw_line.strip()
            # Blank (whitespace-only) lines carry no example; skip them.
            if line:
                all_data.append([class_, line])
    print("Done with class", class_)

data = pd.DataFrame(all_data, columns=[target_col, text_col])
print(data.head())
print(data.shape)

Done with class non_clickbait
Done with class clickbait
           class                                               text
0  non_clickbait  Bill Changing Credit Card Rules Is Sent to Oba...
1  non_clickbait  In Hollywood, the Easy-Money Generation Toughe...
2  non_clickbait  1700 runners still unaccounted for in UK's Lak...
3  non_clickbait  Yankees Pitchers Trade Fielding Drills for Put...
4  non_clickbait  Large earthquake rattles Indonesia; Seventh in...
(32000, 2)


# Shuffle Data

In [8]:
# Shuffle every row: frac=1 samples the whole frame and random_state=42 fixes the seed.
# The original row labels are kept as the index (they show up in the head() output).
# NOTE(review): `data` is overwritten in place, so re-running this cell on its own
# permutes the already-shuffled frame again and yields a different order.
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,class,text
23100,clickbait,"Inside The Most OMG Scene In ""Creed"""
17362,clickbait,When Do Women Decide To Have Sex
8993,non_clickbait,Guinean military leader in 'favourable' condit...
19566,clickbait,How 2015 Were You
3798,non_clickbait,America's Cup: Team New Zealand wins over Alin...


# Insert Id Column

In [9]:
# Prepend a sequential integer id (0..N-1, in the current row order) as the first
# column. The membership guard makes the cell safe to run more than once.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

       id          class                                               text
23100   0      clickbait               Inside The Most OMG Scene In "Creed"
17362   1      clickbait                   When Do Women Decide To Have Sex
8993    2  non_clickbait  Guinean military leader in 'favourable' condit...
19566   3      clickbait                                  How 2015 Were You
3798    4  non_clickbait  America's Cup: Team New Zealand wins over Alin...


# Utility to Save a DataFrame as a Zipped CSV File

In [10]:
def save_df_to_zipped_csv(df, ftype=None):
    """Write ``df`` as a single CSV file inside a zip archive under ``output_dir``.

    The archive is named ``{dataset_name}{suffix}.zip`` and contains one member,
    ``{dataset_name}{suffix}.csv``, where ``suffix`` is ``_{ftype}`` when ``ftype``
    is given and empty otherwise. Reads the notebook-level ``dataset_name`` and
    ``output_dir`` variables.

    Args:
        df: DataFrame to save. The index is not written.
        ftype: Optional tag appended to the file names (e.g. ``"train"``).

    Returns:
        None.
    """
    suffix = '' if ftype is None else f'_{ftype}'
    base_name = f'{dataset_name}{suffix}'

    # to_csv refuses to write into a directory that does not exist, so make sure
    # the output directory is there first (skip when it is the current directory
    # spelled as an empty string, which makedirs rejects).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    compression_opts = dict(method='zip', archive_name=f'{base_name}.csv')
    df.to_csv(
        os.path.join(output_dir, f'{base_name}.zip'),
        index=False,
        compression=compression_opts,
    )

# Save Main Data File

In [11]:
# Persist the full shuffled dataset (id column included) as a zipped CSV.
# Uncompressed alternative, left disabled:
# data.to_csv(outp_fname, index=False)
save_df_to_zipped_csv(df=data)

# Train Test Split

In [12]:
# Hold out a fraction of the shuffled rows as the test set. The fraction comes from
# test_perc (set in the data-reading cell) rather than a second hard-coded 0.1, so
# there is one place to change it. train_test_split is imported in the top import cell.
test_size = test_perc

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)
print(data_train.shape, data_test.shape)

# Every row must land in exactly one of the two splits.
assert len(data_train) + len(data_test) == len(data)

# The key keeps (id, label) for the held-out rows; the test frame itself is then
# stripped of the label column.
data_test_key = data_test[[id_col, target_col]].copy()
data_test = data_test.drop(columns=[target_col])

# Uncompressed alternative, left disabled:
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)

(28800, 3) (3200, 3)


In [None]:
# Write each split to its own zipped CSV; the tag becomes the file-name suffix.
for split_tag, split_df in (
    ("train", data_train),
    ("test", data_test),
    ("test_key", data_test_key),
):
    save_df_to_zipped_csv(split_df, split_tag)