# TA NOTE
The data files for this notebook may have been moved from `data/` to `data/TA_DATA/`; if a load below fails, update the `./data/` paths accordingly.

# Setup

In [2]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import scipy as sp
import re
import nltk
nltk.download('popular')
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

import gzip

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/rutomo/nltk_data...
[nltk_data]    |   Package movie_reviews is already

# Create Dataset and Remove Profanity

## Create Clickbait Dataset

In [4]:
%%time
# Read the clickbait headlines from the gzipped file, skipping lines that are only a newline
with gzip.open('./data/clickbait_data.gz') as fin:
    txt_clickbait = [raw.decode('utf-8') for raw in fin if raw != b'\n']

# Read the non-clickbait headlines the same way
with gzip.open('./data/non_clickbait_data.gz') as fin:
    txt_non_clickbait = [raw.decode('utf-8') for raw in fin if raw != b'\n']

CPU times: user 79.1 ms, sys: 4.81 ms, total: 83.9 ms
Wall time: 86.5 ms


In [None]:
%%time
# Read the profanity word list: one entry per line, surrounding whitespace stripped
with open('./data/profanity_list.txt') as fin:
    plist = [entry.strip() for entry in fin]

def remove_profanity(dataset, plist):
    """Drop every line that contains any entry of ``plist`` as a substring.

    Each line is lowercased before the check, while the entries of ``plist``
    are used exactly as given. The check is a plain substring test, so an
    entry also matches when it appears inside a longer word.

    Parameters
    ----------
    dataset : iterable of str
        Lines (e.g. headlines) to filter.
    plist : iterable of str
        Terms to look for. Empty entries are ignored.

    Returns
    -------
    data : list of str
        The lines in which no term was found, in their original order.
    count : int
        Number of lines that were dropped.
    word_list : set of str
        For every dropped line, the first term of ``plist`` that matched it.
    """
    # An empty string is a substring of every line, so a blank entry in the
    # word list would otherwise flag (and drop) the entire dataset.
    terms = [word for word in plist if word]

    count, word_list, data = 0, set(), []
    for line in dataset:
        lowered = line.lower()  # lowercase once per line, not once per term
        # First matching term in list order, or None when the line is clean
        hit = next((word for word in terms if word in lowered), None)
        if hit is None:
            data.append(line)
        else:
            count += 1
            word_list.add(hit)
    return data, count, word_list

# Filter both headline collections through the profanity list
data_c, count_c, word_list_c = remove_profanity(txt_clickbait, plist)
data_nc, count_nc, word_list_nc = remove_profanity(txt_non_clickbait, plist)

# Report how many headlines were dropped per class, then every term that triggered a drop
print(count_c, count_nc)
print(word_list_c | word_list_nc)

In [200]:
# Wrap each headline in a record dict; label 1 marks clickbait, label 0 non-clickbait
dl_c = np.array([{'headline': title, 'label': 1} for title in data_c])
dl_nc = np.array([{'headline': title, 'label': 0} for title in data_nc])
print('clickbait length:', len(dl_c), ' | non_clickbait length:', len(dl_nc))

clickbait length: 13043  | non_clickbait length: 12183


In [201]:
np.random.seed(42)
dataset_size = 12000

# Draw the same number of distinct indices from each class so both classes are equally represented
idx_c = np.random.choice(len(dl_c), size=dataset_size, replace=False)
idx_nc = np.random.choice(len(dl_nc), size=dataset_size, replace=False)

# The first 80% of each index sample goes to train, the remainder to test
split = int(len(idx_c)*0.8)
train_idx_c = idx_c[:split]
test_idx_c = idx_c[split:]
train_idx_nc = idx_nc[:split]
test_idx_nc = idx_nc[split:]

In [202]:
# Inspect the sampled clickbait training indices
train_idx_c

array([10021,  6480, 12773, ...,   115,  1863,  3119])

In [211]:
# Stack the clickbait and non-clickbait records of each split into a single array
train_data = np.concatenate([dl_c[train_idx_c], dl_nc[train_idx_nc]])
test_data = np.concatenate([dl_c[test_idx_c], dl_nc[test_idx_nc]])

# Draw a seeded random ordering for each split and apply it,
# so the two classes are no longer stored one after the other
np.random.seed(42)
train_idx = np.random.choice(len(train_data), size=len(train_data), replace=False)
test_idx = np.random.choice(len(test_data), size=len(test_data), replace=False)
train_data, test_data = train_data[train_idx], test_data[test_idx]

In [226]:
# Build one dataframe per split from the record dicts
df_train = pd.DataFrame(train_data.tolist())
df_test = pd.DataFrame(test_data.tolist())

# Save test and train as csv
# df_train.to_csv('./data/train.csv')
# df_test.to_csv('./data/test.csv')

# Pull the headline and label columns out as plain Python lists
x_train = list(df_train['headline'])
y_train = list(df_train['label'])
x_test = list(df_test['headline'])
y_test = list(df_test['label'])

In [229]:
# Preview the first five training headlines alongside their labels
x_train[:5], y_train[:5]

(["A 65-Year-Old Man's Typewriter Was Destroyed By An Angry Cop, And The Internet Got Him A New One\n",
  'Can You Identify These United States Leaders\n',
  'Index of Economic Activity Declined in March\n',
  "2015's Best News Bloopers Are Here And They're Out Of Control\n",
  '18 Pictures Everyone Who Loves Spilling The Tea Will Understand\n'],
 [1, 1, 0, 1, 1])

In [230]:
# Preview the first five test headlines alongside their labels
x_test[:5], y_test[:5]

(["19 Things Anyone Who's Best Friends With Their Mum As An Adult Will Understand\n",
  '6.2 magnitude earthquake hits northern Chile\n',
  'Which Of The Great Lakes Are You\n',
  'In Loneliness, Immigrants Tend the Flock\n',
  '19 Things That Happen When You Have The Sunday Scaries\n'],
 [1, 0, 1, 0, 1])

## Create WOS Dataset

In [262]:
# Folder holding the WOS files (alternative kept for reference: './data/WOS5736/X.txt')
wos_path = './data/WOS46985/'
# Label ids from YL1.txt that are kept, each with a short domain name
wos_label = {0: 'CS', 1: 'ECE', 4: 'Civil', 5: 'Medical'}

# YL1.txt: one integer label per line
with open(wos_path + 'YL1.txt') as fin:
    y_wos = [int(row.strip()) for row in fin]

# X.txt: one text per line; the trailing newline of each line is kept
with open(wos_path + 'X.txt') as fin:
    txt_wos = list(fin)

In [263]:
# Read the avoid-list terms: one per line, surrounding whitespace stripped
with open('./data/avoid_list.txt') as fin:
    alist = [term.strip() for term in fin]
    
def detect_profanity(text, alist):
    """Report whether any term of ``alist`` occurs as a whole token of ``text``.

    ``text`` is lowercased and split on whitespace. A term matches only when it
    is exactly equal to one of the resulting tokens, so punctuation attached to
    a word prevents a match. Terms are compared as given, without lowercasing.

    Parameters
    ----------
    text : str
        The text to scan.
    alist : iterable of str
        Terms to look for, checked in order.

    Returns
    -------
    (bool, str or None)
        ``(True, term)`` for the first term of ``alist`` that is present,
        otherwise ``(False, None)``.

    Side effects
    ------------
    Prints ``found`` to stdout when a term matches.
    """
    # A set gives one hash lookup per term instead of a scan over every token
    tokens = set(text.lower().split())
    for word in alist:
        if word in tokens:
            print('found')
            return True, word
    return False, None

In [264]:
%%time
# Per-label containers, keyed by the label ids in wos_label
data2_wos = {i:[] for i in wos_label.keys()}   # kept texts
idx2_wos = {i:[] for i in wos_label.keys()}    # 1-based line numbers of the kept texts
wos2_count = {i:0 for i in wos_label.keys()}   # how many texts have been kept so far
# Label of every kept text, in file order. It must be created here: the loop
# appends to it, and without this line a fresh kernel raises NameError.
label2_wos = []

# Maximum number of texts kept per label
samples_per_label = 500

for i, currlabel in enumerate(y_wos):
    # Skip labels that are not in wos_label and labels whose quota is already full
    if currlabel not in wos2_count or wos2_count[currlabel] >= samples_per_label:
        continue
    # NOTE(review): this screens against `plist` (profanity_list.txt), not the
    # `alist` loaded from avoid_list.txt -- confirm which list is intended.
    p_found, word = detect_profanity(txt_wos[i], plist)
    if p_found:
        print('profanity detected at i:', i, 'word:', word, 'label:', wos_label[currlabel] )
        continue
    data2_wos[currlabel].append(txt_wos[i])
    idx2_wos[currlabel].append(i + 1)
    label2_wos.append(currlabel)
    wos2_count[currlabel] += 1

found
profanity detected at i: 21 word: jerk label: CS
found
profanity detected at i: 25 word: oral label: Medical
found
profanity detected at i: 31 word: slope label: ECE
found
profanity detected at i: 50 word: woody label: CS
found
profanity detected at i: 52 word: sexual label: Medical
found
profanity detected at i: 64 word: scrotum label: Medical
found
profanity detected at i: 65 word: sex label: Medical
found
profanity detected at i: 66 word: ovary label: Medical
found
profanity detected at i: 67 word: sex label: Medical
found
profanity detected at i: 68 word: gonads label: Medical
found
profanity detected at i: 93 word: penetration label: Civil
found
profanity detected at i: 96 word: oral label: Medical
found
profanity detected at i: 101 word: sexual label: Medical
found
profanity detected at i: 116 word: strip label: ECE
found
profanity detected at i: 143 word: facial label: Medical
found
profanity detected at i: 217 word: slave label: ECE
found
profanity detected at i: 246 word

found
profanity detected at i: 3550 word: strip label: Civil
found
profanity detected at i: 3667 word: strip label: Civil
found
profanity detected at i: 4074 word: wang label: Civil
found
profanity detected at i: 4285 word: slope label: Civil
found
profanity detected at i: 9471 word: slope label: Civil
CPU times: user 3 s, sys: 77.1 ms, total: 3.08 s
Wall time: 3.13 s


In [223]:
# Number of texts kept for each label id
wos2_count

{0: 500, 1: 500, 4: 500, 5: 500}

In [224]:
# For every label, print the first three kept entries:
# domain name with the stored line number, followed by the text itself
for lbl in wos2_count:
    kept_ids, kept_texts = idx2_wos[lbl], data2_wos[lbl]
    for i in range(3):
        print(wos_label[lbl], kept_ids[i])
        print(kept_texts[i])

CS 1
(2 + 1)-dimensional non-linear optical waves through the coherently excited resonant medium doped with the erbium atoms can be described by a (2 + 1)-dimensional non-linear Schrodinger equation coupled with the self-induced transparency equations. For such a system, via the Hirota method and symbolic computation, linear forms, one-, two-and N-soliton solutions are obtained. Asymptotic analysis is conducted and suggests that the interaction between the two solitons is elastic. Bright solitons are obtained for the fields E and P, while the dark ones for the field N, with E as the electric field, P as the polarization in the resonant medium induced by the electric field, and N as the population inversion profile of the dopant atoms. Head-on interaction between the bidirectional two solitons and overtaking interaction between the unidirectional two solitons are seen. Influence of the averaged natural frequency. on the solitons are studied: (1). can affect the velocities of all the sol

In [265]:
# Per label: the first 80% of the 500 kept texts go to train, the rest to test
split = int(500*0.8)
label_ids = list(wos_label.keys())

x_wos_train = np.concatenate([data2_wos[i][:split] for i in label_ids])
y_wos_train = np.concatenate([np.full(split, i, dtype=int) for i in label_ids])
x_wos_test = np.concatenate([data2_wos[i][split:] for i in label_ids])
y_wos_test = np.concatenate([np.full(500 - split, i, dtype=int) for i in label_ids])

# Pair every text with its label as a two-column array (column 0: text, column 1: label)
wos_train = np.vstack([x_wos_train, y_wos_train]).T
wos_test = np.vstack([x_wos_test, y_wos_test]).T

In [266]:
# Shapes of the stacked (text, label) train and test arrays
wos_train.shape, wos_test.shape

((1600, 2), (400, 2))

In [279]:
# Turn every (text, label) row into a record dict and add the domain name that belongs to the label
train_wos = np.array([
    {'article': article, 'label': label, 'domain': wos_label[int(label)]}
    for article, label in wos_train
])

test_wos = np.array([
    {'article': article, 'label': label, 'domain': wos_label[int(label)]}
    for article, label in wos_test
])

In [280]:
# Look at a single training record
train_wos[1]

{'article': '(Objective) In order to increase classification accuracy of tea-category identification (TCI) system, this paper proposed a novel approach. (Method) The proposed methods first extracted 64 color histogram to obtain color information, and 16 wavelet packet entropy to obtain the texture information. With the aim of reducing the 80 features, principal component analysis was harnessed. The reduced features were used as input to generalized eigenvalue proximal support vector machine (GEPSVM). Winner-takes-all (WTA) was used to handle the multiclass problem. Two kernels were tested, linear kernel and Radial basis function (RBF) kernel. Ten repetitions of 10-fold stratified cross validation technique were used to estimate the out-of-sample errors. We named our method as GEPSVM + RBF + WTA and GEPSVM + WTA. (Result) The results showed that PCA reduced the 80 features to merely five with explaining 99.90% of total variance. The recall rate of GEPSVM + RBF + WTA achieved the highest

In [281]:
# Shuffle the WOS train and test records with a seeded random ordering
np.random.seed(42)
train_idx_wos = np.random.choice(len(train_wos), size=len(train_wos), replace=False)
test_idx_wos = np.random.choice(len(test_wos), size=len(test_wos), replace=False)
train_wos, test_wos = train_wos[train_idx_wos], test_wos[test_idx_wos]

In [282]:
# Look at the record now stored at position 1
train_wos[1]

{'article': 'An automatized procedure for the parameterization of fundamental equations of state (EOS) that are explicit in terms of the Helmholtz energy and are based on molecular simulation data is presented. The simulation runs are carried out via a cloud-based framework that combines multiple, distributed computing resources. A user-friendly graphical user interface ensures that minimal knowledge about the background operations is required. In order to exemplify the capabilities of this approach an EOS for ethylene oxide is created and compared to data from the literature. (C) 2016 Elsevier B.V. All rights reserved.\n',
 'domain': 'CS',
 'label': '0'}

In [284]:
# Build one dataframe per WOS split from the record dicts
df_train_wos = pd.DataFrame(train_wos.tolist())
df_test_wos = pd.DataFrame(test_wos.tolist())

# Save test and train as csv
# df_train_wos.to_csv('./data/train_wos.csv')
# df_test_wos.to_csv('./data/test_wos.csv')

# Pull the article and label columns out as plain Python lists
x_train_wos = list(df_train_wos['article'])
y_train_wos = list(df_train_wos['label'])
x_test_wos = list(df_test_wos['article'])
y_test_wos = list(df_test_wos['label'])

# Load Dataset

In [3]:
###############################
### DO NOT CHANGE THIS CELL ###
###############################

# Read the clickbait train and test splits from csv
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Separate dataframes into train and test lists
x_train, y_train = list(df_train['headline']), list(df_train['label'])
x_test, y_test = list(df_test['headline']), list(df_test['label'])

In [4]:
###############################
### DO NOT CHANGE THIS CELL ###
###############################

# Read the WOS train and test splits from csv
df_train_wos = pd.read_csv('./data/train_wos.csv')
df_test_wos = pd.read_csv('./data/test_wos.csv')

# Separate dataframes into train and test lists
x_train_wos, y_train_wos = list(df_train_wos['article']), list(df_train_wos['label'])
x_test_wos, y_test_wos = list(df_test_wos['article']), list(df_test_wos['label'])