In [1]:
import gzip
import tqdm.notebook as tqdm
from collections import defaultdict
import json

# Load the Goodreads spoiler reviews. Each line of the file is decoded as its
# own JSON document, so the file is streamed line by line instead of being
# parsed as a whole.
#
# The file is read to its end rather than for a fixed number of readline()
# calls: a shorter file no longer fails on an empty string and a longer one is
# no longer silently truncated. `total` only sizes the progress bar.
# The `with` block guarantees the handle is closed once loading finishes.
with open("../data/goodreads_reviews_spoiler.json", encoding="utf-8") as f:
    lines = [
        json.loads(raw_line)
        for raw_line in tqdm.tqdm(f, total=1378033)
        if raw_line.strip()  # tolerate blank lines (e.g. a trailing newline)
    ]

HBox(children=(IntProgress(value=0, max=1378033), HTML(value='')))




In [2]:
import random

# Fixed seed so that Restart & Run All always produces the same
# train/validation/test split. A private Random instance is used so the global
# random state is left untouched.
# NOTE: the shuffle is in place, so re-running this cell without reloading
# `lines` shuffles an already shuffled list and yields a different split.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(lines)

# First 20% -> test, next 10,000 records -> validation, remainder -> train.
test_size = int(len(lines) * 0.2)
test_lines = lines[:test_size]
valid_lines = lines[test_size:test_size + 10000]
train_lines = lines[test_size + 10000:]

In [3]:
import string
# NOTE(review): this star import is what brings `PorterStemmer` into scope.
# An explicit `from nltk.stem.porter import PorterStemmer` would be clearer,
# but confirm first that no other cell relies on names pulled in here.
from nltk.stem.porter import *

# Single stemmer instance shared by the cells that map raw tokens to stems.
stemmer = PorterStemmer()
# Characters stripped by clean_review; stored as a set for fast membership tests.
punctuation = set(string.punctuation)

def clean_review(sentence):
    """Return `sentence` lower-cased, with every character in `punctuation` removed."""
    lowered = sentence.lower()
    return ''.join(ch for ch in lowered if ch not in punctuation)

In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
cachedStopWords = stopwords.words("english")
# Set copy used for the membership test in the loop below: a set lookup is
# O(1), whereas scanning the list costs one pass per token.
# `cachedStopWords` itself is left as a list for any other cell that uses it.
stop_word_set = set(cachedStopWords)
# NOTE(review): clean_review strips apostrophes, so a stop word that contains
# one (e.g. "don't") can never equal its cleaned form ("dont") and such tokens
# are counted as regular words -- confirm whether that is intended.

# words[token] = number of occurrences of the non-stop-word token across all
# training reviews.
words = defaultdict(int)

for line in tqdm.tqdm(train_lines):
    # Element [1] of each review_sentences entry is the text that gets cleaned
    # (presumably the sentence string -- verify against the dataset schema).
    for sentence in line['review_sentences']:
        # An empty cleaned sentence splits into no tokens, so no explicit
        # emptiness check is needed.
        for word in clean_review(sentence[1]).split():
            if word not in stop_word_set:
                words[word] += 1

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shaor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(IntProgress(value=0, max=1092427), HTML(value='')))




Let's stem the words.

In [6]:
# Collapse the raw vocabulary onto stems.
#   stem_words[stem]      -> summed count of every raw token sharing that stem
#   map_stem_words[token] -> stem of the raw token
stem_words = defaultdict(int)
map_stem_words = {}
for word, word_count in tqdm.tqdm(list(words.items())):
    stem_word = map_stem_words[word] = stemmer.stem(word)
    stem_words[stem_word] += word_count

HBox(children=(IntProgress(value=0, max=765953), HTML(value='')))




Get the 500 most frequent stemmed words.

In [20]:
# Rank (count, stem) pairs from most to least frequent; equal counts fall back
# to descending stem order because the tuples are compared element-wise.
counts = sorted(((stem_words[w], w) for w in stem_words), reverse=True)
# Keep the 500 highest-ranked pairs and expose just their stems as a set.
word_bags = counts[:500]
popular_words = {stem for _, stem in word_bags}

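Compute the split reviews: for each book, the cleaned text of every review with stop words removed and the remaining words stemmed.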

In [21]:
# reviews[book_id] = set of (cleaned review text, has_spoiler) pairs taken from
# the training split. Being a set, identical pairs for one book are stored once.
reviews = defaultdict(set)
for line in tqdm.tqdm(train_lines):
    # Every cleaned sentence is prefixed with one space, so the paragraph
    # starts with a space and sentences are space-separated.
    paragraph = ''.join(
        " " + clean_review(sentence[1]) for sentence in line['review_sentences']
    )
    reviews[line['book_id']].add((paragraph, line['has_spoiler']))

HBox(children=(IntProgress(value=0, max=1092427), HTML(value='')))




In [22]:
# splitted_review[book_id] = set of (stemmed, stop-word-free review text, label)
# pairs, derived from `reviews`.
splitted_review = defaultdict(set)
# Set copy of the stop words: the membership test below runs once per token,
# and a set lookup is O(1) where scanning the list costs one pass per token.
stop_word_lookup = set(cachedStopWords)
for book_id in tqdm.tqdm(reviews):
    for paragraph, label in reviews[book_id]:
        # Drop stop words and replace every remaining token by its stem.
        new_paragraph = [
            map_stem_words[word]
            for word in paragraph.split()
            if word not in stop_word_lookup
        ]
        splitted_review[book_id].add((" ".join(new_paragraph), label))

HBox(children=(IntProgress(value=0, max=25475), HTML(value='')))




# 0 Extract Word Count

Let's compute the word counts.

#  1 Extract DF-IIF

Let's compute DF: for each (word, book) pair, the fraction of the book's reviews that contain the word.

In [23]:
# Pass 1: count_word_in_book["<word>-<book_id>"] = number of the book's reviews
# that contain the popular word at least once (each review contributes its
# distinct words only).
count_word_in_book = defaultdict(int)
for book_id in tqdm.tqdm(splitted_review):
    for paragraph, _ in splitted_review[book_id]:
        distinct_words = set(paragraph.split())
        for word in distinct_words:
            if word in popular_words:
                count_word_in_book[word + "-" + book_id] += 1

# Pass 2: divide each count by the number of stored reviews of the book.
# Only pairs that actually occurred receive an entry.
DF = defaultdict(float)
for book_id in tqdm.tqdm(splitted_review):
    d_i = len(splitted_review[book_id])
    for word in popular_words:
        key = word + "-" + book_id
        if key in count_word_in_book:
            DF[key] = count_word_in_book[key] / d_i

HBox(children=(IntProgress(value=0, max=25475), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25475), HTML(value='')))




Let's compute IIF: for each word, the negative log of the fraction of books whose reviews contain the word.

In [24]:
import numpy as np
# IF[word] = number of books with at least one review containing the popular word.
IF = defaultdict(float)
epsilon = 1e-5
for book_id in tqdm.tqdm(splitted_review):
    # Union of the words of all reviews of this book.
    all_words = set()
    for paragraph, _ in splitted_review[book_id]:
        all_words.update(paragraph.split())
    for word in all_words:
        if word in popular_words:
            IF[word] += 1

# IIF[word] = -log((IF[word] + epsilon) / (number of books + epsilon)).
n_books = len(splitted_review)
IIF = defaultdict(float)
for word, book_count in IF.items():
    IIF[word] = - np.log((book_count + epsilon) / (n_books + epsilon))

HBox(children=(IntProgress(value=0, max=25475), HTML(value='')))




Let's compute DF-IIF: the product of DF and IIF for each (word, book) pair.

In [25]:
# DF_IIF shares DF's "<word>-<book_id>" keys; each value is the pair's DF
# multiplied by the IIF of the word part of the key.
DF_IIF = defaultdict(float)
for word_book_id, df_value in tqdm.tqdm(DF.items()):
    word, book_id = word_book_id.split("-")
    DF_IIF[word_book_id] = df_value * IIF[word]

HBox(children=(IntProgress(value=0, max=8511794), HTML(value='')))




In [None]:
# Freeze the vocabulary into a sorted list: the iteration order of a set of
# strings can differ between kernel sessions, so sorting keeps the feature
# columns in the same order on every run.
popular_words = sorted(popular_words)
# stem -> feature column, replacing a linear `in` / `.index` scan per token.
popular_word_index = {word: idx for idx, word in enumerate(popular_words)}

def feat_df_iif(words, book_id):
    """Build the DF-IIF feature vector of one review.

    Parameters
    ----------
    words : iterable of str
        Cleaned (unstemmed) tokens of the review.
    book_id : str
        Id of the reviewed book; combined with each stem into the
        "<stem>-<book_id>" key used by `DF_IIF`.

    Returns
    -------
    list
        One entry per popular word, holding the DF-IIF score of that word for
        `book_id` summed over its occurrences in `words`, followed by a
        constant 1. Tokens with no entry in `map_stem_words`, or whose stem is
        not a popular word, are ignored.
    """
    feat = [0] * len(popular_word_index)
    for word in words:
        stem = map_stem_words.get(word)
        if stem is None:
            continue
        word_id = popular_word_index.get(stem)
        if word_id is not None:
            # `.get` rather than indexing: DF_IIF is a defaultdict, and
            # indexing would insert a 0.0 entry for every unseen
            # (word, book) pair, growing the table that is later saved.
            feat[word_id] += DF_IIF.get(stem + '-' + book_id, 0.0)
    feat = feat + [1]
    return feat

def get_data_and_label(target_lines, feat):
    """Turn review records into parallel feature and label lists.

    For every record in `target_lines`, the cleaned tokens of all its
    sentences are passed to `feat(tokens, book_id)`; the record's
    `has_spoiler` value is used as the label.

    Returns a `(features, labels)` pair of lists in the order of `target_lines`.
    """
    target_data, target_label = [], []
    for line in tqdm.tqdm(target_lines):
        book_id = line['book_id']
        label = line['has_spoiler']
        tokens = []
        for sentence in line['review_sentences']:
            tokens.extend(clean_review(sentence[1]).split())
        target_data.append(feat(tokens, book_id))
        target_label.append(label)
    return target_data, target_label

# Build DF-IIF feature vectors and has_spoiler labels for each of the three splits.
train_data, train_label = get_data_and_label(train_lines, feat_df_iif)
valid_data, valid_label = get_data_and_label(valid_lines, feat_df_iif)
test_data, test_label = get_data_and_label(test_lines, feat_df_iif)

HBox(children=(IntProgress(value=0, max=1092427), HTML(value='')))

In [None]:
import pickle
# Write the three lookup tables to a single JSON file, one top-level key each.
df_iif_tables = {'DF': DF, 'IIF': IIF, 'DF-IIF': DF_IIF}
with open('../data/DF-IIF.json', 'w+') as f:
    json.dump(df_iif_tables, f)