In [23]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import random
# Render floats with thousands separators in displayed frames.
pd.options.display.float_format = '{:,}'.format
# Seed both generators: DataFrame.sample() draws from NumPy's global RNG when
# no random_state is passed, so seeding the stdlib `random` module alone does
# not make the sampling cells below repeatable.
random.seed(2021)
np.random.seed(2021)

In [7]:
def load_dataset(file_name, progress_every=1000000):
    """Load sentence-level labels and texts from a gzipped JSON-lines file.

    Every line of the file is parsed as one JSON object. Its
    ``'review_sentences'`` entry is iterated as ``(label, sentence)`` pairs
    and each pair becomes one row of the returned frame.

    Parameters
    ----------
    file_name : str or path-like
        Path of the gzip-compressed file holding one JSON document per line.
    progress_every : int, optional
        Print the running review index every this many reviews. Defaults to
        1,000,000. Pass 0 or None to suppress the per-review counter.

    Returns
    -------
    pandas.DataFrame
        Columns ``'class'`` (the label) and ``'text'`` (the sentence), with
        rows in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or read as gzip.
    ValueError
        If a line is not valid JSON (``json.JSONDecodeError``).
    KeyError
        If a parsed object has no ``'review_sentences'`` key.
    """
    classes = []
    texts = []
    print('current line: ', end='')
    # The file is read in binary mode; json.loads accepts bytes directly.
    with gzip.open(file_name) as fin:
        for n_review, raw_line in enumerate(fin):
            review = json.loads(raw_line)
            if progress_every and n_review % progress_every == 0:
                print(n_review, end=',')
            for label, sentence in review['review_sentences']:
                classes.append(label)
                texts.append(sentence)
    print('complete')
    print('done!')
    return pd.DataFrame({'class': classes, 'text': texts})

In [8]:
# Parse the gzipped review file into one row per (label, sentence) pair.
reviews_path = 'goodreads_reviews_spoiler.json.gz'
df = load_dataset(reviews_path)

current line: 0,1000000,complete
done!


In [9]:
# Show the first five rows (the default for head) as a quick sanity check.
df.head(5)

Unnamed: 0,class,text
0,0,This is a special book.
1,0,"It started slow for about the first third, the..."
2,0,This is what I love about good science fiction...
3,0,"It is a 2015 Hugo winner, and translated from ..."
4,0,For instance the intermixing of Chinese revolu...


In [10]:
# Total number of sentence rows in the loaded frame.
df.shape[0]

17672655

In [11]:
# Persist the full sentence-level frame. Create the target folder first so the
# write does not fail when the directory does not exist yet.
os.makedirs('csv_datasets/goodreads', exist_ok=True)
df.to_csv('csv_datasets/goodreads/goodreads_all.csv', index=False)

# Balancing: downsample the non-spoiler class

In [34]:
# Split by label: class 0 is reported as non-spoiler, class 1 as spoiler.
df_non_all = df[df['class'] == 0]
df_spoiler = df[df['class'] == 1]
print("nonspoiler:", len(df_non_all))
print("spoiler:", len(df_spoiler))
# Keep 5% of the non-spoiler rows and every spoiler row. The fixed
# random_state makes the draw identical each time this cell is run.
df_non = df_non_all.sample(frac=.05, random_state=2021)
print("weight-decreased nonspoiler:", len(df_non))
print("balanced total:", len(df_non)+len(df_spoiler))

nonspoiler: 17102931
spoiler: 569724
weight-decreased nonspoiler: 855147
balanced total: 1424871


# Train / dev / test split

In [29]:
# 70/10/20 train/val/test
def split_train_dev_test(frame, train_frac=.70, dev_frac_of_rest=.333, seed=2021):
    """Split one frame into disjoint train, dev and test frames.

    The input frame is not modified, so calling this repeatedly on the same
    frame returns the same split (the original cell overwrote its inputs and
    shrank them further on every re-run).

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to split. Rows are removed by index label, so labels are
        assumed to be unique.
    train_frac : float
        Fraction of `frame` sampled for training.
    dev_frac_of_rest : float
        Fraction of the rows left after the training draw that is sampled
        for validation (.333 of the remaining 30% is about 10% overall).
    seed : int
        random_state passed to both sampling calls for repeatability.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, dev, test)``; test is everything not drawn for train or dev.
    """
    train = frame.sample(frac=train_frac, random_state=seed)
    rest = frame.drop(train.index)
    dev = rest.sample(frac=dev_frac_of_rest, random_state=seed)
    test = rest.drop(dev.index)
    return train, dev, test


# Split each class separately so every split keeps the same class ratio.
df_train_non, df_dev_non, df_test_non = split_train_dev_test(df_non)
df_train_spoiler, df_dev_spoiler, df_test_spoiler = split_train_dev_test(df_spoiler)

In [35]:
# merge and shuffle
def merge_and_shuffle(parts, seed=2021):
    """Concatenate the given frames and return them in a shuffled row order.

    Parameters
    ----------
    parts : list of pandas.DataFrame
        Frames to stack on top of each other.
    seed : int
        random_state for the shuffle, so the row order is repeatable.

    Returns
    -------
    pandas.DataFrame
        All rows of `parts`, shuffled, with a fresh 0..n-1 index.
    """
    merged = pd.concat(parts)
    return merged.sample(frac=1, random_state=seed).reset_index(drop=True)


df_train = merge_and_shuffle([df_train_non, df_train_spoiler])
df_dev = merge_and_shuffle([df_dev_non, df_dev_spoiler])
df_test = merge_and_shuffle([df_test_non, df_test_spoiler])

print("train:",len(df_train))
print("dev:",len(df_dev))
print("test:",len(df_test))
print("all:",len(df_train)+len(df_dev)+len(df_test))

train: 997410
dev: 142344
test: 285117
all: 1424871


In [36]:
# Write each split to its own CSV file, leaving out the row index.
split_files = {
    'csv_datasets/goodreads/train.csv': df_train,
    'csv_datasets/goodreads/dev.csv': df_dev,
    'csv_datasets/goodreads/test.csv': df_test,
}
for csv_path, split_frame in split_files.items():
    split_frame.to_csv(csv_path, index=False)

# Small sample for quick experiments

In [37]:
# Draw small fixed-size subsets (7000/1000/2000 rows) of each split.
# random_state pins the draw so the sample files are repeatable.
df_train_sample = df_train.sample(7000, random_state=2021)
df_dev_sample = df_dev.sample(1000, random_state=2021)
df_test_sample = df_test.sample(2000, random_state=2021)

In [39]:
# The sample files go into their own subfolder; create it first so the writes
# do not fail when the directory does not exist yet.
os.makedirs('csv_datasets/goodreads/sample', exist_ok=True)
df_train_sample.to_csv('csv_datasets/goodreads/sample/train.csv', index=False)
df_dev_sample.to_csv('csv_datasets/goodreads/sample/dev.csv', index=False)
df_test_sample.to_csv('csv_datasets/goodreads/sample/test.csv', index=False)