In [1]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import unicodedata
from pandas import DataFrame

In [2]:
def unicodeToAscii(s):
    """Strip diacritical marks from a string.

    The text is decomposed with Unicode NFD normalization and every
    character whose category is "Mn" (nonspacing mark) is dropped, so an
    accented letter collapses to its base letter.  Characters that NFD does
    not split into base + mark are returned unchanged, even when they are
    not ASCII.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining mark left over from the decomposition
        kept.append(ch)
    return ''.join(kept)

In [3]:
def preprocess_reviews(reviews):
    """Lower-case raw reviews, strip punctuation and HTML line breaks, then
    pass the result on to ``preprocess_reviews2``.

    Args:
        reviews: iterable of review strings.

    Returns:
        Whatever ``preprocess_reviews2`` returns for the cleaned strings.
    """
    # Raw strings keep the regex backslashes away from Python's own string
    # escapes; "\s" or "\[" in a plain literal is an invalid escape sequence
    # that newer interpreters warn about.
    REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
    # Any run of <br /> tags becomes one space.  The tag has to be matched as
    # a whole: if only the doubled form were handled, the "/" alternative
    # would eat the slash of a lone <br /> and leave a stray "br" word behind.
    REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/>)+|[-/]")

    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]

    return preprocess_reviews2(reviews)

def preprocess_reviews2(reviews):
    """Reduce reviews to lower-case ASCII letters plus ``.``, ``!`` and ``?``.

    For each review: drop literal ``<br />`` tags, lower-case, strip accents
    with ``unicodeToAscii``, put a space in front of ``.``/``!``/``?`` and
    collapse every other run of non-letter characters into a single space.

    Args:
        reviews: iterable of review strings.

    Returns:
        list: one cleaned string per input review, with leading and trailing
        whitespace removed.
    """
    modified_reviews = []
    for review in reviews:
        s = review.replace('<br />', "")  # remove line breaks

        # Fold accents BEFORE the letter filter below.  Done afterwards, the
        # filter has already turned every accented letter into a space and
        # there is nothing left to fold ("café" would come out as "caf").
        s = unicodeToAscii(s.lower())

        s = re.sub(r"([.!?])", r" \1", s)      # detach sentence punctuation
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # everything else -> one space

        modified_reviews.append(s.strip())
    return modified_reviews

In [4]:
def _read_reviews(directory):
    """Return the text of every regular file in ``directory``.

    Files are visited in sorted file-name order.  Each file contributes one
    string: its lines, individually stripped, concatenated without separator.
    """
    reviews = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the generated IDs) the same from run to run.
    for name in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, name)
        # listdir yields bare names, so the check must use the joined path;
        # testing the bare name would look it up in the working directory.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding="utf8") as f:
            reviews.append(''.join(line.strip() for line in f))
    return reviews


def input_file(train, test, data_dir='data/aclImdb'):
    """Load the review files and return them preprocessed.

    Reads ``<data_dir>/train/neg``, ``<data_dir>/train/pos``,
    ``<data_dir>/test/neg`` and ``<data_dir>/test/pos`` in that order.

    Args:
        train: list that the raw training reviews are appended to (mutated
            in place; negative reviews first, then positive).
        test: list that the raw test reviews are appended to (mutated in
            place; negative reviews first, then positive).
        data_dir: root folder holding the ``train`` and ``test``
            sub-folders.  Defaults to the location used so far.

    Returns:
        ``preprocess_reviews(train) + preprocess_reviews(test)``: the
        preprocessed training reviews followed by the preprocessed test
        reviews.
    """
    # Input training data
    for label in ('neg', 'pos'):
        train.extend(_read_reviews(os.path.join(data_dir, 'train', label)))

    # Input test data
    for label in ('neg', 'pos'):
        test.extend(_read_reviews(os.path.join(data_dir, 'test', label)))

    result = preprocess_reviews(train) + preprocess_reviews(test)
    return result

## Load the data and save it to a single CSV file

In [5]:
# input_file appends the raw reviews to these two lists and returns the
# preprocessed training reviews followed by the preprocessed test reviews.
reviews_train = []
reviews_test = []
result = input_file(reviews_train, reviews_test)

# The labels below are assigned purely by position: within each split the
# negative folder is read before the positive one, and each folder is
# expected to contribute this many reviews.
N_PER_CLASS = 12500

# Fail early with a readable message if the data on disk does not match that
# layout; otherwise rows could be labelled with the wrong sentiment.
if (len(reviews_train) != 2 * N_PER_CLASS
        or len(reviews_test) != 2 * N_PER_CLASS
        or len(result) != 4 * N_PER_CLASS):
    raise ValueError(
        "expected %d train and %d test reviews, got %d train, %d test, %d preprocessed"
        % (2 * N_PER_CLASS, 2 * N_PER_CLASS,
           len(reviews_train), len(reviews_test), len(result))
    )

# train-neg, train-pos, test-neg, test-pos
target = (['Negative'] * N_PER_CLASS + ['Positive'] * N_PER_CLASS) * 2
index = list(range(1, len(result) + 1))  # 1-based row IDs

train = {}
train['ID'] = index
train['review'] = result
train['label'] = target
df = DataFrame(train)
df.to_csv('data/allData.csv', index=False)