In [1]:
import numpy as np
import os

In [2]:
train_file_path = '../Dataset/IMDB/train'
test_file_path = '../Dataset/IMDB/test'
val_folder = ['pos','neg']

In [3]:
def get_data(path):
    text_list,rating = [],[]
    for folder in os.listdir(path):
        if folder in val_folder:
            file_path = os.path.join(path,folder)
            
            for file in os.listdir(file_path):
                with open(os.path.join(file_path,file),encoding='utf-8') as f:
                    txt = f.read()
                
                rat = int((file.split('.')[0]).split('_')[-1])    
                text_list.append(txt)
                rating.append(rat)
                    
    return text_list,rating

In [4]:
train,rat_train = get_data(train_file_path)
test,rat_test = get_data(test_file_path)

print(len(train),len(rat_train))
print(len(test),len(rat_test))

25000 25000
25000 25000


### Process Data

- `Tokenize with Regular Expression`

In [5]:
from nltk.tokenize import RegexpTokenizer

In [6]:
tokenizer = RegexpTokenizer("[a-zA-Z]+",)

def get_tokenize_data(data):
    for i,text in enumerate(data):
        tokens = ' '.join(i for i in tokenizer.tokenize(text))
        data[i] = tokens
        
    return data

In [7]:
train_process_data = get_tokenize_data(train)
test_process_data = get_tokenize_data(test)

- `Remove Stopwords from Tokenized Data`

I am omitting Stopwords removal process by NLTK library. Instead, I am using NLPPREPROCESS because of its flexibility and better stopwords removal process.

You can refer to this blog post: https://towardsdatascience.com/why-you-should-avoid-removing-stopwords-aa7a353d2a52

In [8]:
from nlppreprocess import NLP

In [9]:
nlp = NLP()

def remove_stopword(data):
    for i,text in enumerate(data):
        text = text.lower()
        text = (nlp.process(text)).split()
        data[i] = text
        
    return data

In [10]:
train_process_data = remove_stopword(train_process_data)
test_process_data = remove_stopword(test_process_data)

In [13]:
print(test_process_data[0])

['once', 'again', 'mr', 'costner', 'dragged', 'out', 'movie', 'far', 'longer', 'than', 'necessary', 'aside', 'from', 'terrific', 'sea', 'rescue', 'sequences', 'which', 'there', 'are', 'very', 'few', 'just', 'not', 'care', 'about', 'characters', 'most', 'us', 'ghosts', 'in', 'closet', 'and', 'costner', 's', 'character', 'are', 'realized', 'early', 'and', 'then', 'forgotten', 'much', 'later', 'by', 'which', 'time', 'not', 'care', 'character', 'we', 'should', 'really', 'care', 'about', 'very', 'cocky', 'overconfident', 'ashton', 'kutcher', 'problem', 'comes', 'off', 'kid', 'thinks', 's', 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'cluttered', 'closet', 'his', 'only', 'obstacle', 'appears', 'winning', 'over', 'costner', 'finally', 'when', 'we', 'are', 'well', 'past', 'half', 'way', 'point', 'stinker', 'costner', 'tells', 'us', 'about', 'kutcher', 's', 'ghosts', 'we', 'are', 'told', 'kutcher', 'driven', 'best', 'with', 'no', 'prior', 'inkling', 'fore