In [1]:
import numpy as np
import os

In [2]:
# Locations of the IMDB train / test splits, relative to this notebook.
train_file_path, test_file_path = '../Dataset/IMDB/train', '../Dataset/IMDB/test'

# Names of the sub-folders that get_data reads inside each split.
val_folder = ['pos', 'neg']

In [3]:
def get_data(path, folders=None):
    """Load the review texts and their ratings from one dataset split.

    Every entry found in the selected sub-folders of ``path`` is read as
    UTF-8 text. The rating is parsed from the file name: it is the integer
    after the last underscore of the part before the first dot
    (e.g. ``'12_7.txt'`` -> ``7``).

    Directory listings are sorted, so the result has the same order on every
    run and platform (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        path: Directory that contains the class sub-folders.
        folders: Names of the sub-folders to read. Defaults to the
            module-level ``val_folder`` when omitted.

    Returns:
        A ``(text_list, rating)`` tuple of two parallel lists: the file
        contents and the integer ratings parsed from the file names.

    Raises:
        ValueError: If a file name does not carry an integer after its last
            underscore (before the first dot).
    """
    if folders is None:
        folders = val_folder

    text_list, rating = [], []
    for folder in sorted(os.listdir(path)):
        if folder not in folders:
            continue
        folder_path = os.path.join(path, folder)

        for file_name in sorted(os.listdir(folder_path)):
            # 'a_b.ext' -> 'a_b' -> 'b' -> int
            rat = int(file_name.split('.')[0].split('_')[-1])
            with open(os.path.join(folder_path, file_name), encoding='utf-8') as f:
                txt = f.read()

            text_list.append(txt)
            rating.append(rat)

    return text_list, rating

In [4]:
# Load both splits: review texts plus the rating parsed from each file name.
train, rat_train = get_data(train_file_path)
test, rat_test = get_data(test_file_path)

# Sanity check: each split should have exactly one rating per review.
print(f"{len(train)} {len(rat_train)}")
print(f"{len(test)} {len(rat_test)}")

25000 25000
25000 25000


In [5]:
# Display the first raw training review as loaded from disk.
train[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

### Data Preprocessing

- `Tokenize with Regular Expression`

In [6]:
from nltk.tokenize import RegexpTokenizer

In [7]:
# Tokens are runs of ASCII letters; digits, punctuation and whitespace act as separators.
tokenizer = RegexpTokenizer("[a-zA-Z]+")

def get_tokenize_data(data):
    """Tokenize each text and re-join its tokens with single spaces.

    Every element of ``data`` is passed to the module-level ``tokenizer``;
    the tokens it returns are glued back together into one string.

    Returns:
        A list with one space-separated token string per input text, in the
        same order as ``data``.
    """
    return [' '.join(tokenizer.tokenize(raw_text)) for raw_text in data]

In [8]:
# Regex-tokenize both splits (train first, then test).
train_tokenize, test_tokenize = map(get_tokenize_data, (train, test))

- `Remove Stopwords from Tokenized Data`

I am skipping NLTK's stopword removal here. Instead, I use the `nlppreprocess` package, because it is more flexible and handles stopword removal better.

You can refer to this blog post: https://towardsdatascience.com/why-you-should-avoid-removing-stopwords-aa7a353d2a52

In [9]:
from nlppreprocess import NLP

In [10]:
# Preprocessor from nlppreprocess, constructed with no arguments (library defaults).
nlp = NLP()

def remove_stopword(data):
    """Lower-case each text, run it through ``nlp.process`` and split it.

    Args:
        data: Iterable of strings (here: the space-joined tokenized reviews).

    Returns:
        A list with one list of whitespace-separated tokens per input text,
        in the same order as ``data``.
    """
    return [nlp.process(document.lower()).split() for document in data]

In [11]:
# Apply lower-casing + nlppreprocess cleaning to both tokenized splits.
train_stopword, test_stopword = (
    remove_stopword(split) for split in (train_tokenize, test_tokenize)
)

In [12]:
# Inspect the cleaned token list of the first training review.
print(train_stopword[0])

['story', 'man', 'unnatural', 'feelings', 'pig', 'starts', 'out', 'with', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'into', 'insane', 'violent', 'mob', 'by', 'crazy', 'chantings', 's', 'singers', 'unfortunately', 'stays', 'absurd', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'making', 'just', 'too', 'off', 'putting', 'even', 'those', 'from', 'era', 'should', 'turned', 'off', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 's', 'better', 'than', 'you', 'might', 'think', 'with', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'seen', 'briefly']


- `Stemming or Lemmatization`

In [17]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [19]:
# Shared instances used by stemming() and lemmatization() below.
sb = SnowballStemmer('english')  # Snowball stemmer configured for English
lm = WordNetLemmatizer()

def stemming(data, stemmer=None):
    """Stem every token of every document.

    Args:
        data: Iterable of documents, each an iterable of string tokens.
        stemmer: Object exposing a ``stem(word)`` method. Defaults to the
            module-level Snowball stemmer ``sb`` when omitted.

    Returns:
        A list with one list of stemmed tokens per input document, in the
        same order as ``data``.
    """
    if stemmer is None:
        stemmer = sb

    stemming_data = []
    for text in data:
        stemming_data.append([stemmer.stem(w) for w in text])

    return stemming_data

def lemmatization(data):
    """Lemmatize every token of every document with the shared ``lm`` instance.

    ``lm.lemmatize`` is called with the word only; no part-of-speech tag is
    passed.

    Args:
        data: Iterable of documents, each an iterable of string tokens.

    Returns:
        A list with one list of lemmatized tokens per input document, in the
        same order as ``data``.
    """
    return [[lm.lemmatize(token) for token in tokens] for tokens in data]

In [15]:
# Stem both cleaned splits (train first, then test).
train_stemming, test_stemming = map(stemming, (train_stopword, test_stopword))

In [20]:
# Lemmatize both cleaned splits (train first, then test).
train_lemmatize, test_lemmatize = (
    lemmatization(split) for split in (train_stopword, test_stopword)
)

In [16]:
# Inspect the stemmed tokens of the first training review.
print(train_stemming[0])

['stori', 'man', 'unnatur', 'feel', 'pig', 'start', 'out', 'with', 'open', 'scene', 'terrif', 'exampl', 'absurd', 'comedi', 'formal', 'orchestra', 'audienc', 'turn', 'into', 'insan', 'violent', 'mob', 'by', 'crazi', 'chant', 's', 'singer', 'unfortun', 'stay', 'absurd', 'whole', 'time', 'with', 'no', 'general', 'narrat', 'eventu', 'make', 'just', 'too', 'off', 'put', 'even', 'those', 'from', 'era', 'should', 'turn', 'off', 'cryptic', 'dialogu', 'would', 'make', 'shakespear', 'seem', 'easi', 'third', 'grader', 'technic', 'level', 's', 'better', 'than', 'you', 'might', 'think', 'with', 'good', 'cinematographi', 'by', 'futur', 'great', 'vilmo', 'zsigmond', 'futur', 'star', 'salli', 'kirkland', 'and', 'freder', 'forrest', 'can', 'seen', 'briefli']


In [22]:
# Inspect the lemmatized tokens of the first training review.
print(train_lemmatize[0])

['story', 'man', 'unnatural', 'feeling', 'pig', 'start', 'out', 'with', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'into', 'insane', 'violent', 'mob', 'by', 'crazy', 'chanting', 's', 'singer', 'unfortunately', 'stay', 'absurd', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'making', 'just', 'too', 'off', 'putting', 'even', 'those', 'from', 'era', 'should', 'turned', 'off', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 's', 'better', 'than', 'you', 'might', 'think', 'with', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'star', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'seen', 'briefly']


- `CountVectorizer`

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
# NOTE(review): the preprocessing above yields a list of tokens per document, while
# CountVectorizer's default analyzer expects raw strings — confirm how the data is fed in.
cv = CountVectorizer()