In [1]:
# 1.0 Call library
import os
import sys 
import time

In [2]:
# 1.1 Array and data-manipulation libraries
import numpy as np
import pandas as pd

In [3]:
# 1.2 sklearn modeling libraries
# 1.2.1 For calculating tf-idf values
#       https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
# 1.3 For stemming words
from nltk.stem.porter import PorterStemmer

In [5]:
# 1.4 Text processing
# 1.4.1 Import 're' module for regular expression matching
import re

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# 1.5 Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

In [7]:
path_neg = "/home/ashok/.keras/datasets/trainSentimentAnalysis/neg"
path_pos = "/home/ashok/.keras/datasets/trainSentimentAnalysis/pos"
path_neg,path_pos

('/home/ashok/.keras/datasets/trainSentimentAnalysis/neg',
 '/home/ashok/.keras/datasets/trainSentimentAnalysis/pos')

In [8]:
def readFiles(fileList):
    lines = []
    # 4.1.1 For every file in the directory:
    for i in fileList:
        # 4.1.2 Open it and coalesce into one string
        text_file = open(i, "r",  encoding="utf8")
        tx = text_file.readlines()
        tx = " ".join(tx)
        lines.append(tx)
    return(lines)

In [9]:
# 3.0 Read all files in 'neg' folder
os.chdir(path_neg)
neg_fileList = os.listdir()
neg_files_list = readFiles(neg_fileList)
len(neg_files_list )

1000

In [10]:
len(neg_files_list )

1000

# TODO:
#a1  Clean files
#a2  Stem files
#a3  USe TFIDFVectorizer class of sklearn to create a sparse matrix of 2000 X 10000. That is take 10000 most frequent words
#a4 Apply PCA to reduce number of columns
#a5 USe train_test split to split dataset into X_train, y_train and X_test, y_test
#a6 Perform RandomForest modeling and test accuracy of sentiment


In [11]:
#a1 Clean files
def cleanTxt(listOfStrings):
    lines = []
    for tx in listOfStrings:
        # 4.3.1 Clean each string through a series
        #     of cleaning operations
        clean = re.compile('<.*?>')
        tx = re.sub(clean, '', tx)
        # 4.3.2 Replace bracketed numbers with space
        tx= re.sub(r'[\[0-9\]]',' ', tx)
        tx= re.sub('\n',' ', tx)
        tx= re.sub('\'s',' ', tx)
        tx= re.sub('www', '', tx) 
        tx= re.sub('com', '', tx)
        # 4.3.3 Replace URLs
        tx = re.sub(r'^https?:\/\/.*[\r\n]*', '', tx, flags=re.MULTILINE)
        tx = re.sub('[*|\(\)\{\}]', " ",tx)
        tx = re.sub('[=]*', "",tx)
        # 4.3.4 Replace other tags generally part of a web-file
        clean = re.compile('&lt;')
        tx = re.sub(clean, '', tx)
        clean = re.compile('&gt;')
        tx = re.sub(clean, '', tx)
        clean = re.compile('&quot;')
        tx = re.sub(clean, '', tx)
        lines.append(tx)
    return lines

In [12]:
# 6. Stemming text
#    Instantiate PorterStemmer object
porter_stemmer = PorterStemmer()

In [13]:
# Get all the lines for Negative comments in the IMDB Review
linesNeg= []                              # Start with empty list
for i in neg_fileList:
    text_file = open(i, "r",  encoding="utf8") # Open text file
    tx = text_file.readlines()           # Read all its lines
    tx = " ".join(tx)       
    tx = cleanTxt(tx)
    #tx = stemming_tokenizer(tx)
    linesNeg.append(tx)


In [14]:
len(linesNeg)

1000

In [15]:
# 4.0 Read all files in 'pos' folder
os.chdir(path_pos)
pos_fileList = os.listdir()
pos_fileList[:3]
pos_files_list = readFiles(pos_fileList)
len(pos_files_list )

['cv366_10221.txt', 'cv043_15013.txt', 'cv217_28842.txt']

1000

In [16]:
# Get all the lines for Positve comments in the IMDB Review
linesPos= []                              # Start with empty list
for i in pos_fileList:
    text_file = open(i, "r",  encoding="utf8") # Open text file
    tx = text_file.readlines()           # Read all its lines
    tx = " ".join(tx)       
    tx = cleanTxt(tx)
    linesPos.append(tx)

In [17]:
# 5.0 Merge the two lists
comb_files = neg_files_list + pos_files_list

In [18]:
len(comb_files)

2000

In [19]:
# 6.1 Define a function to use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = str_input.split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [20]:
# 7.0 Let us stem our list of string elements
stem_lines = []
for line in comb_files:
    stemmed_line = stemming_tokenizer(line)
    stemmed_line = " ".join(stemmed_line)    # Join words back into a string
    stem_lines.append(stemmed_line)

In [21]:
## 8.0 Transform text to tf-idf matrix
#  8.1 Instantiate TfidfVectorizer object
#      Instead of taking stemmed text, we will
#      take another approach below:

vec = TfidfVectorizer(use_idf=True,
                      strip_accents = 'unicode', # Remove accents during preprocessing step.
                      lowercase = True,
                      tokenizer=stemming_tokenizer,
                      max_features = 10000,   # Consider only top frequent features
                      stop_words='english'  # Remove stop-words
                      )

In [22]:
# 8.2 Use 'vec' object to transform:
comb_data = vec.fit_transform(stem_lines)
comb_data  

<2000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 463531 stored elements in Compressed Sparse Row format>

In [23]:
# 5.1 Create target labels
target = [0] * 1000 + [1] * 1000
len(target)

2000

In [24]:
dataframe = pd.DataFrame(comb_data.toarray(), columns=vec.get_feature_names())
dataframe.head()

Unnamed: 0,,!,"""",#1,#2,$1,$10,$100,$2,$20,...,zoe,zombi,zone,zoo,zooland,zoom,zorro,zucker,zwick,zwick'
0,0.0,0.018779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.24948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.041391,0.173753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.209112,0.019293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.032417,0.01944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
type(target)

list

In [26]:
dataframe['sentiment'] = target
dataframe.shape     

(2000, 10000)

In [27]:
dataframe.head()

Unnamed: 0,,!,"""",#1,#2,$1,$10,$100,$2,$20,...,zoe,zombi,zone,zoo,zooland,zoom,zorro,zucker,zwick,zwick'
0,0.0,0.018779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.24948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.041391,0.173753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.209112,0.019293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.032417,0.01944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
Y=pd.DataFrame(target)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
                                     dataframe, target,
                                     test_size=0.33,
                                     random_state = 32,
                                     shuffle = True    # Impt as data is in sequence
                                     )

In [30]:
X_train.head()

Unnamed: 0,,!,"""",#1,#2,$1,$10,$100,$2,$20,...,zoe,zombi,zone,zoo,zooland,zoom,zorro,zucker,zwick,zwick'
1638,0.0,0.0,0.105653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1676,0.0,0.0,0.133167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.040079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
RandomCLF = RandomForestClassifier(n_estimators = 100, n_jobs = 2,verbose=1)

In [34]:
%time RandomCLF.fit(X_train,y_train)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s


CPU times: user 3.34 s, sys: 430 ms, total: 3.77 s
Wall time: 2.48 s


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [35]:
y_pred = RandomCLF.predict(X_test)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [37]:
len(y_pred)

660

In [38]:
RF_Accuracy = np.sum(y_pred == y_test)/len(y_test)
RF_Accuracy

0.9666666666666667