In [1]:
import os
import glob

For most of what we want to do in this walkthrough we’ll only need our reviews to be in a Python list. Make sure to point open to the directory where you put the train and tes .txt files.

In [2]:
reviews_train=[]
for line in open('C:/Users/R&B/Desktop/Projects/NLP/full_train.txt','r',errors='ignore'):
    reviews_train.append(line.strip())
reviews_test = []
for line in open('C:/Users/R&B/Desktop/Projects/NLP/full_test.txt','r',errors='ignore'):
    reviews_test.append(line.strip())

Clean and Preprocess-
The raw text is pretty messy for these reviews so before we can do any analytics we need to clean things up.
Note: Understanding and being able to use regular expressions is a prerequisite for doing any Natural Language Processing task.

In [3]:
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

###### In order for this data to make sense to our machine learning algorithm we’ll need to convert each review to a numeric representation, which we call vectorization.The simplest form of this is to create one very large matrix with one column for every unique word in your corpus (where the corpus is all 50k reviews in our case). Then we transform each review into one row containing 0s and 1s, where 1 means that the word in the corpus corresponding to that column appears in that review. That being said, each row of the matrix will be very sparse (mostly zeros). This process is also known as one hot encoding.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [5]:
X.shape

(25000, 93448)

# Building the Classifier
1)Logistic Regression
Logistic Regression is a good baseline model for us to use for several reasons: 
    (1) They’re easy to interpret, 
    (2) linear models tend to perform well on sparse datasets like this one, and 
    (3) they learn very fast compared to other algorithms.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train,X_val,y_train,y_val = train_test_split(X, target, train_size=0.75)

for c in [0.01,0.05,0.25,0.5,1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train,y_train)
    print("the accuracy for C={} is {}".format(c,accuracy_score(y_val,lr.predict(X_val))))
    



the accuracy for C=0.01 is 0.87232
the accuracy for C=0.05 is 0.8816
the accuracy for C=0.25 is 0.8792
the accuracy for C=0.5 is 0.87696
the accuracy for C=1 is 0.87472


In [7]:
#using c=0.05,lets train on the final model
final_lr = LogisticRegression(C=0.05)
final_lr.fit(X,target)
print("Final accuracy is {}%".format(100*accuracy_score(target,final_lr.predict(X_test))))

Final accuracy is 88.16000000000001%


In [8]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_lr.coef_[0]
    )
}
#top 5 positive words in reviews
for top_5_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (top_5_positive)

('excellent', 0.9290941050052566)
('perfect', 0.79207571371563)
('great', 0.6748664134156535)
('amazing', 0.6129639670184123)
('superb', 0.6035024496434719)


In [9]:
#Top 5 negative words in reviews
for top_5_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (top_5_negative)

('worst', -1.364220770025034)
('waste', -1.1657048340023521)
('awful', -1.0311543744907605)
('poorly', -0.8725824713592671)
('boring', -0.8583319465495052)


In [10]:
#import nltk
#nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
showing info http://nltk.org/nltk_data/


True

Text PreProcessing

In [12]:
import nltk
from nltk.corpus import stopwords
english_stop_words = set(stopwords.words('english'))
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

no_stop_words = remove_stop_words(reviews_train_clean)

In [13]:
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)

In [16]:
def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(reviews_train_clean)

In [18]:
#running the Model with N-grams(thats a sequence of words)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.88512
Accuracy for C=0.05: 0.8928
Accuracy for C=0.25: 0.896
Accuracy for C=0.5: 0.89648
Accuracy for C=1: 0.8968


In [19]:
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_test)))

Final Accuracy: 0.89768


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, 
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.87712
Accuracy for C=0.05: 0.88144
Accuracy for C=0.25: 0.87936
Accuracy for C=0.5: 0.87632
Accuracy for C=1: 0.8736


In [20]:
final_wc = LogisticRegression(C=0.05)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_test)))

Final Accuracy: 0.89712


In [21]:
#Using TF-idf i.e. term frequency -inverse document frequency.Please read the readme file.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.79728
Accuracy for C=0.05: 0.83456
Accuracy for C=0.25: 0.86624
Accuracy for C=0.5: 0.87696
Accuracy for C=1: 0.88544


In [22]:
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_test)))



Final Accuracy: 0.8824


# Building a Support Vector Machine Model

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.01: 0.89168
Accuracy for C=0.05: 0.8896
Accuracy for C=0.25: 0.88992




Accuracy for C=0.5: 0.8904
Accuracy for C=1: 0.89008


In [24]:
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm_ngram.predict(X_test)))

Final Accuracy: 0.89704
