<a href="https://colab.research.google.com/github/shvm2012/Natural-Language-Processing-essentials/blob/master/Sentiment_classification_TFIDF_vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Useful imports

In [0]:
# Import 'os' for preliminary tasks like directory listing etc.
import os

# Import re for regex string matching
import re

# Import nltk for nlp
import nltk

# Import library providing high-performance, easy-to-use data structures and data analysis tools
import pandas as pd

# Import Python's native data structures Counter and defaultdict
# Counter - maintains count of element
# defaultdict - dictionary data structure with exception handling for missing keys
from collections import Counter, defaultdict

# Import tqdm for fancy progressbars!
from tqdm import tqdm_notebook

# Import numpy for different mathematical operations on arrays / matrices
import numpy as np

from nltk.tokenize import word_tokenize # import tokenizer
from nltk.corpus import stopwords # import stopwords
from nltk.stem.porter import PorterStemmer #import stemmer
from nltk.stem import WordNetLemmatizer #lemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer # for text to vector
from sklearn.naive_bayes import MultinomialNB #import naive bayes classifier
from sklearn import svm #import SVM classifier
from  sklearn.metrics  import accuracy_score # accuracy measure
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Install the nltk component for several tasks
nltk.download('punkt')     
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NLP (nltk) basics:

In [4]:
#sentence for testing
sentence= "The quick brown fox jumps over the lazy dog"

#function to split text into word
tokens = word_tokenize(sentence)
print (tokens)

#POS_Tagging
nltk.pos_tag(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [5]:
#stop words removal
stop_words = set(stopwords.words('english'))

print(list(stop_words)[:10]) #some common stopwords
tokens = [w for w in tokens if not w in stop_words]
print(tokens)

['both', 'yourselves', 'won', "you've", 'm', 'most', 'are', "hasn't", 'myself', 'so']
['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


In [6]:
# stemming
porter = PorterStemmer()
stems = []
for t in tokens:    
    stems.append(porter.stem(t))
print(stems)

['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


In [7]:
#lemmatizing
lemmatizer = WordNetLemmatizer()
lemmas=[]
for t in tokens:
  lemmas.append(lemmatizer.lemmatize(t))
print (lemmas)

print("better:", lemmatizer.lemmatize("better",pos ="a" ))

['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog']
better: good


## Downloading the data

In [8]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/

--2019-09-19 17:11:35--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data/aclImdb_v1.tar.gz.1’


2019-09-19 17:11:40 (18.6 MB/s) - ‘data/aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [9]:
## Data extraction
%%time
!tar -xzf data/aclImdb_v1.tar.gz -C data/

CPU times: user 83.7 ms, sys: 11.3 ms, total: 95 ms
Wall time: 10.3 s


### Data Samples
- Dataset is split into two parts for training and testing
- Positive and negative samples are organized in individual folders 
- Each sample document is stored in a .txt file

In [0]:
#convert the dataset from files to a python DataFrame
folder = 'data/aclImdb/'
labels = {'pos': 1, 'neg': 0} 
df = pd.DataFrame()
revList = list()
for f in ('test', 'train'):    
    for l in ('pos', 'neg'):
        path = os.path.join(folder, f, l)
        for file in os.listdir (path) :
            with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
                txt = infile.read()
                revList.append((txt,labels[l]))
            #df = df.append([[txt, labels[l]]],ignore_index=True)
df = pd.DataFrame.from_records(revList)
df.columns = ['review', 'sentiment']

In [11]:
## peeking at the data
df.head()

Unnamed: 0,review,sentiment
0,Each frame in the movie is a lesson to new dir...,1
1,Ken Loach showed the world the down-and-out fl...,1
2,"It may be hard to explain how, but this film i...",1
3,Had it with the one who raised you since when ...,1
4,A photographer in the small city of Gunsan in ...,1


In [12]:
df.shape ## total 50k reviews

(50000, 2)

In [13]:
df.sentiment.value_counts() ## 25k positive reviews (label : 1) and 25k negative (label : 0)

1    25000
0    25000
Name: sentiment, dtype: int64

## Build Vocabulary


In [14]:
reviews = df.review.str.cat(sep=' ') ## storing all reviews in a tab seperated string 

65521550

In [16]:
#function to split text into word
tokens = word_tokenize(reviews)
vocabulary = set(tokens)
print(len(vocabulary)) ## number of words in the vocabulary

199783


In [17]:
## removing stopwords from vocabulary
stop_words = set(stopwords.words('english'))
vocabulary = [w for w in vocabulary if not w in stop_words]
print(len(vocabulary))

199634


## Preparing data for modeling (tfidf vectorization)



In [0]:
#building a classifier
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [19]:
sum(y_train==1) #25k training data reviews (12.5k positive & 12.5k negative)
sum(y_test==1) #25k  test data reviews (12.5k positive & 12.5k negative)

12500

In [0]:
vectorizer = TfidfVectorizer() ## for making of document(reviews)-term matrix(words)

In [21]:
train_vectors=vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_vectors.shape, test_vectors.shape)

(25000, 73822) (25000, 73822)


## Building Classifier

#### 1. Fitting Naive Bayes Classifier

In [31]:
clf = MultinomialNB().fit(train_vectors, y_train)
predicted = clf.predict(test_vectors)
print("Naive Bayes Accuracy:", np.round(accuracy_score(y_test,predicted),2))
print("Naive Bayes F1 score:", np.round(f1_score(y_test,predicted),2))

Naive Bayes Accuracy: 0.84
Naive Bayes F1 score: 0.83


#### 2. Fitting Random Forest classifier

In [34]:
clf=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1).fit(train_vectors, y_train)
predicted = clf.predict(test_vectors)
print("Random Forest Accuracy:", np.round(accuracy_score(y_test,predicted),2))
print("Random Forest F1 score:", np.round(f1_score(y_test,predicted),2))

Random Forest Accuracy: 0.53
Random Forest F1 score: 0.66


Similarly we can fit any classifer of our choice on this IMDB dataset