### TEXT CLASSIFICATION MODEL

Text classification is the process of classifying text strings or documents into different categories, depending upon the contents of the strings.

### Importing the libraries

In [44]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shashwatiswain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Reading the Data

Dataset Used: Cornell sentiment analysis dataset(polarity_dataset_v2.0) 
https://www.cs.cornell.edu/people/pabo/movie-review-data/

This dataset contains 1000 positive and 1000 negative reviews.



In [75]:
reviews = load_files('review_polarity/txt_sentoken/')

In [76]:
# Deriving the data and the positive or negative label
X, y = reviews.data, reviews.target

### Data Preprocessing

In [77]:
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

In [82]:
corpus = []
for i in range(0, len(X)):
    #Removing non-word characters
    review = re.sub(r'\W',' ',str(X[i]))
    #Converting all characters to lower case
    review = review.lower()
    #Removing single characters
    review = re.sub(r'\s+[a-z]\s+',' ', review)
    #Removing single characters at the starting of the sentence
    review = re.sub(r'^[a-z]\s+', ' ', review)
    #Removing multiple spaces
    review = re.sub(r'\s+', ' ', review)
    # Removing prefixed 'b'
    review = re.sub(r'^b\s+', '', review)
    # Lemmatization
    review = review.split()
    review = [stemmer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

### Extracting Features using CountVectorizer (Bag of Words Model)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
vectorizer = CountVectorizer(max_features = 3000,
                             min_df = 5,   # minimum number of documents that should contain this feature
                             max_df = 0.7, # words that occur in a maximum of 70% of all the documents
                             stop_words = stopwords.words('english') # to remove the stop words
                            )


In [117]:
X = vectorizer.fit_transform(corpus).toarray()

In [118]:
X.shape # 2000 documents and 4000 features

(2000, 3000)

### TF-IDF Transformation of the features

In [119]:
from sklearn.feature_extraction.text import TfidfTransformer

In [120]:
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()
X.shape

(2000, 3000)

### Splitting the dataset into Training and Testing Sets 

In [121]:
from sklearn.model_selection import train_test_split

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


### Training Text Classification Model and Predicting Sentiment

In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [124]:
classifier1 = LogisticRegression()
classifier2 = RandomForestClassifier(n_estimators=1000, random_state=0)

In [125]:
classifier1.fit(X_train, y_train)
classifier2.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [126]:
y_pred1 = classifier1.predict(X_test)
y_pred2 = classifier2.predict(X_test)

### Evaluating the Model on the Test Data

In [127]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [128]:
print("Accuracy of LR Model", accuracy_score(y_test, y_pred1))
confusion_matrix(y_test, y_pred1)

Accuracy of LR Model 0.82


array([[158,  50],
       [ 22, 170]])

In [129]:
print("Accuracy of RF Model", accuracy_score(y_test, y_pred2))

confusion_matrix(y_test, y_pred2)

Accuracy of RF Model 0.8325


array([[176,  32],
       [ 35, 157]])