In [1]:
#--Text Classification using Amazon review dataset--#

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import pandas

In [2]:
#DATASET PREPARATION
# load the dataset 
data = open('corpus.txt',encoding="utf8").read() 
labels, texts = [], [] 
for i, line in enumerate(data.split("\n")): 
    content = line.split() 
    labels.append(content[0]) 
    texts.append(" ".join(content[1:]))

In [3]:
# create a dataframe using texts and lables 
trainDF = pandas.DataFrame() 
trainDF['text'] = texts 
trainDF['label'] = labels 

In [4]:
# split the dataset into training and validation datasets  
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label']) 

In [22]:
# label encode the target variable  
encoder = preprocessing.LabelEncoder() 
train_y = encoder.fit_transform(train_y) 
valid_y = encoder.fit_transform(valid_y) 

In [6]:
#FEATURE ENGINEERING
#COUNT VECTORS AS FEATURES
# create a count vectorizer object  
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}') 
count_vect.fit(trainDF['text']) 

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [7]:
# transform the training and validation data using count vectorizer object 
xtrain_count =  count_vect.transform(train_x) 
xvalid_count =  count_vect.transform(valid_x)

In [8]:
#TF-IDF VECTORS AS FEATURES
# word level tf-idf 
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000) 
tfidf_vect.fit(trainDF['text']) 
xtrain_tfidf =  tfidf_vect.transform(train_x) 
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [9]:
#MODEL BUILDING
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False): 
    # fit the training dataset on the classifier 
    classifier.fit(feature_vector_train, label) 
    # predict the labels on validation dataset 
    predictions = classifier.predict(feature_vector_valid) 
    if is_neural_net: 
        predictions = predictions.argmax(axis=-1) 
    return metrics.accuracy_score(predictions, valid_y) 

In [10]:
#NAIVE BAYES
# Naive Bayes on Count Vectors 

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count) 
print ("NB, Count Vectors: ", accuracy )

NB, Count Vectors:  0.8328


In [11]:
# Naive Bayes on Word Level TF IDF Vectors 

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf) 
print ("NB, WordLevel TF-IDF: ", accuracy )

NB, WordLevel TF-IDF:  0.838


In [12]:
#LINEAR CLASSIFIER
# Linear Classifier on Count Vectors 

accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count) 
print ("LR, Count Vectors: ", accuracy )



LR, Count Vectors:  0.8532


In [13]:
# Linear Classifier on Word Level TF IDF Vectors 
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf) 
print ("LR, WordLevel TF-IDF: ", accuracy )

LR, WordLevel TF-IDF:  0.87
