In [47]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy
from sklearn.metrics import accuracy_score

In [48]:
# Load the processed tweet data set, reading only the two columns used below.
df = pd.read_csv('processed-data.csv', usecols=['Sentiment', 'OriginalTweets'])
# Series holding the sentiment label of every row (the classification target).
df_Sentiment = df['Sentiment']
# Series holding the tweet text of every row (the model input).
df_Reviews = df['OriginalTweets']

In [49]:
# Splitting train and test data

# Draw the 80% training rows ONCE (fixed seed for reproducibility) and derive
# every other subset from that single index. Previously the labels were sampled
# by a second, independent .sample() call and only lined up with the text
# because both calls happened to share the same seed and series length.
Xtrain_set = df_Reviews.sample(frac=0.80, random_state=0)
Xtest_set = df_Reviews.drop(Xtrain_set.index)

# Labels for exactly the rows selected above, in the same order.
Ytrain_set = df_Sentiment.loc[Xtrain_set.index]
Ytest_set = df_Sentiment.drop(Xtrain_set.index)

# Fail fast if text and labels ever get out of step (e.g. after an edit that
# filters only one of the two series).
assert Xtrain_set.index.equals(Ytrain_set.index), "train text/labels misaligned"
assert Xtest_set.index.equals(Ytest_set.index), "test text/labels misaligned"

# Converting test and train set into plain lists of strings.
# str() also turns missing values (NaN) into the literal text 'nan'.
lst_Xtrain_set = [str(x) for x in Xtrain_set.values]
lst_Xtest_set = [str(x) for x in Xtest_set.values]

lst_Ytrain_set = [str(x) for x in Ytrain_set.values]
lst_Ytest_set = [str(x) for x in Ytest_set.values]

In [50]:
# Feature extraction (bag of words): turn the training texts into a sparse
# document-term matrix of raw token counts.
vectorizer = CountVectorizer()
# Learn the vocabulary from the TRAINING texts only and build their count matrix.
# The same fitted `vectorizer` is reused later for the test set and for new input.
counts_train_vec = vectorizer.fit_transform(lst_Xtrain_set)
# Display (number of training documents, number of vocabulary features).
counts_train_vec.shape

(19204, 32638)

In [51]:
# Re-weight the raw counts as tf-idf. The transformer is fitted on the training
# counts only; fit() returns the transformer itself, so it can be chained.
tfidf_transformer = TfidfTransformer().fit(counts_train_vec)
# Apply the fitted weighting to the same training count matrix.
X_train_tfidf = tfidf_transformer.transform(counts_train_vec)
# Display (number of training documents, number of features).
X_train_tfidf.shape

(19204, 32638)

In [52]:
# Map the sentiment names to integer codes in the range 0 .. n_classes-1.
# fit() hands back the encoder itself, so construction and fitting are chained.
le = LabelEncoder().fit(Ytrain_set)

# Show the classes that were learned; a label's position in this array is its code.
print(le.classes_)

['Negative' 'Neutral' 'Positive']


In [53]:
# Encode the train and test labels with the already-fitted encoder ...
classes_in_int_train = le.transform(Ytrain_set)
classes_in_int_test = le.transform(Ytest_set)

# ... then show both encoded arrays (train first, test second).
print(classes_in_int_train)
print(classes_in_int_test)

[1 0 0 ... 0 1 2]
[0 0 0 ... 2 2 2]


In [54]:
# Multinomial Naive Bayes classifier, trained on the tf-idf features (X) and the
# integer-encoded sentiment labels (y). fit() returns the estimator itself.
classifier = MultinomialNB().fit(X_train_tfidf, classes_in_int_train)
# Leave the fitted estimator as the cell's last expression so its repr is displayed.
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
# Count tokens in the test texts using the vocabulary learned from the training
# set: transform() only, no re-fitting, so the feature columns match the model's.
c_test_vec = vectorizer.transform(lst_Xtest_set)
# Display (number of test documents, number of features).
c_test_vec.shape

(4801, 32638)

In [56]:
# Apply the tf-idf weighting fitted on the training data to the test counts
# (transform() only, so nothing is learned from the test set).
X_test_tfidf = tfidf_transformer.transform(c_test_vec)
# Display (number of test documents, number of features).
X_test_tfidf.shape

(4801, 32638)

In [57]:
# Predict an encoded sentiment class for every tf-idf row of the test set.
predict = classifier.predict(X_test_tfidf)
print(predict)
# Share of test rows whose prediction equals the true encoded label, expressed
# as a percentage rounded to two decimals and kept as text for the message.
accuracy = str(round(100 * accuracy_score(classes_in_int_test, predict), 2))
print("".join(["The model has ", accuracy, "% accuracy."]))

[0 0 0 ... 2 2 2]
The model has 66.84% accuracy.


In [58]:
# Build a text report showing the main classification metrics
# (precision, recall, f1-score and support per class).
# Arguments: the true encoded labels, then the labels predicted by the model.
# `labels` / `target_names` come from the fitted encoder so every row is shown
# under its sentiment name instead of an opaque integer code.
print (classification_report(
    classes_in_int_test,
    predict,
    labels=list(range(len(le.classes_))),
    target_names=[str(name) for name in le.classes_],
))

              precision    recall  f1-score   support

           0       0.64      0.77      0.70      1633
           1       0.76      0.52      0.61      1538
           2       0.65      0.71      0.68      1630

   micro avg       0.67      0.67      0.67      4801
   macro avg       0.68      0.67      0.66      4801
weighted avg       0.68      0.67      0.66      4801



In [1]:
# Get prediction for product reviews.
def Get_reviewPrediction (input_str_list):
    """Classify each text in ``input_str_list`` and print the predicted sentiment.

    Uses the notebook-level ``vectorizer``, ``tfidf_transformer`` and
    ``classifier`` objects, which must already be fitted.

    Parameters
    ----------
    input_str_list : sequence of str
        The texts to classify. An empty sequence prints nothing.

    Returns
    -------
    None
        Results are printed: first the raw array of predicted class codes, then
        one ``"<text> -- <LABEL>"`` line per input, each followed by a blank line.
    """
    # Nothing to classify: return before handing an empty batch to the model.
    if len(input_str_list) == 0:
        return

    # Same feature pipeline as used for training: token counts -> tf-idf.
    count_vector = vectorizer.transform(input_str_list)
    vector_tfidf = tfidf_transformer.transform(count_vector)
    prediction = classifier.predict(vector_tfidf)
    print (prediction)

    # Encoded class -> display label (order matches the encoder's printed
    # classes: Negative, Neutral, Positive).
    label_by_code = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}

    for text, code in zip(input_str_list, prediction):
        label = label_by_code.get(code)
        if label is None:
            # Unknown class code: print nothing for this text.
            continue
        print (text + " -- " + label)
        # Blank separator line. A bare `print` only emits a newline on Python 2
        # and is a silent no-op on Python 3; print ("") works on both.
        print ("")

In [None]:
# Read one or more sentences from the user and print a sentiment for each.

# `raw_input` exists only on Python 2; on Python 3 the same function is `input`.
try:
    read_line = raw_input
except NameError:
    read_line = input

raw_text = read_line("Enter sentence or list of sentences separated by commas: ")

# Split on commas, as the prompt promises (the input used to be split on '.',
# so comma-separated sentences were classified as one text). Surrounding
# whitespace is trimmed and blank fragments are dropped so they are not
# sent to the classifier.
sentence = [part.strip() for part in raw_text.split(',') if part.strip()]

if sentence:
    Get_reviewPrediction (sentence)
else:
    print ("No sentences entered.")