In [1]:
# import libraries
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Create a class to make it easier to handle the data 

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.score <= 2:
            return "NEGATIVE"
        elif self.score >= 4:
            return "POSITIVE"

# Load the data

In [3]:
file = "C:\\Users\\sadak\\Desktop\\datasets\\reviews.json"
reviews=[]
with open(file) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))
        
print("Number of reviews:",len(reviews))
print("Sample review text:",reviews[0].text)
print("Sample review score:",reviews[0].score)

Number of reviews: 100000
Sample review text: I ordered this for my wii and it is a great cable. Lets you play your came in HD so the picture is crystal clear. You can really tell the difference in HD and regular wii cord.
Sample review score: 5.0


In [4]:
print("Positive reviews:",len([i for i in reviews if (i.sentiment == "POSITIVE")]))
print("Negative reviews:",len([i for i in reviews if (i.sentiment == "NEGATIVE")]))

# There are a lot more POSITIVE labels than negative ones
# This might hamper with our model since POSITIVE is overly represented
# Let us shrink our positive dataset to atleast close to 13000

review_pos = [i for i in reviews if i.sentiment == "POSITIVE"]
review_neg = [i for i in reviews if i.sentiment == "NEGATIVE"]
review_pos = review_pos[:13000]

reviews = review_neg + review_pos
random.shuffle(reviews)

print()
print("AFTER REDUCING POSITIVES")
print("Positive reviews:",len([i for i in reviews if (i.sentiment == "POSITIVE")]))
print("Negative reviews:",len([i for i in reviews if (i.sentiment == "NEGATIVE")]))

# Much better now

Positive reviews: 75505
Negative reviews: 12346

AFTER REDUCING POSITIVES
Positive reviews: 13000
Negative reviews: 12346


# Data preprocessing

In [5]:
# split dataset
train, test = train_test_split(reviews, test_size=0.2, random_state=42)

In [6]:
X_train = [x.text for x in train]
y_train = [y.sentiment for y in train]

X_test = [x.text for x in test]
y_test = [y.sentiment for y in test]

#### Bag of words vectorization

In [7]:
# ML models work best with numerical data
# We will convert it into a bag of words vector to pass to the model

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

# This will come in handy later when testing
X_test_vectors = vectorizer.transform(X_test)

## Fitting models to our data

In [8]:
# The parameters for the models were chosen after running GridSearchCV

lr = LogisticRegression(C=1, solver='saga')
svc = LinearSVC(C=0.1)
xgb = XGBClassifier()

# Voting classifier

voting_clf = VotingClassifier(estimators=[('lr', lr), ('xgb', xgb), 
                                          ('svc', svc)])

In [9]:
voting_clf.fit(X_train_vectors, y_train)





VotingClassifier(estimators=[('lr', LogisticRegression(C=1, solver='saga')),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            enable_categorical=False,
                                            gamma=None, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                  

In [10]:
y_pred = voting_clf.predict(X_test_vectors)
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred, average=None, labels=["POSITIVE", "NEGATIVE"]))

# Pretty good accuracy!

Accuracy score: 0.878698224852071
F1 score: [0.88157135 0.87568223]


In [11]:
# We can play with the model and see how it classifies our examples

test = ["this is the best product ever", "it did not work as expected, waste of money","low quality product"]
voting_clf.predict(vectorizer.transform(test))

# Not too shabby, seems to work quite well!!

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')