<a href="https://colab.research.google.com/github/santoshhulbutti/ML_Concepts_Practice/blob/main/SKLearn_models_learning_session_Model_improvisation_Rev02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random

class Sentiment:
  """String labels for the three sentiment classes assigned to reviews."""
  NEGATIVE = "NEGATIVE"
  NEUTRAL = "NEUTRAL"
  POSITIVE = "POSITIVE"

class Review:
  """A single review: its text, its numeric rating, and the derived sentiment.

  Attributes:
    text: The review body.
    score: The numeric rating the review was given.
    sentiment: A Sentiment label computed from score when the object is built.
  """

  def __init__(self, text, score):
    self.text = text
    self.score = score
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    """Map the rating to a Sentiment label.

    Ratings of 2 or lower are NEGATIVE, ratings of 4 or higher are POSITIVE,
    and everything strictly in between is NEUTRAL. Using ranges instead of an
    exact `== 3` test keeps fractional ratings such as 2.5 from falling through
    to POSITIVE.

    Returns:
      One of Sentiment.NEGATIVE, Sentiment.NEUTRAL, Sentiment.POSITIVE.
    """
    if self.score <= 2:
      return Sentiment.NEGATIVE
    if self.score < 4:
      return Sentiment.NEUTRAL
    # score is 4 or higher
    return Sentiment.POSITIVE
  
class ReviewContainer:
  """Wraps a list of Review objects and exposes them as parallel text/label lists."""

  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    """Return the text of every review, in the container's current order."""
    return [x.text for x in self.reviews]

  def get_sentiment(self):
    """Return the sentiment label of every review, in the container's current order."""
    return [x.sentiment for x in self.reviews]

  def evenly_distribute(self, seed=None):
    """Down-sample so all three sentiment classes have the same number of reviews.

    Each class is cut to the size of the smallest class (keeping the earliest
    reviews of each class), the result is shuffled, and self.reviews is replaced
    by the new list. The list originally passed in is not modified. If any class
    is empty, the container ends up empty.

    Args:
      seed: Optional seed for a private random generator, giving a reproducible
        shuffle. When None (the default) the module-level `random` state is used.
    """
    negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
    positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
    neutral = [x for x in self.reviews if x.sentiment == Sentiment.NEUTRAL]

    # Balance on the smallest class, whichever it is; truncating only to
    # len(negative) leaves the set unbalanced when another class is scarcer.
    per_class = min(len(negative), len(positive), len(neutral))

    balanced = negative[:per_class] + positive[:per_class] + neutral[:per_class]

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(balanced)
    self.reviews = balanced



Inspecting the raw data (first record only)

In [3]:
import json
# Location of the review dump; the file holds one JSON object per line.
file_name = './sample_data/Books_small_10000.json'

# Filled with Review objects by the loading cell.
reviews = []

# Print just the first raw record to see which fields are available.
with open(file_name) as f:
  first_line = next(f, None)
  if first_line is not None:
    print(first_line)

{"reviewerID": "A1F2H80A1ZNN1N", "asin": "B00GDM3NQC", "reviewerName": "Connie Correll", "helpful": [0, 0], "reviewText": "I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.", "overall": 5.0, "summary": "Can't stop reading!", "unixReviewTime": 1390435200, "reviewTime": "01 23, 2014"}



Loading data

In [4]:
# Parse the file (one JSON object per line) into Review objects.
# The list is rebuilt from scratch here, so re-running this cell replaces the
# reviews instead of appending a second copy of every record.
with open(file_name) as f:
  reviews = [
    Review(record['reviewText'], record['overall'])
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    for record in (json.loads(line) for line in f if line.strip())
  ]

Prep Data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
train, test = train_test_split(reviews, test_size=0.25, random_state=27)

# Wrap each split so it can be balanced and unpacked into texts and labels.
train_container, test_container = ReviewContainer(train), ReviewContainer(test)

In [6]:
# evenly_distribute shuffles with the module-level `random` generator; seed it
# so the balanced sets (and therefore train_x[0] / test_x[0] shown below) come
# out in the same order on every Restart & Run All.
random.seed(27)

# Balance both splits so each sentiment class is equally represented.
train_container.evenly_distribute()
test_container.evenly_distribute()

# Parallel lists of review texts (features) and sentiment labels (targets).
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

Bag-of-words vectorization (TF-IDF weighted)

In [7]:
# Vectorize the review texts with TF-IDF weights.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# Fit on the training texts only, then transform them in the same step.
train_x_vectors = vectorizer.fit_transform(train_x)
# The test texts are only transformed with the already-fitted vectorizer.
test_x_vectors = vectorizer.transform(test_x)

In [8]:
# Show the first training review followed by its TF-IDF row, converted to a
# dense array for display.
print(train_x[0])
print(train_x_vectors[0].toarray())
# print(train_x_vectors[0])

Awesome book couldn't put it down. I didn't want it to end a great author who know how to keep the reader involved
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

Linear Support Vector Machines

In [9]:
from sklearn import svm
# Support vector classifier with a linear kernel, fitted on the TF-IDF training vectors.
clf_svm = svm.SVC(kernel ='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [10]:
# First review of the balanced test set; used as the spot-check example for each model.
test_x[0]

"Easy, quick, happy ending book.  Different type of story.  Needed books for a long flight.  I'm usually a picky reader, but this kept my interest.  The reason for 3 stars is because of some of the language and detail to describe sex was unnecessary and a little too much."

In [11]:
# Linear SVM prediction for the first test review.
clf_svm.predict(test_x_vectors[0])

array(['NEUTRAL'], dtype='<U8')

Decision tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the TF-IDF training vectors. random_state is fixed
# because tree construction permutes features randomly at each split; without
# it the fitted tree and its scores change from run to run.
clf_dec = DecisionTreeClassifier(random_state=27)
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [13]:
# Decision tree prediction for the first test review.
clf_dec.predict(test_x_vectors[0])

array(['NEUTRAL'], dtype='<U8')

Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes; the TF-IDF matrix is converted to a dense array before fitting.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

GaussianNB()

In [15]:
# Gaussian naive Bayes prediction for the first test review (dense input, as in fitting).
clf_gnb.predict(test_x_vectors[0].toarray())

array(['NEUTRAL'], dtype='<U8')

Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the 'newton-cg' solver, fitted on the TF-IDF training vectors.
clf_log = LogisticRegression(solver = 'newton-cg')
clf_log.fit(train_x_vectors, train_y)

LogisticRegression(solver='newton-cg')

In [17]:
# Logistic regression prediction for the first test review.
clf_log.predict(test_x_vectors[0])

array(['NEUTRAL'], dtype='<U8')

### EVALUATION  

mean accuracy

In [18]:
# Mean accuracy of the linear support vector machine on the balanced test set.
clf_svm.score(test_x_vectors, test_y)

0.6431623931623932

In [19]:
# Mean accuracy of the decision tree classifier on the balanced test set.
clf_dec.score(test_x_vectors, test_y)

0.46153846153846156

In [20]:
# Mean accuracy of naive Bayes (GaussianNB) on the balanced test set; dense input as in fitting.
clf_gnb.score(test_x_vectors.toarray(), test_y)

0.4465811965811966

In [21]:
# Mean accuracy of logistic regression (newton-cg solver) on the balanced test set.
clf_log.score(test_x_vectors, test_y)

0.6773504273504274

## F1 Score

In [22]:
from sklearn.metrics import f1_score


# Per-class F1 for each classifier, printed in the order SVM, decision tree,
# Gaussian naive Bayes, logistic regression. Within each row the scores follow
# label_order: POSITIVE, NEUTRAL, NEGATIVE.
label_order = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]

# Each model is paired with the input form it was fitted on (GaussianNB: dense).
models_and_inputs = (
  (clf_svm, test_x_vectors),
  (clf_dec, test_x_vectors),
  (clf_gnb, test_x_vectors.toarray()),
  (clf_log, test_x_vectors),
)

for model, model_input in models_and_inputs:
  print(f1_score(test_y, model.predict(model_input), average=None, labels=label_order))

[0.73684211 0.54368932 0.64473684]
[0.49529781 0.43949045 0.44884488]
[0.46357616 0.4137931  0.46853147]
[0.75       0.59210526 0.68243243]


In [23]:
# Class counts for the training labels, then the test labels (POSITIVE,
# NEUTRAL, NEGATIVE in each group), followed by the grand total.
sentiment_order = (Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE)

total_observations = 0
for label_list in (train_y, test_y):
  for sentiment in sentiment_order:
    class_count = label_list.count(sentiment)
    print(class_count)
    total_observations += class_count
  # Separator after each group.
  print("__")
  print(" ")

print(" Total observations : ", total_observations)


488
488
488
__
 
156
156
156
__
 
 Total observations :  1932
