### Sentiment Analysis
+ PAPER: https://journalofbigdata.springeropen.com/articles/10.1186/s40537-015-0015-2
+ POS TAGS: https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk
+ Hash function: https://stackoverflow.com/questions/16008670/how-to-hash-a-string-into-8-digits
+ NOTES: The negation analysis requires the review text to be lower-cased and stripped of punctuation first, because the negation list uses apostrophe-free forms such as `dont` and `cant`.
+ Bag of Words vectorizer: https://www.mygreatlearning.com/blog/bag-of-words/


# Preprocessing
---



### Imports

In [None]:
import hashlib
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize


# Download the NLTK packages 'punkt' and 'averaged_perceptron_tagger'.
for resource in ('punkt', 'averaged_perceptron_tagger'):
  nltk.download(resource)

# lines=True: the file holds one JSON object per line.
df = pd.read_json('/content/reviews_Musical_Instruments_5.json', lines=True)

#df = pd.read_csv('/content/reviews_Musical_Instruments_5.csv')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
df = df[['reviewText', 'overall']]

In [None]:
df['reviewText'] = df['reviewText'].str.lower()

In [None]:
df

Unnamed: 0,reviewText,overall
0,"not much to write about here, but it does exac...",5
1,the product does exactly as it should and is q...,5
2,the primary job of this device is to block the...,5
3,nice windscreen protects my mxl mic and preven...,5
4,this pop filter is great. it looks and perform...,5
...,...,...
10256,"great, just as expected. thank to all.",5
10257,i've been thinking about trying the nanoweb st...,5
10258,i have tried coated strings in the past ( incl...,4
10259,"well, made by elixir and developed with taylor...",4


In [None]:
# Strip punctuation: remove every run of characters that is neither a word
# character nor whitespace. regex=True must be explicit because pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the
# text unchanged.
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]+', '', regex=True)

  """Entry point for launching an IPython kernel.


### Divide dataset by star

In [None]:
# One sub-frame per value of the 'overall' column, from 1 star up to 5 stars.
df_1, df_2, df_3, df_4, df_5 = (
    df[df['overall'].eq(stars)] for stars in range(1, 6)
)

### Analyze dataset
- The dataset is heavily imbalanced: the class sizes shown below range from 217 one-star reviews to 6,938 five-star reviews. This imbalance has to be accounted for later.



In [None]:
df_5.shape

(6938, 2)

In [None]:
df_4.shape

(2084, 2)

In [None]:
df_3.shape

(772, 2)

In [None]:
df_2.shape

(250, 2)

In [None]:
df_1.shape

(217, 2)

In [None]:
# Per-star frames collected in ascending star order (index 0 -> 1 star).
df_array = [df_1, df_2, df_3, df_4, df_5]

## Methods


---


In [None]:
def tokenizer_(df):
  """Tokenize every review in ``df['reviewText']``.

  Each review is split into maximal runs of word characters with an
  ``nltk.RegexpTokenizer``.

  Returns:
    A list holding one list of token strings per review, in row order.
  """
  word_splitter = nltk.RegexpTokenizer(r"\w+")
  return [word_splitter.tokenize(text) for text in df['reviewText']]

  

In [None]:
def POS_tagger(reviews_tok):
  """POS-tag tokenized reviews and flatten the result.

  Args:
    reviews_tok: iterable of token lists, one per review.

  Returns:
    A single flat list of the items produced by ``nltk.pos_tag`` over all
    reviews. Review boundaries are not preserved. Items whose length is not
    2 are skipped (the original author observed such items but did not
    identify the cause).
  """
  tagged = []
  for review_tokens in reviews_tok:
    tagged.extend(
        pair for pair in nltk.pos_tag(review_tokens) if len(pair) == 2
    )
  return tagged

In [None]:
def filt(tag):
  """Keep only tagged tokens whose POS tag is in the adjective/adverb/verb set.

  Args:
    tag: iterable of ``(word, pos_tag)`` pairs.

  Returns:
    A list of the pairs whose second element is one of the Penn Treebank
    tags listed in ``keep_tags``, in their original order.
  """
  keep_tags = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS',
               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WRB']
  return [pair for pair in tag if pair[1] in keep_tags]
    

In [None]:
def tokens_prep(df):
  """Return the filtered, POS-tagged tokens for ``df``.

  Previously this ignored ``df`` (it always processed the global ``df_5``)
  and returned nothing; it now processes the frame it is given and returns
  the result of ``word_tokens``.
  """
  return word_tokens(df)

In [None]:
def word_tokens(df):
  """Run the full token pipeline on ``df``: tokenize, POS-tag, then filter.

  Returns the flat list produced by ``filt`` for all reviews in ``df``.
  """
  return filt(POS_tagger(tokenizer_(df)))

In [None]:
from collections import Counter
def most_common_tokens(tokens, df):
  """Return the most frequent tagged tokens of one star class.

  Args:
    tokens: list of ``(word, pos_tag)`` pairs, as produced by
      ``word_tokens``. If ``None``, they are computed from ``df``.
    df: the frame the tokens came from; only its length is used to size the
      result when ``tokens`` is given.

  Returns:
    ``(common, common_plus)`` where ``common_plus`` is the list of
    ``((word, pos_tag), count)`` entries from ``Counter.most_common`` and
    ``common`` holds just the words, in the same order.
  """
  # Use the tokens the caller already computed; the original overwrote them
  # by re-running the whole tokenize/tag/filter pipeline on df.
  if tokens is None:
    tokens = word_tokens(df)

  # The cut-off is 25% of the number of reviews (rows of df), not of the
  # number of distinct tokens.
  top_n = int(len(df) * 0.25)
  most_occur = Counter(tokens).most_common(top_n)
  print(most_occur)

  common = [entry[0][0] for entry in most_occur]
  common_plus = list(most_occur)
  return common, common_plus
  

In [None]:
# vectorizes a dataframe
def vectorizer(common, df):
  """Encode each review as a hashed bag-of-words presence vector.

  Args:
    common: list of vocabulary words (strings).
    df: object whose ``"reviewText"`` entry iterates over review strings.

  Returns:
    A list with one integer in ``[0, 1000000)`` per review: the SHA-1 of
    the 0/1 presence vector's string form, reduced modulo 1000000.
  """
  word_pattern = re.compile(r"\w+")
  full_vec = []
  for review in df["reviewText"]:
    # Whole-word membership. The original tested `i[0] in j`, i.e. whether
    # the first *character* of each word occurred anywhere in the review.
    review_words = set(word_pattern.findall(review))
    test_vec = [1 if word in review_words else 0 for word in common]

    # hash the 1s and 0s vector
    # Built-in hash() of a str is salted per interpreter process, so it
    # would give different features on every kernel restart; SHA-1 is stable.
    digest = hashlib.sha1(str(test_vec).encode("utf-8")).hexdigest()
    full_vec.append(int(digest, 16) % 1000000)

  return full_vec


## Creating Token Vectors

---
There has to be a better way to do this part without making separate variables for each category. I know there is. But I'm on a deadline here so this is what we're doing.



In [None]:
bow = []

# Filtered tokens for each star class, followed by that class's most
# frequent tokens; the size of each selection is printed.
tokens_1 = word_tokens(df_1)
tokens1, tokens1_plus = most_common_tokens(tokens_1, df_1)
print(len(tokens1))

tokens_2 = word_tokens(df_2)
tokens2, tokens2_plus = most_common_tokens(tokens_2, df_2)
print(len(tokens2))

tokens_3 = word_tokens(df_3)
tokens3, tokens3_plus = most_common_tokens(tokens_3, df_3)
print(len(tokens3))

tokens_4 = word_tokens(df_4)
tokens4, tokens4_plus = most_common_tokens(tokens_4, df_4)
print(len(tokens4))

tokens_5 = word_tokens(df_5)
tokens5, tokens5_plus = most_common_tokens(tokens_5, df_5)
print(len(tokens5))

# Word lists per class, in ascending star order.
tokens_arr = [tokens1, tokens2, tokens3, tokens4, tokens5]

[(('is', 'VBZ'), 311), (('not', 'RB'), 199), (('was', 'VBD'), 167), (('have', 'VBP'), 142), (('i', 'JJ'), 129), (('are', 'VBP'), 103), (('just', 'RB'), 97), (('be', 'VB'), 81), (('so', 'RB'), 78), (('very', 'RB'), 66), (('when', 'WRB'), 60), (('get', 'VB'), 57), (('really', 'RB'), 55), (('had', 'VBD'), 55), (('has', 'VBZ'), 54), (('good', 'JJ'), 53), (('even', 'RB'), 51), (('too', 'RB'), 49), (('other', 'JJ'), 48), (('got', 'VBD'), 47), (('back', 'RB'), 45), (('only', 'RB'), 45), (('have', 'VB'), 43), (('buy', 'VB'), 43), (('i', 'VBP'), 42), (('then', 'RB'), 41), (('i', 'RB'), 39), (('cheap', 'JJ'), 37), (('use', 'VB'), 37), (('i', 'VB'), 36), (('few', 'JJ'), 35), (('did', 'VBD'), 34), (('bought', 'VBD'), 34), (('well', 'RB'), 33), (('were', 'VBD'), 32), (('more', 'JJR'), 31), (('am', 'VBP'), 31), (('work', 'VB'), 30), (('also', 'RB'), 30), (('now', 'RB'), 29), (('does', 'VBZ'), 28), (('same', 'JJ'), 28), (('used', 'VBN'), 27), (('great', 'JJ'), 27), (('do', 'VBP'), 27), (('bad', 'JJ')

In [None]:
len(tokens_arr)

5

In [None]:
# flatten bag of words: concatenate the per-class word lists in order
common = [word for class_words in tokens_arr for word in class_words]
common


In [None]:
bow_1 = vectorizer(common, df_1)

In [None]:
len(bow_1)

217

In [None]:
# Rating label column for the 1-star rows: one entry per vector in bow_1.
arr_1 = [1] * len(bow_1)

In [None]:
bow_2 = vectorizer(common, df_2)

In [None]:
# Rating label column for the 2-star rows: one entry per vector in bow_2.
arr_2 = [2] * len(bow_2)

In [None]:
bow_3 = vectorizer(common, df_3)

In [None]:
# Rating label column for the 3-star rows: one entry per vector in bow_3.
arr_3 = [3] * len(bow_3)

In [None]:
bow_4 = vectorizer(common, df_4)

In [None]:
# Rating label column for the 4-star rows: one entry per vector in bow_4.
arr_4 = [4] * len(bow_4)

In [None]:
bow_5 = vectorizer(common, df_5)

In [None]:
# Rating label column for the 5-star rows: one entry per vector in bow_5.
arr_5 = [5] * len(bow_5)

In [None]:
# 1-star frame: review text, hashed token vector and rating label.
data = dict(Reviews=df_1['reviewText'], Token_Vector=bow_1, Rating=arr_1)
df1_dataset = pd.DataFrame(data)
df1_dataset.head()

Unnamed: 0,Reviews,Token_Vector,Rating
52,it hums crackles and i think im having problem...,413234,1
89,im a procheapo and i hated this thing theyre n...,346160,1
223,received it in time standard blister packaging...,20636,1
224,these things are terrible one wouldnt fit in m...,346160,1
408,this is a cheap piece of junk that does what i...,369805,1


In [None]:
# 2-star frame: review text, hashed token vector and rating label.
data = dict(Reviews=df_2['reviewText'], Token_Vector=bow_2, Rating=arr_2)
df2_dataset = pd.DataFrame(data)
df2_dataset.head()

Unnamed: 0,Reviews,Token_Vector,Rating
15,i bought this to use with my keyboard i wasnt ...,20636,2
50,i didnt expect this cable to be so thin its ea...,987913,2
98,i bought this for my canon vixia hf g10 video ...,563117,2
144,i got 3 of these to plug xlr cables into the 1...,609355,2
286,the handle and spring strength make this uncom...,35838,2


In [None]:
# 3-star frame: review text, hashed token vector and rating label.
data = dict(Reviews=df_3['reviewText'], Token_Vector=bow_3, Rating=arr_3)
df3_dataset = pd.DataFrame(data)
df3_dataset.head()

Unnamed: 0,Reviews,Token_Vector,Rating
7,i now use this cable to run from the output of...,602996,3
12,if you are not use to using a large sustaining...,731413,3
59,works for practice its a guitar instrument ca...,936646,3
71,unbalanced guitar cable is notoriously noisy e...,81221,3
74,its a cable no frills tangles pretty easy and ...,20636,3


In [None]:
len(common)

2564

In [None]:
# 4-star frame: review text, hashed token vector and rating label.
data = dict(Reviews=df_4['reviewText'], Token_Vector=bow_4, Rating=arr_4)
df4_dataset = pd.DataFrame(data)
df4_dataset.head()


Unnamed: 0,Reviews,Token_Vector,Rating
11,i got it to have it if i needed it i have foun...,789251,4
16,this fender cable is the perfect length for me...,75053,4
20,this is a cool looking cheap cable which works...,20636,4
22,very good cable well made and it looks great w...,20636,4
25,cant go wrong great quality on a budget price ...,43320,4


In [None]:
# 5-star frame: review text, hashed token vector and rating label.
data = dict(Reviews=df_5['reviewText'], Token_Vector=bow_5, Rating=arr_5)
df5_dataset = pd.DataFrame(data)
df5_dataset.head()


Unnamed: 0,Reviews,Token_Vector,Rating
0,not much to write about here but it does exact...,388518,5
1,the product does exactly as it should and is q...,369805,5
2,the primary job of this device is to block the...,369805,5
3,nice windscreen protects my mxl mic and preven...,413234,5
4,this pop filter is great it looks and performs...,367907,5


In [None]:
df_dataset = []

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
df_dataset = pd.concat([df1_dataset, df2_dataset])

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
df_dataset = pd.concat([df_dataset, df3_dataset])

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
df_dataset = pd.concat([df_dataset, df4_dataset])

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
df_dataset = pd.concat([df_dataset, df5_dataset])

In [None]:
df_dataset

Unnamed: 0,Reviews,Token_Vector,Rating
52,it hums crackles and i think im having problem...,413234,1
89,im a procheapo and i hated this thing theyre n...,346160,1
223,received it in time standard blister packaging...,20636,1
224,these things are terrible one wouldnt fit in m...,346160,1
408,this is a cheap piece of junk that does what i...,369805,1
...,...,...,...
10251,true to phosphor bronze these strings have a m...,563117,5
10252,ive used elixirs for about five years now this...,563117,5
10254,i really like these strings while they are no...,413234,5
10256,great just as expected thank to all,667333,5


In [None]:
df_dataset.to_csv('music_dataset.csv')

## NEGATION PHRASES

In [None]:
# Negation words, written without apostrophes.
negation_list = ['no', 'not', 'none', 'nobody', 'nothing', 'neither', 'nowhere', 'never', 'hardly', 'scarcely', 'barely', 'doesnt', 'isnt', 'wasnt', 'shouldnt', 'wouldnt', 'couldnt', 'wont', 'cant', 'dont']
def neg_tokens(df):
  """Collect ``(negation_word, next_word)`` pairs from the filtered tokens of ``df``.

  The pairs are taken over the flat list returned by ``word_tokens``, so the
  "next word" is the next token that survived POS filtering; the final token
  is never treated as a negation because it has no successor.
  """
  tagged = word_tokens(df)
  return [
      (current[0], following[0])
      for current, following in zip(tagged, tagged[1:])
      if current[0] in negation_list
  ]

In [None]:
df1_neg = neg_tokens(df_1)
df2_neg = neg_tokens(df_2)
df3_neg = neg_tokens(df_3)
df4_neg = neg_tokens(df_4)
df5_neg = neg_tokens(df_5)

In [None]:
def most_common_tokens_neg(tokens, df):
  """Return the most frequent negation pairs of one star class.

  Args:
    tokens: list of hashable items (here ``(negation, next_word)`` pairs).
    df: frame whose length sets the cut-off: ``int(len(df) * 0.25)`` entries.

  Returns:
    ``(common, most_occur)`` where ``most_occur`` is the list of
    ``(item, count)`` entries from ``Counter.most_common`` and ``common``
    holds just the items, in the same order. ``most_occur`` is also printed.
  """
  top_n = int(len(df) * 0.25)
  most_occur = Counter(tokens).most_common(top_n)
  print(most_occur)
  common = [item for item, _count in most_occur]
  return common, most_occur

In [None]:
df1_MC, x1 = most_common_tokens_neg(df1_neg, df_1)


[(('not', 'work'), 9), (('not', 'use'), 8), (('not', 'buy'), 8), (('not', 'even'), 8), (('not', 'only'), 6), (('not', 'going'), 5), (('not', 'very'), 5), (('not', 'sure'), 4), (('not', 'be'), 4), (('not', 'recommend'), 4), (('not', 'i'), 4), (('never', 'seen'), 3), (('not', 'good'), 3), (('never', 'buy'), 3), (('dont', 'know'), 3), (('doesnt', 'seem'), 3), (('not', 'have'), 3), (('doesnt', 'have'), 3), (('never', 'had'), 3), (('not', 'as'), 3), (('not', 'really'), 3), (('dont', 'recommend'), 2), (('not', 'hold'), 2), (('cant', 'have'), 2), (('not', 'get'), 2), (('not', 'find'), 2), (('not', 'do'), 2), (('cant', 'get'), 2), (('not', 'fit'), 2), (('not', 'working'), 2), (('cant', 'believe'), 2), (('not', 'where'), 2), (('not', 'how'), 2), (('not', 'just'), 2), (('dont', 'even'), 2), (('not', 'maybe'), 2), (('never', 'heard'), 2), (('not', 'display'), 2), (('cant', 'is'), 2), (('not', 'appear'), 2), (('not', 'afford'), 2), (('never', 'again'), 2), (('no', 'longer'), 2), (('never', 'did'),

In [None]:
df2_MC, x2 = most_common_tokens_neg(df2_neg, df_2)

[(('not', 'recommend'), 8), (('not', 'good'), 8), (('not', 'be'), 8), (('not', 'have'), 8), (('not', 'sure'), 7), (('not', 'very'), 7), (('not', 'so'), 7), (('not', 'great'), 6), (('not', 'work'), 5), (('not', 'fit'), 5), (('not', 'enough'), 5), (('not', 'is'), 5), (('not', 'get'), 4), (('not', 'i'), 3), (('not', 'only'), 3), (('not', 'much'), 3), (('dont', 'sound'), 3), (('not', 'put'), 3), (('not', 'hold'), 3), (('not', 'happy'), 3), (('wouldnt', 'recommend'), 3), (('not', 'just'), 3), (('not', 'use'), 3), (('dont', 'really'), 3), (('not', 'going'), 3), (('not', 'even'), 3), (('doesnt', 'as'), 2), (('dont', 'particularly'), 2), (('not', 'thats'), 2), (('doesnt', 'need'), 2), (('dont', 'play'), 2), (('doesnt', 'really'), 2), (('not', 'buy'), 2), (('not', 'sound'), 2), (('dont', 'not'), 2), (('dont', 'buy'), 2), (('dont', 'get'), 2), (('doesnt', 'fit'), 2), (('not', 'too'), 2), (('not', 'break'), 2), (('wouldnt', 'be'), 2), (('not', 'catch'), 2), (('dont', 'have'), 2), (('not', 'bad'),

In [None]:
df3_MC, x3 = most_common_tokens_neg(df3_neg, df_3)

[(('not', 'sure'), 26), (('not', 'bad'), 21), (('not', 'have'), 18), (('not', 'as'), 16), (('not', 'be'), 15), (('dont', 'have'), 14), (('not', 'really'), 14), (('not', 'very'), 14), (('not', 'great'), 13), (('not', 'much'), 13), (('dont', 'know'), 12), (('not', 'so'), 11), (('not', 'recommend'), 9), (('not', 'fit'), 9), (('not', 'use'), 9), (('dont', 'think'), 8), (('never', 'had'), 8), (('not', 'good'), 8), (('not', 'get'), 8), (('not', 'is'), 8), (('not', 'i'), 8), (('dont', 'need'), 8), (('not', 'too'), 7), (('not', 'big'), 7), (('not', 'buy'), 7), (('not', 'are'), 7), (('not', 'work'), 7), (('wouldnt', 'recommend'), 7), (('cant', 'get'), 7), (('not', 'even'), 6), (('not', 'same'), 6), (('dont', 'get'), 6), (('dont', 'expect'), 6), (('not', 'enough'), 6), (('not', 'going'), 6), (('not', 'high'), 5), (('dont', 'really'), 5), (('cant', 'go'), 5), (('cant', 'really'), 5), (('dont', 'feel'), 5), (('not', 'come'), 5), (('not', 'greatest'), 5), (('dont', 'use'), 5), (('dont', 'seem'), 5)

In [None]:
df4_MC, x4 = most_common_tokens_neg(df4_neg, df_4)

[(('not', 'have'), 47), (('not', 'sure'), 44), (('not', 'be'), 41), (('dont', 'have'), 37), (('not', 'so'), 33), (('not', 'as'), 30), (('not', 'too'), 29), (('not', 'bad'), 27), (('not', 'much'), 24), (('not', 'very'), 24), (('dont', 'think'), 23), (('dont', 'know'), 23), (('not', 'quite'), 22), (('not', 'best'), 20), (('not', 'really'), 20), (('not', 'is'), 19), (('wont', 'be'), 19), (('not', 'big'), 19), (('not', 'going'), 18), (('never', 'had'), 17), (('dont', 'want'), 17), (('not', 'work'), 16), (('not', 'only'), 16), (('not', 'use'), 16), (('not', 'great'), 16), (('dont', 'need'), 15), (('not', 'i'), 14), (('cant', 'go'), 13), (('not', 'even'), 12), (('cant', 'really'), 12), (('not', 'had'), 12), (('dont', 'expect'), 11), (('doesnt', 'have'), 11), (('not', 'want'), 11), (('dont', 'really'), 10), (('not', 'good'), 10), (('cant', 'beat'), 10), (('no', 'longer'), 10), (('not', 'fit'), 10), (('not', 'being'), 10), (('not', 'get'), 9), (('not', 'hard'), 9), (('dont', 'is'), 9), (('cant

In [None]:
df5_MC, x5 = most_common_tokens_neg(df5_neg, df_5)

[(('not', 'be'), 120), (('not', 'have'), 117), (('dont', 'have'), 107), (('cant', 'go'), 87), (('not', 'sure'), 79), (('not', 'much'), 76), (('not', 'only'), 74), (('not', 'too'), 70), (('wont', 'be'), 63), (('never', 'had'), 58), (('dont', 'know'), 49), (('not', 'going'), 48), (('not', 'so'), 41), (('not', 'use'), 39), (('dont', 'need'), 37), (('dont', 'think'), 37), (('dont', 'really'), 37), (('dont', 'want'), 36), (('cant', 'beat'), 36), (('not', 'really'), 35), (('not', 'get'), 32), (('not', 'as'), 30), (('not', 'are'), 30), (('dont', 'get'), 29), (('not', 'i'), 29), (('cant', 'say'), 29), (('not', 'is'), 29), (('not', 'bad'), 28), (('not', 'just'), 28), (('not', 'very'), 27), (('never', 'have'), 27), (('no', 'longer'), 27), (('couldnt', 'be'), 26), (('not', 'big'), 26), (('not', 'had'), 25), (('not', 'even'), 24), (('cant', 'be'), 23), (('not', 'need'), 23), (('never', 'used'), 22), (('not', 'work'), 22), (('not', 'good'), 21), (('not', 'having'), 21), (('not', 'want'), 21), (('do

In [None]:
# create negative bag of words: per-class negation pairs, ascending star order
neg_BOW = [df1_MC, df2_MC, df3_MC, df4_MC, df5_MC]

In [None]:
# create flat bag of words: concatenate the per-class pair lists in order
common_neg = [pair for class_pairs in neg_BOW for pair in class_pairs]
common_neg


In [None]:
def POS_tagger_neg(reviews_tok):
  """POS-tag tokenized reviews, keeping one tagged list per review.

  Args:
    reviews_tok: iterable of token lists, one per review.

  Returns:
    A list with one list per review containing the items produced by
    ``nltk.pos_tag`` for that review; items whose length is not 2 are
    skipped.
  """
  return [
      [pair for pair in nltk.pos_tag(review_tokens) if len(pair) == 2]
      for review_tokens in reviews_tok
  ]

In [None]:
df1_rev_tok = tokenizer_(df_1)
df1_sent_tok = POS_tagger_neg(df1_rev_tok)

In [None]:
df1_sent_tok

[[('it', 'PRP'),
  ('hums', 'VBD'),
  ('crackles', 'NNS'),
  ('and', 'CC'),
  ('i', 'JJ'),
  ('think', 'VBP'),
  ('im', 'JJ'),
  ('having', 'VBG'),
  ('problems', 'NNS'),
  ('with', 'IN'),
  ('my', 'PRP$'),
  ('equipment', 'NN'),
  ('as', 'RB'),
  ('soon', 'RB'),
  ('as', 'IN'),
  ('i', 'JJ'),
  ('use', 'VBP'),
  ('any', 'DT'),
  ('of', 'IN'),
  ('my', 'PRP$'),
  ('other', 'JJ'),
  ('cords', 'NNS'),
  ('then', 'RB'),
  ('the', 'DT'),
  ('problem', 'NN'),
  ('is', 'VBZ'),
  ('gone', 'VBN'),
  ('hosa', 'JJ'),
  ('makes', 'VBZ'),
  ('some', 'DT'),
  ('other', 'JJ'),
  ('products', 'NNS'),
  ('that', 'WDT'),
  ('have', 'VBP'),
  ('good', 'JJ'),
  ('value', 'NN'),
  ('but', 'CC'),
  ('based', 'VBN'),
  ('on', 'IN'),
  ('my', 'PRP$'),
  ('experience', 'NN'),
  ('i', 'NN'),
  ('dont', 'VBP'),
  ('recommend', 'VB'),
  ('this', 'DT'),
  ('one', 'CD')],
 [('im', 'VB'),
  ('a', 'DT'),
  ('procheapo', 'NN'),
  ('and', 'CC'),
  ('i', 'NN'),
  ('hated', 'VBD'),
  ('this', 'DT'),
  ('thing', 'NN'),
 

In [None]:
df1_rev_tok

[['it',
  'hums',
  'crackles',
  'and',
  'i',
  'think',
  'im',
  'having',
  'problems',
  'with',
  'my',
  'equipment',
  'as',
  'soon',
  'as',
  'i',
  'use',
  'any',
  'of',
  'my',
  'other',
  'cords',
  'then',
  'the',
  'problem',
  'is',
  'gone',
  'hosa',
  'makes',
  'some',
  'other',
  'products',
  'that',
  'have',
  'good',
  'value',
  'but',
  'based',
  'on',
  'my',
  'experience',
  'i',
  'dont',
  'recommend',
  'this',
  'one'],
 ['im',
  'a',
  'procheapo',
  'and',
  'i',
  'hated',
  'this',
  'thing',
  'theyre',
  'noisy',
  'and',
  'the',
  'cables',
  'feel',
  'really',
  'cheap',
  'gummylike',
  'drop',
  'few',
  'more',
  'bucks',
  'and',
  'get',
  'something',
  'else'],
 ['received',
  'it',
  'in',
  'time',
  'standard',
  'blister',
  'packaging',
  'but',
  'the',
  'cable',
  'stopped',
  'working',
  'after',
  '45',
  'days',
  'since',
  'i',
  'was',
  'out',
  'of',
  '30',
  'days',
  'return',
  'time',
  'sent',
  'an',
  '

In [None]:
def df_tokenizer_neg(df_sent_tok):
  """Extract ``(negation_word, next_word)`` pairs separately for each review.

  Args:
    df_sent_tok: list of per-review lists of ``(word, pos_tag)`` pairs.

  Returns:
    A list with one list of pairs per review. A word counts as a negation
    when it is in ``negation_list``; the last word of a review is never
    examined because it has no successor.
  """
  full_vec = []
  for tagged_review in df_sent_tok:
    pairs = [
        (current[0], following[0])
        for current, following in zip(tagged_review, tagged_review[1:])
        if current[0] in negation_list
    ]
    full_vec.append(pairs)
  return full_vec

In [None]:
full_vec = df_tokenizer_neg(df1_sent_tok)

In [None]:
len(full_vec)

217

In [None]:
common_neg

In [None]:
def vectorize_neg(full_vec, vocabulary=None):
  """Encode each review's negation pairs as a hashed presence vector.

  Args:
    full_vec: list with one list of ``(negation, next_word)`` pairs per
      review.
    vocabulary: ordered list of pairs to test for. Defaults to the global
      ``common_neg``.

  Returns:
    A list with one integer in ``[0, 1000000)`` per review: the SHA-1 of
    the 0/1 presence vector's string form, reduced modulo 1000000.
  """
  if vocabulary is None:
    vocabulary = common_neg

  f_vec = []
  for review_pairs in full_vec:
    present = set(review_pairs)  # O(1) membership instead of a list scan
    t_vec = [1 if pair in present else 0 for pair in vocabulary]

    # Built-in hash() of a str is salted per interpreter process, so it
    # would give different features on every kernel restart; SHA-1 is stable.
    digest = hashlib.sha1(str(t_vec).encode("utf-8")).hexdigest()
    f_vec.append(int(digest, 16) % 1000000)
  return f_vec

In [None]:
x = vectorize_neg(full_vec)

In [None]:
len(x)

217

In [None]:
def negation(df_org):
  """Compute the hashed negation vector for every review in ``df_org``.

  Pipeline: tokenize -> per-review POS tagging -> negation-pair extraction
  -> ``vectorize_neg``. Returns the list produced by ``vectorize_neg``.
  """
  tagged_reviews = POS_tagger_neg(tokenizer_(df_org))
  negation_pairs = df_tokenizer_neg(tagged_reviews)
  return vectorize_neg(negation_pairs)


NameError: ignored

In [None]:
NEG_df1 = negation(df_1)
print(len(NEG_df1))


217


In [None]:
NEG_df2 = negation(df_2)
print(len(NEG_df2))

250


In [None]:

NEG_df3 = negation(df_3)
NEG_df4 = negation(df_4)
NEG_df5 = negation(df_5)

In [None]:
# Per-class negation vectors in ascending star order, then flattened into a
# single column.
total_NEG_col = [NEG_df1, NEG_df2, NEG_df3, NEG_df4, NEG_df5]

total_NEG_col_FL = [value for class_column in total_NEG_col for value in class_column]


In [None]:
df_dataset['Neg_Token_Vector']=total_NEG_col_FL

In [None]:
df_dataset

Unnamed: 0,Reviews,Token_Vector,Rating,Neg_Token_Vector
52,it hums crackles and i think im having problem...,413234,1,566754
89,im a procheapo and i hated this thing theyre n...,346160,1,712708
223,received it in time standard blister packaging...,20636,1,712708
224,these things are terrible one wouldnt fit in m...,346160,1,886937
408,this is a cheap piece of junk that does what i...,369805,1,712708
...,...,...,...,...
10251,true to phosphor bronze these strings have a m...,563117,5,745810
10252,ive used elixirs for about five years now this...,563117,5,712708
10254,i really like these strings while they are no...,413234,5,972789
10256,great just as expected thank to all,667333,5,712708


In [None]:
df_dataset.to_csv('music_dataset_w_neg.csv')

## Creating Token Score


---


In [None]:
tokens1

['is',
 'not',
 'was',
 'have',
 'i',
 'are',
 'just',
 'be',
 'so',
 'very',
 'when',
 'get',
 'really',
 'had',
 'has',
 'good',
 'even',
 'too',
 'other',
 'got',
 'back',
 'only',
 'have',
 'buy',
 'i',
 'then',
 'i',
 'cheap',
 'use',
 'i',
 'few',
 'did',
 'bought',
 'well',
 'were',
 'more',
 'am',
 'work',
 'also',
 'now',
 'does',
 'same',
 'used',
 'great',
 'do',
 'bad',
 'going',
 'new',
 'how',
 'want',
 'again',
 'dont',
 'been',
 'never']

In [None]:
tokens2_plus[0][0]

('is', 'VBZ')

### How many instances of each token in each star class?

In [None]:
#added 1 to get rid of issue
def token_counter(target_tokens_arr, flag, all_token_arrs=None):
  """Look up how often each target token occurs in every class's top list.

  Args:
    target_tokens_arr: list of ``(token, count)`` entries; only the token
      (element 0) is used.
    flag: ``1`` selects the global word lists ``tokens1_plus`` ..
      ``tokens5_plus``; any other value selects the negation lists
      ``x1`` .. ``x5``. Ignored when ``all_token_arrs`` is given.
    all_token_arrs: optional explicit list of per-class ``(token, count)``
      lists to use instead of the globals.

  Returns:
    One row per target token: ``[token, c_1 + 1, c_2 + 1, ...]`` where
    ``c_k`` is the token's count in class ``k``'s list, or 0 when absent.
    Every count is incremented by 1, so an absent token yields 1.
  """
  if all_token_arrs is None:
    if flag == 1:
      all_token_arrs = [tokens1_plus, tokens2_plus, tokens3_plus, tokens4_plus, tokens5_plus]
    else:
      all_token_arrs = [x1, x2, x3, x4, x5]

  # One dict per class for O(1) lookup. setdefault keeps the first
  # occurrence, matching the original first-match linear scan.
  lookups = []
  for token_arr in all_token_arrs:
    counts = {}
    for entry in token_arr:
      counts.setdefault(entry[0], entry[1])
    lookups.append(counts)

  tokens1_array = []
  for x in target_tokens_arr:
    temp_arr = [x[0]]
    # .get(..., 0) also covers an empty class list; the original reused
    # `flag` as the found-marker and could skip the entry in that case.
    temp_arr.extend(counts.get(x[0], 0) + 1 for counts in lookups)
    tokens1_array.append(temp_arr)

  return tokens1_array

In [None]:
# Per-class count tables for the word vocabulary (flag 1).
token_counter_1 = token_counter(tokens1_plus,1)
token_counter_2 = token_counter(tokens2_plus,1)  # closing parenthesis was missing
token_counter_3 = token_counter(tokens3_plus,1)
token_counter_4 = token_counter(tokens4_plus,1)
token_counter_5 = token_counter(tokens5_plus,1)

In [None]:
x1

In [None]:
token_counter_1_neg = token_counter(x1,0)
token_counter_2_neg = token_counter(x2,0)
token_counter_3_neg = token_counter(x3,0)
token_counter_4_neg = token_counter(x4,0)
token_counter_5_neg = token_counter(x5,0)

In [None]:
token_counter_5_neg

In [None]:
token_counter_1[0][1]

312

In [None]:
k = token_counter_1[0]
k[1]


#for k in token_counter_1:
  #print("k=",k)

k = token_counter_1[2]
k

[('was', 'VBD'), 168, 173, 377, 797, 2508]

In [None]:
# DataFrame.append was removed in pandas 2.0. Appending a list of rows was
# equivalent to wrapping it in a DataFrame and concatenating, done here
# explicitly.
# NOTE(review): this rebinds df_1..df_5 with the negation-count rows added,
# so re-running the cell grows them again and the lengths used afterwards
# include those rows -- confirm this is intended.
df_1 = pd.concat([df_1, pd.DataFrame(token_counter_1_neg)])
df_2 = pd.concat([df_2, pd.DataFrame(token_counter_2_neg)])
df_3 = pd.concat([df_3, pd.DataFrame(token_counter_3_neg)])
df_4 = pd.concat([df_4, pd.DataFrame(token_counter_4_neg)])
df_5 = pd.concat([df_5, pd.DataFrame(token_counter_5_neg)])

In [None]:
token_length_list = [len(df_1), len(df_2), len(df_3), len(df_4), len(df_5)]


In [None]:
token_length_list

[271, 312, 965, 2605, 8672]

In [None]:
token_counter_1[0][1]

312

In [None]:
token_counter_1[0][0+1]

312

In [None]:
#def score(i, token_count):

def score(token_counter, class_sizes=None, reference_size=None):
  """Compute a class-size-weighted average star value for each token.

  For a row ``[token, c_1, ..., c_5]`` the score is
  ``sum(x * y_x * c_x) / sum(y_x * c_x)`` over stars ``x = 1..5`` with
  ``y_x = reference_size / class_sizes[x - 1]``.

  Args:
    token_counter: list of rows ``[token, c_1, c_2, c_3, c_4, c_5]`` as
      built by the ``token_counter`` function.
    class_sizes: five per-class sizes. Defaults to the global
      ``token_length_list``.
    reference_size: numerator of the class weight. Defaults to
      ``len(df_5)``.

  Returns:
    One ``[token, c_1, score]`` row per input row, in input order. The
    original wrapped this in a second loop over the same list, which
    repeated the whole result ``len(token_counter)`` times.
  """
  if class_sizes is None:
    class_sizes = token_length_list
  if reference_size is None:
    reference_size = len(df_5)

  token1_scores = []
  for k in token_counter:
    top = 0
    bottom = 0
    for x in range(1, 6):
      y = reference_size / class_sizes[x - 1]
      top = top + (x * y * k[x])
      bottom = bottom + (y * k[x])

    token1_scores.append([k[0], k[1], top / bottom])

  return token1_scores

In [None]:
_1 = score(token_counter_1)
_2 = score(token_counter_2)
_3 = score(token_counter_3)
_4 = score(token_counter_4)
_5 = score(token_counter_5)



In [None]:
# Score tables of all five classes concatenated in ascending star order.
_total = [_1, _2, _3, _4, _5]
_total = [row for class_scores in _total for row in class_scores]



In [None]:
# tested the means here -- 5 doesn't work as expected. Not sure why. Probably will skew the results
# Build the frame directly from the score rows: each row mixes a tuple, an int
# and a float, which np.array cannot turn into a regular array (NumPy >= 1.24
# raises on such ragged input). Column 2 holds the score.
df_x = pd.DataFrame(_5)

df_x[2].mean()

  


2.8747890060645283

In [None]:
df1_sent_tok[0]

[('it', 'PRP'),
 ('hums', 'VBD'),
 ('crackles', 'NNS'),
 ('and', 'CC'),
 ('i', 'JJ'),
 ('think', 'VBP'),
 ('im', 'JJ'),
 ('having', 'VBG'),
 ('problems', 'NNS'),
 ('with', 'IN'),
 ('my', 'PRP$'),
 ('equipment', 'NN'),
 ('as', 'RB'),
 ('soon', 'RB'),
 ('as', 'IN'),
 ('i', 'JJ'),
 ('use', 'VBP'),
 ('any', 'DT'),
 ('of', 'IN'),
 ('my', 'PRP$'),
 ('other', 'JJ'),
 ('cords', 'NNS'),
 ('then', 'RB'),
 ('the', 'DT'),
 ('problem', 'NN'),
 ('is', 'VBZ'),
 ('gone', 'VBN'),
 ('hosa', 'JJ'),
 ('makes', 'VBZ'),
 ('some', 'DT'),
 ('other', 'JJ'),
 ('products', 'NNS'),
 ('that', 'WDT'),
 ('have', 'VBP'),
 ('good', 'JJ'),
 ('value', 'NN'),
 ('but', 'CC'),
 ('based', 'VBN'),
 ('on', 'IN'),
 ('my', 'PRP$'),
 ('experience', 'NN'),
 ('i', 'NN'),
 ('dont', 'VBP'),
 ('recommend', 'VB'),
 ('this', 'DT'),
 ('one', 'CD')]

In [None]:
_total[0][0]

('is', 'VBZ')

In [None]:
#count = []
#for i in df1_sent_tok:
#  for j in i:
#    for k in _total:
     
#      if j == k[0]:
#        #print(k[0])
        
 #       count.append(k[2])
  
#count
  
  

KeyboardInterrupt: ignored

## Finalizing

In [None]:
df_dataset

Unnamed: 0,Reviews,Token_Vector,Rating,Neg_Token_Vector
52,it hums crackles and i think im having problem...,413234,1,566754
89,im a procheapo and i hated this thing theyre n...,346160,1,712708
223,received it in time standard blister packaging...,20636,1,712708
224,these things are terrible one wouldnt fit in m...,346160,1,886937
408,this is a cheap piece of junk that does what i...,369805,1,712708
...,...,...,...,...
10251,true to phosphor bronze these strings have a m...,563117,5,745810
10252,ive used elixirs for about five years now this...,563117,5,712708
10254,i really like these strings while they are no...,413234,5,972789
10256,great just as expected thank to all,667333,5,712708


In [None]:
# Collapse the star rating into a three-way label:
# 1-2 stars -> 'Neg', 3 stars -> 'Neu', 4-5 stars -> 'Pos'.
rating_to_label = {1: 'Neg', 2: 'Neg', 3: 'Neu', 4: 'Pos', 5: 'Pos'}
df_dataset['Final'] = df_dataset['Rating'].map(rating_to_label)

In [None]:
df_dataset.head()

Unnamed: 0,Reviews,Token_Vector,Rating,Neg_Token_Vector,Final
52,it hums crackles and i think im having problem...,413234,1,566754,Neg
89,im a procheapo and i hated this thing theyre n...,346160,1,712708,Neg
223,received it in time standard blister packaging...,20636,1,712708,Neg
224,these things are terrible one wouldnt fit in m...,346160,1,886937,Neg
408,this is a cheap piece of junk that does what i...,369805,1,712708,Neg


In [None]:
# Drop the raw star rating now that 'Final' holds the label. DataFrame.drop
# returns a new frame, so the result must be assigned; otherwise the CSV
# written afterwards still contains 'Rating'. errors='ignore' keeps the cell
# re-runnable once the column is gone.
df_dataset = df_dataset.drop(columns=['Rating'], errors='ignore')
df_dataset

Unnamed: 0,Reviews,Token_Vector,Neg_Token_Vector,Final
52,it hums crackles and i think im having problem...,413234,566754,Neg
89,im a procheapo and i hated this thing theyre n...,346160,712708,Neg
223,received it in time standard blister packaging...,20636,712708,Neg
224,these things are terrible one wouldnt fit in m...,346160,886937,Neg
408,this is a cheap piece of junk that does what i...,369805,712708,Neg
...,...,...,...,...
10251,true to phosphor bronze these strings have a m...,563117,745810,Pos
10252,ive used elixirs for about five years now this...,563117,712708,Pos
10254,i really like these strings while they are no...,413234,972789,Pos
10256,great just as expected thank to all,667333,712708,Pos


In [None]:
df_dataset.to_csv('final_data.csv')