In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')

from mi_helper import *

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet
import re

from numpy import array 

In [3]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
print(dataset["feedback"].value_counts())
print(dataset.shape)
# data_test = pd.read_csv("test_set.csv", sep = "\t")
# data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
# dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True) #droppa recensioni con 3-4 stelle
print(dataset.shape)
print(dataset["feedback"].value_counts())

1    2893
0     257
Name: feedback, dtype: int64
(3150, 5)
(2543, 5)
1    2286
0     257
Name: feedback, dtype: int64


In [4]:
custom_stopwords = ['echo',
 'alexa',
 'dot',
 "star",
 'amazon',
 'prime',
 '2nd',
 'generation',
 "fire",
 "stick",
 "firestick",
 "skype",
 "facetime",
 '1st',
 '3rd',
 '4th',
 '5th',
 "hub",
 "hulu",
 'google',
 'netflix',
 'youtube',
 'philip',
 'tp-link',
 'fourth',
 'roku',
 'i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'nor',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 "...",
 'should',
 "should've",
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'isn',
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 'wasn',
 'weren',
 'won',
 'wouldn']


def negation_handler(sentence):	
    temp = int(0)
    for i in range(len(sentence)):
        if sentence[i-1] in ['not',"n't", "no", "without"]:
            antonyms = []
            for syn in wordnet.synsets(sentence[i]):
                syns = wordnet.synsets(sentence[i])
                w1 = syns[0].name()
                temp = 0
                for l in syn.lemmas():
                    if l.antonyms():
                        antonyms.append(l.antonyms()[0].name())
                max_dissimilarity = 0
                for ant in antonyms:
                    syns = wordnet.synsets(ant)
                    w2 = syns[0].name()
                    syns = wordnet.synsets(sentence[i])
                    w1 = syns[0].name()
                    word1 = wordnet.synset(w1)
                    word2 = wordnet.synset(w2)
                    if isinstance(word1.wup_similarity(word2), float) or isinstance(word1.wup_similarity(word2), int):
                        temp = 1 - word1.wup_similarity(word2)
                    if temp>max_dissimilarity:
                        max_dissimilarity = temp
                        antonym_max = ant
                        sentence[i] = antonym_max
                        sentence[i-1] = ''
    while '' in sentence:
        sentence.remove('')
    return sentence

In [5]:
def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
    """
    if treebank_tag.startswith('J'):
        return "a"
    elif treebank_tag.startswith('V'):
        return "v"
    elif treebank_tag.startswith('N'):
        return "n"
    elif treebank_tag.startswith('R'):
        return "r"
    else:
        return "n"
        
pos_list = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

def tokenize_list_of_text(list_of_text, custom_stopwords = [], pos_filter = False, pos_list = []):
    """Tokenizza tutte le recensioni, pulisce da stopwords, elimina token <= 2 caratteri e lemmatizza. Ritorna la colonna delle recensioni."""

    tokenizer = nltk.tokenize.TweetTokenizer()
    lemmatizer = nltk.WordNetLemmatizer()
    detokenizer = TreebankWordDetokenizer()

    tokenized_reviews = []
    sent_tokenized_reviews = []
    for review in list_of_text: #pulisce le recensioni
        review = re.sub(r'\d+', '', review)
        clean_text = ""
        tokens = nltk.tokenize.word_tokenize(review, language='english', preserve_line=False)
        tokens = negation_handler(tokens)
        tokens = [w.lower() for w in tokens]
        tokens_pos = pos_tag(tokens)
        lemmatized_tokens = [(lemmatizer.lemmatize(w, get_wordnet_pos(pos)), pos) for w, pos in tokens_pos]
        if pos_filter:
            clean_tokens = [(w, pos) for w, pos in lemmatized_tokens if w not in string.punctuation and len(w)>2 and w not in custom_stopwords and pos in pos_list]
        else:
            clean_tokens = [(w, pos) for w, pos in lemmatized_tokens if w not in string.punctuation and len(w)>2 and w not in custom_stopwords]
        sent_tokenized_reviews.append([w for w, pos in clean_tokens])
        tokenized_reviews.append(detokenizer.detokenize([w for w, pos in clean_tokens]))
    
    n_tokens = []
    for sent in sent_tokenized_reviews:
        for w in sent:
            n_tokens.append(w)
    print("total number of tokens extracted are:", len(set(n_tokens)))
    return tokenized_reviews,  sent_tokenized_reviews # ritorna una tupla!

In [6]:
tokenized_reviews, sentences = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords, pos_filter=False, pos_list = pos_list)

dataset["verified_reviews"] = tokenized_reviews

total number of tokens extracted are: 2767


In [126]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
def create_balanced_dataset(dataset):
    """Bilancia il dataset uniformando il numero di recensioni negative e positive. Prende in input il dataset"""
    reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
    reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
    reviews_1.sort(key=len, reverse = True)
    sample_1 = reviews_1[:len(reviews_0)]
    verified_reviews_sample = []
    feedback_sample = []
    verified_reviews_sample.extend(sample_1)
    verified_reviews_sample.extend(reviews_0)
    feedback_sample.extend([1 for i in range(len(sample_1))])
    feedback_sample.extend([0 for i in range(len(reviews_0))])
    dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
    print(dataset["feedback"].value_counts())
    return dataset


dataset = create_balanced_dataset(dataset)

1    257
0    257
Name: feedback, dtype: int64


In [7]:
dataset

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,love,1
1,5,31-Jul-18,Charcoal Fabric,love,1
3,5,31-Jul-18,Charcoal Fabric,lot fun thing old learns dinosaur control ligh...,1
4,5,31-Jul-18,Charcoal Fabric,music,1
5,5,31-Jul-18,Heather Gray Fabric,receive gift need another bluetooth something ...,1
...,...,...,...,...,...
3144,5,30-Jul-18,Black Dot,love,1
3145,5,30-Jul-18,Black Dot,perfect kid adult everyone,1
3146,5,30-Jul-18,Black Dot,listen music search location check time look w...,1
3147,5,30-Jul-18,Black Dot,love thing run entire home light thermostat fr...,1


In [28]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 4) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [29]:
text_counts

<2543x1462 sparse matrix of type '<class 'numpy.int64'>'
	with 23321 stored elements in Compressed Sparse Row format>

In [30]:
len(cv.vocabulary_)

1462

In [31]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'love': 701,
 'lot': 697,
 'fun': 435,
 'thing': 1247,
 'old': 860,
 'control': 232,
 'light': 637,
 'play': 903,
 'game': 446,
 'like': 643,
 'nice': 845,
 'sound': 1161,
 'music': 800,
 'lot fun': 698,
 'control light': 233,
 'play game': 906,
 'game like': 447,
 'nice sound': 848,
 'sound play': 1169,
 'play music': 909,
 'receive': 1009,
 'gift': 451,
 'need': 832,
 'bluetooth': 127,
 'easily': 329,
 'smart': 1139,
 'speaker': 1181,
 'wait': 1371,
 'bluetooth play': 129,
 'music easily': 809,
 'smart speaker': 1147,
 'bluetooth play music': 130,
 'think': 1260,
 'purchase': 968,
 'work': 1418,
 'room': 1047,
 'house': 547,
 'really': 998,
 'feature': 414,
 'offer': 858,
 'room house': 1049,
 'really like': 1003,
 'like feature': 650,
 'light house': 639,
 'look': 687,
 'great': 472,
 'look great': 694,
 'listen': 666,
 'song': 1154,
 'news': 842,
 'weather': 1396,
 'information': 569,
 'love listen': 728,
 'news weather': 844,
 'information great': 570,
 'send': 1077,
 'year': 145

In [33]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1462

In [34]:
X_train[0,:] # prima recensione

<1x1462 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [35]:
print(X_train[0,:])

  (0, 1247)	2
  (0, 968)	1
  (0, 666)	1
  (0, 455)	1
  (0, 423)	1
  (0, 144)	1
  (0, 664)	1
  (0, 424)	1
  (0, 89)	1
  (0, 1379)	1
  (0, 276)	1
  (0, 505)	1
  (0, 506)	1
  (0, 60)	1
  (0, 1449)	1
  (0, 278)	1
  (0, 1314)	1
  (0, 213)	1
  (0, 1248)	1
  (0, 1383)	1
  (0, 874)	1


In [36]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['thing', 'purchase', 'listen', 'good', 'flash', 'briefing', 'list',
        'flash briefing', 'awesome', 'want', 'definitely', 'grocery',
        'grocery list', 'app', 'worth', 'definitely worth', 'unretentive',
        'complaint', 'thing awesome', 'want listen', 'pair'], dtype='<U25')]

In [37]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

thing 2
purchase 1
listen 1
good 1
flash 1
briefing 1
list 1
flash briefing 1
awesome 1
want 1
definitely 1
grocery 1
grocery list 1
app 1
worth 1
definitely worth 1
unretentive 1
complaint 1
thing awesome 1
want listen 1
pair 1


In [69]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=1462)  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [70]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

1462

In [71]:
X_train_sel

<2034x1462 sparse matrix of type '<class 'numpy.int64'>'
	with 18748 stored elements in Compressed Sparse Row format>

In [72]:
print(X_test_sel[0,:])

  (0, 701)	1
  (0, 1161)	1
  (0, 472)	1
  (0, 330)	1
  (0, 1320)	1
  (0, 219)	1
  (0, 497)	1
  (0, 720)	1
  (0, 1163)	1
  (0, 332)	1
  (0, 498)	1


In [73]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['app', 'awesome', 'briefing', 'complaint', 'definitely',
       'definitely worth', 'flash', 'flash briefing', 'good', 'grocery',
       'grocery list', 'list', 'listen', 'pair', 'purchase', 'thing',
       'thing awesome', 'unretentive', 'want', 'want listen', 'worth'],
      dtype='<U25')]


In [74]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [75]:
print(X_train_vec[0,:])

  (0, 1449)	0.19705656278639627
  (0, 1383)	0.27840639840927134
  (0, 1379)	0.15825667376474645
  (0, 1314)	0.24703612229954555
  (0, 1248)	0.25274338253796175
  (0, 1247)	0.260528684704419
  (0, 968)	0.14538929447401
  (0, 874)	0.25274338253796175
  (0, 666)	0.15459701490117936
  (0, 664)	0.18852167658100372
  (0, 506)	0.24209226860678018
  (0, 505)	0.24209226860678018
  (0, 455)	0.1287168494001585
  (0, 424)	0.25274338253796175
  (0, 423)	0.2377314805978338
  (0, 278)	0.24209226860678018
  (0, 276)	0.20141735079534265
  (0, 213)	0.21642925273547062
  (0, 144)	0.23383062279826533
  (0, 89)	0.1735097746408758
  (0, 60)	0.19076623686416105


In [76]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['app', 'awesome', 'briefing', 'complaint', 'definitely',
        'definitely worth', 'flash', 'flash briefing', 'good', 'grocery',
        'grocery list', 'list', 'listen', 'pair', 'purchase', 'thing',
        'thing awesome', 'unretentive', 'want', 'want listen', 'worth'],
       dtype='<U25')]

In [77]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

app 0.1971 1
awesome 0.2784 1
briefing 0.1583 1
complaint 0.247 1
definitely 0.2527 1
definitely worth 0.2605 1
flash 0.1454 1


In [51]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [163]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
2653,reboot,1.0,1
4285,malfunction,1.0,1
7706,let,1.0,1
3615,worthless,1.0,1
1471,radio,1.0,1
3645,good,1.0,1
7965,order,1.0,1
7658,disconnect,1.0,1
3780,return,1.0,1
5024,feature,1.0,1


In [168]:
X_train_vec

<411x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 10194 stored elements in Compressed Sparse Row format>

In [78]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [79]:
len(predictions), sum(predictions)

(509, 477)

In [81]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.75      0.55      0.63        44
           1       0.96      0.98      0.97       465

    accuracy                           0.94       509
   macro avg       0.85      0.76      0.80       509
weighted avg       0.94      0.94      0.94       509



In [82]:
feature_names = cv.get_feature_names_out()
feature_names

array(["'", "' t", "' ve", ..., 'yesterday', 'young', 'zigbee'],
      dtype=object)

In [83]:
print(select.scores_)

[20.14864446  4.64998028  0.10859556 ...  0.2339374   0.46787479
 10.10471727]


In [84]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1462

In [85]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(143.51178383242657, True, 'stop'),
 (118.06042183910712, True, 'return'),
 (118.06042183910712, True, 'month'),
 (105.01799146990228, True, 'device'),
 (104.8004965542845, True, 'try'),
 (80.9782204891536, True, 'send'),
 (77.34342065573009, True, 'disappointed'),
 (75.91332237848235, True, 'unplug'),
 (75.91332237848235, True, 'half'),
 (73.20862240991252, True, 'idle')]

In [86]:
type(svm)

sklearn.svm._classes.LinearSVC

In [87]:
len(svm.coef_)

1

In [88]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

1359

In [89]:
feats_w_classifier_weight[-100:] #features positive

[(0.4187849229630476, 'entertainment'),
 (0.4204812769684143, 'smart bulb'),
 (0.42070310205828776, 'spotify'),
 (0.4217449194711628, 'recall'),
 (0.4227318090506695, 'gen'),
 (0.4234822829184993, 'group'),
 (0.42492796299904495, 'mainly'),
 (0.42637628824391327, 'work need'),
 (0.4296407034370381, 'thanks'),
 (0.43639178970809706, 'television'),
 (0.4378659207839122, 'regular'),
 (0.43793179638631374, 'thing work'),
 (0.43905302679606173, 'perform'),
 (0.43907515162969146, 'product work'),
 (0.43907849941939187, 'original'),
 (0.4406303716875634, 'stuff'),
 (0.4421904138694338, 'great little'),
 (0.4435159530682687, 'include'),
 (0.4447592590025433, 'new'),
 (0.44838430965363096, 'ease'),
 (0.45124245448902167, 'easy use'),
 (0.451481081354529, 'ring'),
 (0.4537306539171337, 'buy day'),
 (0.453751143342791, 'want bedroom'),
 (0.45972859553015855, 'base'),
 (0.4600166567666674, 'different'),
 (0.4616156799535185, 'satisfied'),
 (0.46397992361707335, 'primarily'),
 (0.4672360849146571, 

In [90]:
feats_w_classifier_weight[:100] #features negative

[(-1.7639938991801491, 'return'),
 (-1.6537765242315596, 'terrible'),
 (-1.6420151084897188, 'poor'),
 (-1.6090685590092182, 'idle'),
 (-1.5040417741823626, 'useless'),
 (-1.4572855429895542, 'adapter'),
 (-1.4518721180490894, 'dont'),
 (-1.4481631594748765, 'stop'),
 (-1.413233447399262, 'speak'),
 (-1.3580737809968595, 'siri'),
 (-1.3386968512572432, 'disconnect'),
 (-1.29961851186135, 'odd'),
 (-1.2966500289893972, 'month'),
 (-1.2830713911932554, 'mode'),
 (-1.2817504201646035, 'unplug'),
 (-1.2452082173766401, 'half'),
 (-1.2284555356652729, 'malfunction'),
 (-1.2190139770168391, 'sad'),
 (-1.2014263216212686, 'realize'),
 (-1.197559842233605, 'honestly'),
 (-1.1917114110328355, 'send'),
 (-1.162401719409023, 'slow'),
 (-1.1196395358886133, 'hardly'),
 (-1.1124216113332002, 'disappointed'),
 (-1.0950953240594261, 'disappointing'),
 (-1.0923507130956045, 'stop work'),
 (-1.0869818696413311, 'sound terrible'),
 (-1.0703928266032325, 'feature'),
 (-1.0672783418238003, 'party'),
 (-1.

In [91]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
1358,2.737147,love
1357,1.5077,great
1356,1.464291,easy
1355,1.212697,enjoy
1354,1.049035,quickly
1353,0.997567,far
1352,0.919679,dad
1351,0.917233,fun
1350,0.866686,best
1349,0.865799,learn


In [92]:
stringa = "the product is disappointing, not good at all"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of tokens extracted are: 5


['the product disappointing evil all']

In [93]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [94]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.14      1.00      0.24         6
           1       1.00      0.92      0.96       503

    accuracy                           0.93       509
   macro avg       0.57      0.96      0.60       509
weighted avg       0.99      0.93      0.95       509



In [None]:
len(Y_test)

509

In [None]:
# WE CAN PASS W2V VECTORS AS SVM TRAINING