## Load the Data

In [11]:
import json
import numpy as np

path = 'data/sentiment/Books_small_10000.json'

# The file holds one JSON document per line; parse each line into its own record.
with open(path, 'r') as f:
    text = [json.loads(comment) for comment in f]

In [12]:
# Review bodies, in the same order as the loaded records.
review = [line['reviewText'] for line in text]

# Binary sentiment target: ratings of 3 or lower are Negative (0),
# anything higher is Positive (1).
label = [0 if line['overall'] <= 3 else 1 for line in text]


## Cleaning the Data

In [None]:
import re, string
from nltk.corpus import stopwords

def cleaning_data(data):
    """Flatten reviews into a single list of cleaned word tokens.

    Every review is split on whitespace. Each resulting token has the
    characters in ``string.punctuation`` removed and is kept only if what
    remains is purely alphabetic, is not in the NLTK English stop-word list,
    and is longer than two characters. The token counts before and after
    cleaning are printed.

    Parameters
    ----------
    data : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        The surviving tokens in their original order. Duplicates are kept
        and the case of each token is left unchanged.
    """
    tokens = [word for comment in data for word in comment.split()]
    print(f'length before the preprocessing phase: {len(tokens)}')

    re_sub = re.compile('[%s]' % re.escape(string.punctuation))

    # A set gives constant-time membership tests; scanning the list returned
    # by NLTK once per token is needlessly slow on a large corpus.
    stop_words = set(stopwords.words('english'))

    cleaned = []
    for raw_token in tokens:
        word = re_sub.sub('', raw_token)
        # NOTE(review): the stop-word comparison is case-sensitive and tokens
        # are not lowercased here -- confirm that this is intended.
        if word.isalpha() and word not in stop_words and len(word) > 2:
            cleaned.append(word)

    print(f'length after the preprocessing phase: {len(cleaned)}')

    return cleaned


In [None]:
# Clean every review and flatten the result into one token list.
# (The cleaning function defined in this notebook is `cleaning_data`.)
tokens = cleaning_data(review)

## Creating the Vocab

In [None]:
from collections import Counter

# Count how often each cleaned token occurs.
Vocab = Counter(tokens)

# Keep only tokens that reach the minimum number of occurrences.
n_occurence = 2
Vocab = [word for word, count in Vocab.items() if count >= n_occurence]
print(f'Vocabulary Size {len(Vocab)}' )

### Saving the vocabulary

In [None]:
def save_file(data, filename):
    """Write the strings in ``data`` to ``filename``, one per line.

    Items are joined with ``'\\n'``; no trailing newline is added after the
    last item. An existing file is overwritten.

    Parameters
    ----------
    data : iterable of str
        Lines to write.
    filename : str
        Path of the file to create or overwrite.
    """
    save_data = '\n'.join(data)
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w') as file:
        file.write(save_data)
    
# Persist the vocabulary, one word per line.
save_file(data=Vocab, filename='Vocab.txt')

### Filtering the input_data with known vocab words

In [45]:
def filtering_input_data(input_data, Vocab):
    """Reduce each review to the words that appear in the vocabulary.

    Each review is split on whitespace, and the tokens that are members of
    ``Vocab`` are re-joined with single spaces. Tokens are compared verbatim:
    no lowercasing or punctuation stripping is applied here.

    Parameters
    ----------
    input_data : iterable of str
        Review texts.
    Vocab : iterable of str
        Known vocabulary words.

    Returns
    -------
    list of str
        One filtered sentence per input review, in the same order. A review
        with no known words (including an empty review) yields ``''``.
    """
    # Build the lookup once; testing membership against a list would scan
    # the whole vocabulary for every word of every review.
    known_words = set(Vocab)

    # The sentence is built per review, so a review without any words can
    # never pick up the sentence left over from the previous review.
    return [
        ' '.join(word for word in comment.split() if word in known_words)
        for comment in input_data
    ]

In [None]:
# Drop every word that is not part of the vocabulary, review by review.
final_data = filtering_input_data(input_data=review, Vocab=Vocab)

# Persist the filtered reviews, one per line.
save_file(data=final_data, filename='final_data.txt')

### Creating the Bag of Words

In [65]:
# Reload the saved vocabulary: one word per line.
with open('Vocab.txt', 'r') as vocab_file:
    Vocab = vocab_file.read().split('\n')

# Reload the filtered reviews: one review per line. Splitting on '\n'
# keeps empty lines, so reviews that were filtered down to nothing survive.
with open('final_data.txt', 'r') as data_file:
    final_data = data_file.read().split('\n')
    


In [67]:
# Now that we have created the vocabulary and filtered the data, we can build the bag of words
from keras.preprocessing.text import Tokenizer

# Keep the vocabulary as a set for fast membership checks later on.
Vocab = set(Vocab)


tokenizer = Tokenizer()
# Fit on the words in sorted order. Iterating a set of strings directly gives
# an order that can change from one interpreter run to the next (string hash
# randomization), which would make the word -> column assignment, and hence
# the layout of X, differ between otherwise identical runs.
tokenizer.fit_on_texts(sorted(Vocab))

# One row per review, one column per tokenizer index, holding word counts.
X = tokenizer.texts_to_matrix(final_data, mode = 'count')

In [68]:
import numpy as np
# Target vector built from the 0/1 sentiment labels.
y = np.asarray(label)
y.shape

(10000,)

In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Shapes of the four splits, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

(8000, 18408)
(8000,)
(2000, 18408)
(2000,)


In [71]:
from sklearn.svm import SVC

# Baseline model: a linear-kernel SVM fitted on the bag-of-words counts.
svc_clf = SVC(kernel='linear').fit(x_train, y_train)

# Score the baseline on the held-out split.
svc_clf.score(x_test, y_test)


0.8255

In [29]:
from sklearn.model_selection import GridSearchCV

# Search grid: regularisation strength C crossed with the kernel type.
parameters = dict(
    C=(1, 2, 4, 8, 16, 32),
    kernel=('linear', 'rbf'),
)

# 6-fold cross-validated search over the grid.
clf = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=6)
clf.fit(x_train, y_train)

GridSearchCV(cv=6, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 2, 4, 8, 16, 32),
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [30]:
# Score of the grid-search result on the held-out split.
print(clf.score(X=x_test, y=y_test))

0.846


In [111]:
text = 'you are so bad and so ugly!'

# Reuse the same vocabulary filter that was applied to the training reviews
# instead of repeating its loop here.
# NOTE(review): words are compared verbatim after a whitespace split, so a
# token that still carries punctuation (e.g. 'ugly!') cannot match a
# vocabulary built from punctuation-stripped tokens -- confirm whether the
# sample should be cleaned first.
final_text = filtering_input_data([text], Vocab)

In [112]:
print(text)

# Turn the filtered sample into a count vector using the fitted tokenizer.
# `final_text` is rebound from a list of strings to the resulting matrix.
final_text = tokenizer.texts_to_matrix(texts=final_text, mode='count')
print(final_text)

you are so bad and so ugly!
[[0. 0. 0. ... 0. 0. 0.]]


In [113]:
# Predict with the baseline linear SVC (svc_clf), not the grid-search model (clf).
svc_clf.predict(X=final_text)

array([1])

In [52]:
# Inspect the whitespace tokenisation of the current `text`.
# NOTE(review): this cell's execution count (52) is lower than that of the
# cell that assigns the sample sentence (111), and the stored output shows
# a different sentence -- the output is stale; re-run or remove this cell.
text.split()

['This', 'is', 'not', 'a', 'good', 'product']