In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, MaxPooling1D, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger
import keras_metrics

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
import pickle

In [19]:
#from cleaner import Cleaner
#cleaner = Cleaner()
#cleaner.create_tokenizer_and_clean()

from cleaner_n_grams import Cleaner_ngrams
cleaner = Cleaner_ngrams()
cleaner.create_tokenizer_and_clean()

cleaning the text data for vectorizer
(100836, 13)
(100267, 13)
loading TfidfVectorizer


In [20]:
filename = "../data/json_bundle_reviews/large-bundle-clean.json"
df = pd.read_json(filename)

y = df.sentiment.values
sentences = df['content'].values
sentences.shape


(100267,)

In [21]:
positive_tweets = df[df.sentiment == 1]
negative_tweets = df[df.sentiment == 0]

print(len(positive_tweets))
print(len(negative_tweets))


positive_tweets_cutoff = int(len(positive_tweets) * (3./4.))
negative_tweets_cutoff = int(len(negative_tweets) * (3./4.))


training_tweets = pd.concat([positive_tweets[:positive_tweets_cutoff], negative_tweets[:negative_tweets_cutoff]])
test_tweets = pd.concat([positive_tweets[positive_tweets_cutoff:], negative_tweets[negative_tweets_cutoff:]])


training_tweets = training_tweets.iloc[np.random.permutation(len(training_tweets))].reset_index(drop=True)
test_tweets = test_tweets.iloc[np.random.permutation(len(test_tweets))].reset_index(drop=True)

49922
50345


In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB



def classify(training_tweets, test_tweets, count_vectorizer):


    training_features = count_vectorizer.transform(training_tweets.content.values)
    training_labels = training_tweets['sentiment'].values

    validation_features = count_vectorizer.transform(test_tweets.content.values)
    validation_labels = test_tweets['sentiment'].values

    classifier = MultinomialNB()
    classifier.fit(training_features, training_labels)
    validation_predictions = classifier.predict(validation_features)

    confusion = confusion_matrix(validation_labels, validation_predictions)
    score = f1_score(validation_labels, validation_predictions)
    return classifier,count_vectorizer, score, confusion



In [23]:
ngram=(1, 1)
count_vectorizer = CountVectorizer(ngram_range=ngram, max_features=3000)
count_vectorizer.fit(df.content)

classifier,count_vectorizer, score, confusion = classify(training_tweets, test_tweets, count_vectorizer)

print ('Total tweets classified: ' + str(len(training_tweets)))
print (score)
print ('Confusion matrix:')
print(confusion)

ngram=(1, 2)
count_vectorizer = CountVectorizer(ngram_range=ngram,max_features=3000)
count_vectorizer.fit(df.content)

classifier,count_vectorizer, score, confusion = classify(training_tweets, test_tweets, count_vectorizer)

print ('Total tweets classified: ' + str(len(training_tweets)))
print (score)
print ('Confusion matrix:')
print(confusion)


ngram=(2, 3)
count_vectorizer = CountVectorizer(ngram_range=ngram,max_features=3000)
count_vectorizer.fit(df.content)

classifier,count_vectorizer, score, confusion = classify(training_tweets, test_tweets, count_vectorizer)

print ('Total tweets classified: ' + str(len(training_tweets)))
print (score)
print ('Confusion matrix:')
print(confusion)

Total tweets classified: 75199
0.9787458424596415
Confusion matrix:
[[12479   108]
 [  416 12065]]
Total tweets classified: 75199
0.9739958491026737
Confusion matrix:
[[12462   125]
 [  514 11967]]
Total tweets classified: 75199
0.952092127371829
Confusion matrix:
[[12283   304]
 [  865 11616]]


In [54]:
#chosen model
import pickle

ngram=(2, 4)
count_vectorizer = CountVectorizer(ngram_range=ngram)
count_vectorizer.fit(df.content)

classifier,count_vectorizer, score, confusion = classify(training_tweets, test_tweets, count_vectorizer)

print ('Total tweets classified: ' + str(len(training_tweets)))
print (score)
print ('Confusion matrix:')
print(confusion)

import bz2
import pickle
import _pickle as cPickle

def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f:
        cPickle.dump(data, f)

def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = cPickle.load(data)
    return data

# saving tokenizer
with open('../data/neural_network_config/ngram_vectorizer.pickle', 'wb') as handle:
    pickle.dump(count_vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

compressed_pickle("../data/neural_network_config/ngram_vectorized_compressed",count_vectorizer)

# saving classifier
with open('../data/neural_network_config/classifier_naive_bayes.pickle', 'wb') as handle:
    pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

compressed_pickle("../data/neural_network_config/classifier_naive_bayes_compressed",classifier)

Total tweets classified: 75199
0.9853498862320865
Confusion matrix:
[[12359   228]
 [  139 12342]]


In [57]:
#test importation
#with open('../data/neural_network_config/ngram_vectorizer.pickle', 'rb') as handle:
#    vectorizer = pickle.load(handle)
vectorizer = decompress_pickle('../data/neural_network_config/ngram_vectorized_compressed.pbz2')

#test importation
#with open('../data/neural_network_config/classifier_naive_bayes.pickle', 'rb') as handle:
#    classifier = pickle.load(handle)
classifier = decompress_pickle('../data/neural_network_config/classifier_naive_bayes_compressed.pbz2')

positive_test = ["Lo mejor que he visto jamás",
                 "me encanta la tombola",
                 "la tombola es genial, me encanta",
                 "esto es precioso, muy bonito",
                 "me encanta, bello",
                 "lo amo mucho, una preciosidad"]

negative_test = ["¡Jamás voy a usar esta maldita aplicación!  No funciona para nada.",
                 "de desagrada profundamente",
                 "no me gusta",
                 "la tombola no es genial, no me gusta",
                 "esto no es precioso no es bonito",
                 "todo muy feo y desagradable",
                 "me perece muy triste lo que está ocurriendo"
                 "que tristeza, qué pena"]

for text in positive_test:
    print("----------------------------------")
    print(text)
    vals = vectorizer.transform([text])
    print(classifier.predict(vals))
    print(classifier.predict_proba(vals)[0][1])

print("\n\n")

for text in negative_test:
    print("----------------------------------")
    print(text)
    vals = vectorizer.transform([text])
    print(classifier.predict(vals))
    print(classifier.predict_proba(vals)[0][1])

----------------------------------
Lo mejor que he visto jamás
[1]
0.9994635197873718
----------------------------------
me encanta la tombola
[1]
0.9846317428608381
----------------------------------
la tombola es genial, me encanta
[1]
0.9998457027286963
----------------------------------
esto es precioso, muy bonito
[1]
0.9995487078481338
----------------------------------
me encanta, bello
[1]
0.8410007407510434
----------------------------------
lo amo mucho, una preciosidad
[1]
0.7049016527927292



----------------------------------
¡Jamás voy a usar esta maldita aplicación!  No funciona para nada.
[0]
0.00020565692900021603
----------------------------------
de desagrada profundamente
[0]
0.3332524917780581
----------------------------------
no me gusta
[0]
0.12260075940824233
----------------------------------
la tombola no es genial, no me gusta
[0]
0.4862394793527447
----------------------------------
esto no es precioso no es bonito
[1]
0.799363174294659
-------------------