## Load data and split into train and test sets

In [57]:
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# Read the dataset from clean_data.csv in the notebook's working directory.
df = pd.read_csv('./clean_data.csv')
# Show (n_rows, n_columns) as a quick sanity check of what was loaded.
df.shape

(15863, 2)

In [25]:
# Imported in this cell because the split runs before the notebook's later
# import cell; without it a fresh "Restart & Run All" fails here with NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)
print(train.shape)
print(test.shape)

(12690, 2)
(3173, 2)


## Transform input

In [None]:
# Imports for splitting the data, counting tokens, tokenizing text and
# encoding features/labels.
from sklearn.model_selection import train_test_split
from collections import Counter
import nltk
# Fetch the "punkt" data package; nltk.word_tokenize is used in the cells below.
nltk.download('punkt')
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

In [47]:
# One bag-of-words per training document: tokenize the text with NLTK and
# count how many times each token occurs.
BOW_train = [Counter(nltk.word_tokenize(doc)) for doc in train['Text']]

In [48]:
# Learn the token -> column mapping from the training bags-of-words, then
# encode those same bags as the training feature matrix.
input_encoder = DictVectorizer()
input_encoder.fit(BOW_train)
X = input_encoder.transform(BOW_train)

In [49]:
# Learn the set of sentiment labels from the training split, then map each
# training label to its integer code.
output_encoder = LabelEncoder()
output_encoder.fit(train['Sentiment'])
y = output_encoder.transform(train['Sentiment'])

In [51]:
# One bag-of-words per test document, built the same way as for the training
# split: tokenize with NLTK and count token occurrences.
BOW_test = [Counter(nltk.word_tokenize(doc)) for doc in test['Text']]

In [54]:
# Encode the test bags-of-words with the vectorizer fitted on the training
# data (transform only, no refit), so columns line up with X.
X_test = input_encoder.transform(BOW_test)

In [55]:
# Map the test labels to integer codes using the encoder fitted on the
# training labels (transform only, no refit).
y_test = output_encoder.transform(test['Sentiment'])

## Build multinomial NB model

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [50]:
# Train a multinomial Naive Bayes classifier with default hyperparameters on
# the encoded training features and labels.
classifier = MultinomialNB()
# fit() returns the estimator, so its repr is displayed as the cell output.
classifier.fit(X, y) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Test!

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [56]:
# Predict an encoded sentiment label for every row of the test feature matrix.
y_pred = classifier.predict(X_test)

In [59]:
# Accuracy takes no averaging argument, so it is reported on its own.
print("Exactitud:", accuracy_score(y_test, y_pred))

# Precision, recall and F1 are all macro-averaged over the classes; report
# them in a fixed order with the same labels.
for label, metric in (
    ("Precision:", precision_score),
    ("Exhaustividad:", recall_score),
    ("F1 score:", f1_score),
):
    print(label, metric(y_test, y_pred, average="macro"))

Exactitud: 0.7607942010715412
Precision: 0.7655589794260738
Exhaustividad: 0.7535353535353535
F1 score: 0.7551439896173138


## Export model


In [62]:
import pickle

In [61]:
# Serialize the trained classifier to disk.
# Note: only the classifier is written here; input_encoder and output_encoder
# are not saved by this cell.
with open('sentiment_classifier.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

## Reloading model

In [63]:
# Read the classifier back from the pickle file written by the export cell.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('sentiment_classifier.pkl', 'rb') as model_file:
    my_sentiment_classifier = pickle.load(model_file)

In [101]:
# Display the test row at position 4 (text and true label) as a spot check.
test.iloc[4]

Text         thank another solid plan return
Sentiment                                  1
Name: 1994, dtype: object

In [102]:
# Predict the encoded label for row 4 of X_test with the reloaded model.
# X_test was built by iterating test['Text'] in order, so this row
# corresponds to test.iloc[4].
my_sentiment_classifier.predict(X_test[4])

array([1])