In [None]:
#Analise de Sentimento de revisão de filmes

In [1]:
#Sentiment Analysis with Scikit-Learn
#1 - Importing Libraries
#2 - Importing The dataset
#3 - Text Preprocessing
#4 - Converting Text to Numbers
#5 - Training and Test Sets
#6 - Training Text Classification Model and Predicting Sentiment
#7 - Evaluating The Model
#8 - Saving and Loading the Model

In [2]:
#1 - Importing Libraries
import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files 
import pickle  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
#2 - Importing the dataset
#The load_files function automatically divides the dataset into data and target sets. 
#The load_files will treat each folder inside the "txtsentoken" folder as one category and all 
#the documents inside that folder will be assigned its corresponding category.
movie_data = load_files(r"C:\Users\Rober\workSpace_with_python\txt_sentoken")  #tem que passar o endereço completo
x, y = movie_data.data, movie_data.target  
#x é uma lista de 2000 strings e y é um numpy array de 0's e 1's

In [7]:
#3 - Text Processing
documents = []
stemmer = WordNetLemmatizer()

for sen in range(0, len(x)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(x[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

In [11]:
#4 - Converting Text to Number
        #utiliza o modelo de Bag of Words para converter texto em numero
from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(documents).toarray()   #converting text to number

In [12]:
#No script acima, quando chamo a função CountVectorizer e passo alguns parametro
#max_features = só considera os primeiros 1500 termos ordenados pela sua frequência do termo no corpus
#min_df = ignora os termos que não aparecem em pelo menos 5 documentos
#max_df = só inclui os termos que ocorrem em mais de 70% de todos os documentos

In [None]:
#The bag of words approach works fine for converting text to numbers. However, it has one drawback. 
#It assigns a score to a word based on its occurrence in a particular document. 
#It doesn't take into account the fact that the word might also be having a high frequency of occurrence in other documents 
#as well. TFIDF resolves this issue by multiplying the term frequency of a word by the inverse document frequency. 
#The TF stands for "Term Frequency" while IDF stands for "Inverse Document Frequency". 
from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray()  

In [19]:
#5 - Training and Test Sets
from sklearn.model_selection import train_test_split  
#this script divides data into 20% test set and 80% training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [20]:
#6 - Training Text Classification Model and Predicting Sentiment
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test) 

In [34]:
#7 - Evaluating The Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: ', format(accuracy_score(y_test, y_pred)))
print('Precision score: ', format(precision_score(y_test, y_pred)))
print('Recall score: ', format(recall_score(y_test, y_pred)))
print('F1 score: ', format(f1_score(y_test, y_pred)))

Accuracy score:  0.855
Precision score:  0.8526315789473684
Recall score:  0.84375
F1 score:  0.8481675392670157


In [35]:
#8 - Saving and Loading the Model
with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)
with open('text_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

y_pred2 = model.predict(X_test)

print('Accuracy score: ', format(accuracy_score(y_test, y_pred2)))
print('Precision score: ', format(precision_score(y_test, y_pred2)))
print('Recall score: ', format(recall_score(y_test, y_pred2)))
print('F1 score: ', format(f1_score(y_test, y_pred2)))


Accuracy score:  0.855
Precision score:  0.8526315789473684
Recall score:  0.84375
F1 score:  0.8481675392670157


In [37]:
#Aplicando o algoritmos de naive bayes
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
predictions = naive_bayes.predict(X_test)

print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.795
Precision score:  0.7669902912621359
Recall score:  0.8229166666666666
F1 score:  0.7939698492462312


In [43]:
#Aplicando o algoritmo knn
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=100)
neigh.fit(X_train, y_train)
y_pred3 = neigh.predict(X_test)

print('Accuracy score: ', format(accuracy_score(y_test, y_pred3)))
print('Precision score: ', format(precision_score(y_test, y_pred3)))
print('Recall score: ', format(recall_score(y_test, y_pred3)))
print('F1 score: ', format(f1_score(y_test, y_pred3)))

Accuracy score:  0.735
Precision score:  0.715
Recall score:  0.7447916666666666
F1 score:  0.7295918367346939


In [45]:
#Aplicando o algoritmo k-means
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train, y_train)
y_pred4 = kmeans.predict(X_test)

print('Accuracy score: ', format(accuracy_score(y_test, y_pred4)))
print('Precision score: ', format(precision_score(y_test, y_pred4)))
print('Recall score: ', format(recall_score(y_test, y_pred4)))
print('F1 score: ', format(f1_score(y_test, y_pred4)))

Accuracy score:  0.595
Precision score:  0.5609756097560976
Recall score:  0.71875
F1 score:  0.6301369863013699
