In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import homogeneity_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
import re


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

## Base de dados

Utilizei um corpus de textos com análises de filmes do conhecido site IMDB. Nele estão presentes a coluna com a análise (`review`) e a coluna de sentimento (`sentiment`), que sintetiza a opinião do crítico sobre o filme assistido. Neste caso, vamos usar NLP e Machine Learning para tentar extrapolar, para novas análises de filmes, se a avaliação foi positiva ou negativa.

In [None]:
data_raw = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
data_raw.head()

In [None]:
data_raw.info()

In [None]:
data_raw["review"][0] #amostra do dataset

In [None]:
data_raw["sentiment"][0] #amostra do dataset

In [None]:
data_raw["review"].groupby(data_raw["sentiment"]).count()

## BOW (Bag of Words)

Como posso fazer um computador entender textos?

Primeiramente, temos que converter os textos em algum tipo de representação numérica que os algoritmos de Machine Learning consigam processar. Uma abordagem tradicional é a técnica Bag of Words, que consiste em usar o vocabulário de todos os documentos analisados (o corpus) e quebrar os textos a ponto de lidar apenas com a frequência das palavras. Como diz literalmente o nome da técnica traduzido para o português, essas informações viram um "saco de palavras": leva-se em conta apenas a frequência com que cada palavra do vocabulário é utilizada. Segue um exemplo com duas sentenças:

Frase 1: "O gato agarrou o cachorro"

Frase 2: "O dono agarrou o cachorro e o gato"

Para as duas sentenças, o vocabulário é:

{ O, GATO, AGARROU, CACHORRO, DONO, E}

Para conseguir o "saco de palavras", contamos o número de vezes que cada palavra ocorre em cada frase. Na frase 1, "O" aparece duas vezes e as palavras "GATO", "AGARROU" e "CACHORRO" aparecem uma vez cada; então o novo registro numérico da frase 1 fica:

Vocab = { O, GATO, AGARROU, CACHORRO, DONO, E}

Frase 1: { 2, 1, 1, 1, 0, 0}

Frase 2: { 3, 1, 1, 1, 1, 1}

Para não aumentar demais a dimensionalidade do vetor de colunas e arriscar a performance do modelo, vamos escolher um tamanho máximo para o vocabulário. Abaixo, utilizaremos as 5000 palavras mais frequentes presentes no corpus (lembrando que retiramos as stop words).

## Pré-processamento dos dados

### Tirar as Tags de HTML na base de dados

In [None]:
exemplo = BeautifulSoup(data_raw['review'][0], "lxml" )
print (data_raw['review'][0])
print('')
print (exemplo.get_text())

### Retirar os números, pontuação e Caracteres Especiais da base de dados

In [None]:
print(exemplo.get_text())
print('')
exemplo =re.sub("[^a-zA-Z]"," ",exemplo.get_text())
print(exemplo)

### Padronizar o corpus em letras minúsculas 

In [None]:
print(exemplo)
print('')
exemplo = exemplo.lower()
print(exemplo)

### Aplicar a Tokenização no corpus

In [None]:
print(exemplo)
print('')
exemplo = exemplo.split()
print(exemplo)

### Aplicar a técnica de Stemming (transformar as palavras em suas respectivas formas raízes ou primitivas)

In [None]:
def Stemming(sentence):
    """Stem every token of an already tokenized sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to stem; each one is lower-cased before being stemmed.

    Returns
    -------
    list of str
        The stemmed tokens, in the same order as the input.
    """
    snowball = SnowballStemmer("english")
    return [snowball.stem(token.lower()) for token in sentence]

In [None]:
print(exemplo)
print('')
print(Stemming(exemplo))

### Função que integra todas as transformações necessárias para o corpus

In [None]:
def review_format(raw_review):
    """Normalize one raw review into a string of space-separated stemmed tokens.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, stem each token with ``Stemming`` and join
    the tokens back together with single spaces.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned review.
    """
    soup = BeautifulSoup(raw_review, "lxml")
    # Join text nodes with a space so that words separated only by a tag
    # (e.g. "end<br />start") are not fused into a single token.
    text = soup.get_text(" ")
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()
    return ' '.join(Stemming(tokens))

In [None]:
exemplo = data_raw['review'][0]
print(exemplo)

In [None]:
%time exemplo_clean = review_format(exemplo)
print('')
print(exemplo_clean)

In [None]:
num_reviews = data_raw['review'].size
print (num_reviews)

In [None]:
# Apply the full cleaning pipeline to every record of the "review" column.
# Iterating over the column directly avoids one label-based lookup per row
# and does not rely on the DataFrame having a default 0..n-1 index.
clean_data_review = [review_format(raw_review) for raw_review in data_raw['review']]

In [None]:
data_y = data_raw['sentiment']

In [None]:
clean_data_review = np.array(clean_data_review)

In [None]:
# Split the data into training and validation sets.
# Only 20% of the dataset is used for training (test_size=0.80); the original
# note relates this choice to RAM overflow restarting the Kaggle VM.
X_train, X_test, y_train, y_test = train_test_split(clean_data_review, data_y, test_size=0.80, random_state=42)

## Clusterização

### Aplicação da técnica TF-IDF de NLP

![](https://plumbr.io/app/uploads/2016/06/tf-idf.png)

TF-IDF é a abreviação do termo em inglês *term frequency–inverse document frequency*, que significa frequência do termo–inverso da frequência nos documentos. É um índice que mede a importância de uma palavra para um documento em relação ao corpus: o valor do **TF-IDF** aumenta quanto mais vezes a palavra ocorre no documento e diminui quanto mais documentos do corpus contêm essa palavra. Aplicamos essa técnica em NLP porque é um modo eficiente de modelar problemas de linguagem natural em computadores: lidamos com a frequência de palavras-chave para tomar decisões, e o índice também informa se a palavra é comum ou rara no corpus.

In [None]:
vec = TfidfVectorizer(stop_words="english")
%time vec.fit(X_train)
features = vec.transform(X_train)

### Kmeans

In [None]:
cls = MiniBatchKMeans(n_clusters=2, random_state=42)
cls.fit(features)

In [None]:
cls.predict(features)

In [None]:
# Reduce the feature space to 2 dimensions so the clusters can be plotted.
# NOTE(review): features.toarray() materializes the TF-IDF matrix as a dense
# (documents x vocabulary) array, which may be memory-heavy -- confirm it fits in RAM.
pca = PCA(n_components=2, random_state=42)
reduced_features = pca.fit_transform(features.toarray())

# Project the k-means centroids with the same fitted PCA so they share the plot's axes.
reduced_cluster_centers = pca.transform(cls.cluster_centers_)

In [None]:
# Scatter plot of the k-means clustering in the 2D PCA space, colored by cluster.
fig, ax = plt.subplots()
ax.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cls.predict(features))
# Cluster centroids, drawn as blue crosses on top of the points.
ax.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:, 1], marker='x', s=150, c='b')
# Title and axis labels so the figure can be read on its own; the trailing
# semicolon suppresses the repr of the returned list in the cell output.
ax.set(title="Clusters do K-means (projeção PCA em 2D)", xlabel="Componente principal 1", ylabel="Componente principal 2");

O score de homogeneidade (*homogeneity*) mede o quanto cada cluster contém apenas membros de uma mesma classe. É medido em uma escala de 0 a 1: quanto mais perto de 1, mais homogêneos são os clusters.

In [None]:
homogeneity_score(y_train, cls.predict(features))

O silhouette_score utiliza a distância entre pontos para medir o quanto um objeto é similar ao cluster em que está (coesão) em comparação com os outros clusters existentes (separação). A escala de medição vai de -1 a 1: quanto mais perto de 1, mais o objeto é similar ao seu próprio cluster e menos similar aos demais.

In [None]:
silhouette_score(features, labels=cls.predict(features))

In [None]:
features_test = vec.transform(X_test)
homogeneity_score(y_test, cls.predict(features_test))

In [None]:
silhouette_score(features_test, labels=cls.predict(features_test))

In [None]:
def predict_coment(word):
    """Predict the k-means cluster of a single raw review.

    The text is cleaned with ``review_format``, vectorized with the already
    fitted ``vec`` and assigned to a cluster by the already fitted ``cls``.

    Parameters
    ----------
    word : str
        Raw review text (despite the name, a whole review, not a single word).

    Returns
    -------
    tuple
        ``(cluster_id, word)``: the predicted cluster index and the original,
        unmodified input text. The cluster index is not itself a sentiment label.
    """
    cleaned = [review_format(word)]
    # The sparse TF-IDF row is passed straight to predict(); densifying it
    # with toarray() first is unnecessary.
    sample_final = vec.transform(cleaned)
    return cls.predict(sample_final)[0], word

### Extrapolação: review do filme Parasita, ganhador do Oscar de 2020 de melhor filme.

[Link review](https://www.imdb.com/review/rw5362398/?ref_=tt_urv)
https://www.imdb.com/review/rw0980757/?ref_=tt_urv

In [None]:
review1 = "Parasite was directed and written by Bong Joon Ho and tells the story of the Kim family and their life-changing involvement with the Park family. Parasite can best be described as astonishing, astounding, stunning or any other synonym of amazing. It is so far my favourite film of the year and one of my favourite films of the decade. The very idea of the plot is simple but incredibly hard to execute and that's, why it's editing, is pitch-perfect and leads to an unexpectedly shocking and brutal ending. The biggest forte of Parasite is the screenplay as it impeccably mixes comedy, drama and horror featuring flawless pacing, breath-taking cinematography, a beautiful score and a brilliant cast making it a masterpiece. Parasite also perfectly presents the subject of classism, showing us how both the working class and the upper class view each other and the people around them. Themes of capitalism can also be felt throughout the film, but Parasite shouldn't be mistaken as a pro-capitalism film as it doesn't support or hate anyone or any side; it's ambiguity also contributes to this factor. Taking everything into account, Parasite is a true work of art and a rare and extraordinary masterpiece that should be viewed by everyone at some point in their lives, especially film lovers."

In [None]:
%time predict_coment(review1)

### Extrapolação: review do filme Cidade de Deus.

[Link review](https://www.imdb.com/review/rw0980757/?ref_=tt_urv)


In [None]:
review2 = "Cidade de Deus seems to have a lot of praise on the IMDb boards, and with good reason too. It simply is, in my opinion, one of the best contemporary films ever made. Based on true events and characters who live in the overlooked and poverty stricken slums in the shadows of Rio de Janiero, where life expectancy doesn't reach the 30's and drug dealers are kings. The tale of the City of God, and its myriad of characters is told by Rocket, a young man who struggles to make something of his life, other than to wind up another victim of drugs or gang wars. Not only are the characters in City of God absolutely fascinating, and also very endearing, but also convincingly acted by groups of young and unknown actors. The stoies are well-told, and at times, funny, and at others, brutally shocking. The cinematic style of the film gives a nod to Tarantino, with some clever time-jumping, freeze-framing, and texts indicating another chapter of the film. In every sense, a bit of a Brazillian Pulp Fiction or Goodfellas, but with its own unique flavour to it. The City of God is a marvel, and a highly recommended film to watch, but not recommended for the over-sensitive or easily distressed."

In [None]:
predict_coment(review2)

### Extrapolação: review do filme Cats (2019).

[Link review](https://www.imdb.com/review/rw5342483/?ref_=tt_urv)


In [None]:
review3 = "This movie reminds me of that scene from Jurassic Park where Jeff Goldblum says You were so preoccupied with whether or not you could, you never bothered to ask if you should. This was hands down the most disturbingly awful movie I have ever seen. Whoever greenlit this should never be in charge of the light ever again. How dare they do this to me?!? Please don't go see this movie. And if you do, may God have mercy on your soul."

In [None]:
predict_coment(review3)

### Extrapolação: review do filme Transformers: The Last Knight (2017).

[Link review](https://www.imdb.com/review/rw3760494/?ref_=tt_urv)

In [None]:
review4 = "This movie is like a big story put up in one paragraph with no punctuation marks whatsoever. Usually Transformers builds up a simple story that explodes into one big final battle, filled with special effects and epic fights between autobots and decepticons; pretty entertaining to watch, especially if you're into big explosions- action-robots-movies and usually it works. The last knight is different from the beginning, it introduces way more story lines than it should, it tries to make a suspenseful plot with so many resources it just drags into a story that's full of holes and patches that it seems you're watching a lot of trailers from different movies with no connection between each other. The movie forces a lot of secondary roles (between the old and new characters) that it becomes boring and confusing at the same time. The main story becomes clear in the last quarter of the movie, just in time for the final battleby this point you're in fully overdrive mode trying to catch up with everything that's happened that it's barely enjoyable. Personally I think that the worst part is the lack of continuity, they jump from one scene to another in less than 30 seconds. This dynamicity makes it impossible to follow the main plot or even one story line. There are so many details and jumps between stories that it exhausting trying to connect the dots. I don't recommend it at all, not worth the money or time. Maybe if you're really curious or a big fan into the saga you may find it entertaining at some point, my only suggestion: lower your expectations, cause this is, by far, the worst Transformers movie."

In [None]:
predict_coment(review4)

In [None]:
y_test.value_counts()

In [None]:
# k-means cluster ids are arbitrary, so a hard-coded "0 -> positive, 1 -> negative"
# mapping is not guaranteed to match the data. Derive the mapping from the training
# set instead: each cluster gets the sentiment that is most frequent among its members.
train_clusters = cls.predict(features)
cluster_to_sentiment = (
    pd.crosstab(train_clusters, np.asarray(y_train))  # rows: cluster id, columns: sentiment
    .idxmax(axis=1)                                   # majority sentiment per cluster
    .to_dict()
)
# Kept as a single-column DataFrame, like the original `pred`, for the accuracy cell below.
pred = pd.Series(cls.predict(features_test)).map(cluster_to_sentiment).to_frame()
pred.head()

In [None]:
accuracy_score(y_test, pred)