<a href="https://colab.research.google.com/github/rodrigorenemenegazzo/rodrigorenemenegazzo/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text Mining**

"Text mining (also known as text analysis), is the process of transforming unstructured text into structured data for easy analysis. Text mining uses natural language processing (NLP), allowing machines to understand the human language and process it automatically." [Source](https://monkeylearn.com/text-mining/#:~:text=Text%20mining%20%28also%20known%20as,language%20and%20process%20it%20automatically.)

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue May  3 19:47:22 2022

@author: Rodrigo
"""
# enunciado exercicio
# https://www.tads.ufpr.br/pluginfile.php/18895/mod_resource/content/0/04_text_mining.pdf

import pandas as pd     
import numpy as np
from bs4 import BeautifulSoup
import re

#Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list

#Import countVectorizer and create
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

#Random Forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Load the book. Fields are tab-separated and the first line of the file is
# used as the column header.
df_train = pd.read_csv(
    "Harrypotter_livro.txt",
    sep="\t",
    header=0,   # row 0 supplies the column names
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as plain text
)

# Quick inspection of what was loaded.
print(df_train.shape)
df_train.info()

print(df_train.columns.values)
print(df_train)

# Pre-processing for all of the training data
# https://www.kaggle.com/code/blurredmachine/bag-of-words-meets-random-forest/notebook

# Number of text rows that will be cleaned.
training_data_size = len(df_train['DADOS DE COPYRIGHT'])
print(training_data_size)

# Portuguese stop words, loaded once and kept in a set so the per-word
# membership test is O(1) instead of re-reading the list on every call.
_PORTUGUESE_STOP_WORDS = frozenset(stopwords.words("portuguese"))


# Cleans one row of text and keeps only the words of interest.
def clean_text_data(data_point, data_size, index=None):
    """Strip markup, non-letters and Portuguese stop words from one text row.

    Args:
        data_point: Raw text of a single row; may contain HTML markup.
            Non-text values (e.g. a missing value) are treated as empty.
        data_size: Total number of rows; used only in the progress message.
        index: Zero-based position of this row. When given, a progress line
            is printed every 2000 rows. When omitted, nothing is printed
            (the function no longer reads a global loop counter).

    Returns:
        The remaining lower-cased words joined by single spaces.
    """
    if not isinstance(data_point, (str, bytes)):
        return ""

    # Name the parser explicitly; otherwise BeautifulSoup guesses one from
    # whatever is installed and emits a warning.
    review_text = BeautifulSoup(data_point, "html.parser").get_text()

    # Replace every run of non-letters with a space. \W is Unicode-aware for
    # str patterns, so accented Portuguese letters are kept; the previous
    # ASCII-only class split words such as "não" into "n o".
    review_words = re.sub(r"[\W\d_]+", " ", review_text).lower().split()

    meaningful_words = [
        word for word in review_words if word not in _PORTUGUESE_STOP_WORDS
    ]

    if index is not None and index % 2000 == 0:
        percent = (index / data_size) * 100 if data_size else 0
        print("Cleaned %d of %d data (%d %%)." % (index, data_size, percent))

    return " ".join(meaningful_words)


# First rows of the training data
df_train.head()

# Clean every row, removing Portuguese stop words. The whole column is
# rebuilt and assigned in one step: element-wise chained assignment
# (df[col][i] = ...) raises SettingWithCopyWarning and does not update the
# frame when pandas copy-on-write is enabled.
df_train['DADOS DE COPYRIGHT'] = [
    clean_text_data(text, training_data_size, index=row_number)
    for row_number, text in enumerate(df_train['DADOS DE COPYRIGHT'])
]
print("Cleaning completed!")
print(df_train)

# Vector (bag-of-words) representation of the sentences/words
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# https://investigate.ai/text-analysis/counting-words-with-scikit-learns-countvectorizer/
# Learn the vocabulary and build the document-term count matrix in a single
# pass (a separate fit() followed by fit_transform() fitted the data twice).
count_words = vectorizer.fit_transform(df_train['DADOS DE COPYRIGHT'])

# Print the unique words along with their indices.
# The numbers are not word counts but the column position of each word in
# the matrix.
print("Vocabulary: ", vectorizer.vocabulary_)

# Word counts per document
print("\n The numbers in bracket are the index of the value in the matrix (row, column) and 1 is the value(The number of times a term appeared in the document represented by the row of the matrix) \n", 
      "\n Contagem das palavras: \n", count_words)

# Names of all stored words. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)


# Encoded documents, to be used as input for a learning algorithm. The
# vocabulary was fitted on this same column, so the matrix built above is
# already the encoding; there is no need to transform the data again.
vector = count_words
# Summarize the encoded texts
print("Encoded Document is:")
vector_array = vector.toarray()
print(vector_array)
print(vector_array.shape)

# Association rules, clustering with k-means
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
# Clustering
# https://stackoverflow.com/questions/54939424/plotting-vectorized-text-documents-in-matplotlib
from sklearn.cluster import KMeans
km = KMeans(
    n_clusters=3, 
    init='k-means++', 
    max_iter=500)
km.fit(vector_array)

# Dimensionality reduction with PCA, keeping the two most important
# principal components (the first two) so the documents can be plotted.
from sklearn.decomposition import PCA

# First: for every document we get its corresponding cluster
clusters = km.predict(vector_array)

# Fit the PCA on the dense word-count matrix. vector_array is already a
# dense NumPy array (it comes from .toarray()), and ndarrays have no
# .todense() method, so it is passed as is.
pca = PCA(n_components=2)
two_dim = pca.fit_transform(vector_array)

scatter_x = two_dim[:, 0] # first principal component
scatter_y = two_dim[:, 1] # second principal component













#----------------------
# More involved way to count words: a dense DataFrame with one row per
# document and one column per vocabulary word.
matrix = vectorizer.fit_transform(df_train['DADOS DE COPYRIGHT'])

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    word_columns = vectorizer.get_feature_names_out()
else:
    word_columns = vectorizer.get_feature_names()

counts = pd.DataFrame(matrix.toarray(),
                      columns=word_columns)

# Show the top 10 most common words. Counts are summed over every row;
# sorting the transposed frame by=0 ranked words by the first row only.
top_words = counts.sum(axis=0).sort_values(ascending=False).head(10)
# Printed explicitly so the result is also visible when run as a script.
print(top_words)


# (db.scam?)


# TODO: association rules (or clustering with k-means?)
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

# TODO: count word frequencies manually.


# TODO: visualize the vectors with PCA.



