<a href="https://colab.research.google.com/github/rodrigorenemenegazzo/rodrigorenemenegazzo/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text Mining**

"Text mining (also known as text analysis), is the process of transforming unstructured text into structured data for easy analysis. Text mining uses natural language processing (NLP), allowing machines to understand the human language and process it automatically." [Source](https://monkeylearn.com/text-mining/#:~:text=Text%20mining%20(also%20known%20as,language%20and%20process%20it%20automatically.)

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue May  3 19:47:22 2022

@author: Rodrigo Rene Menegazzo
"""
# enunciado exercicio
# https://www.tads.ufpr.br/pluginfile.php/18895/mod_resource/content/0/04_text_mining.pdf

In [20]:
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd 

# Load the book
df_train = pd.read_csv("/content/gdrive/MyDrive/Especializacao IA UFPR/IAAA016 - Topicos em IA/Harrypotter_livro.txt", 
  header=0, #contagem de linhas 
  delimiter="\t",  
  quoting=3)

print(df_train)
df_train.info()

print('Data Frame columns: ', df_train.columns.values)

In [None]:
#PreProcessing data(cleaning data and get the words of interest) and removing the stop words
# https://www.kaggle.com/code/blurredmachine/bag-of-words-meets-random-forest/notebook

from bs4 import BeautifulSoup
import re

#Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list

training_data_size = df_train['DADOS DE COPYRIGHT'].size
print(training_data_size)

#Funçao para limpar os dados e extrair apenas as palavras de interesse
def clean_text_data(data_point, data_size):
    #PreProcessing remove tags etc html code
    review_soup = BeautifulSoup(data_point)
    #Pega as palavras
    review_text = review_soup.get_text()
    review_letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #tranforma todas as palavras em letra minuscula
    review_lower_case = review_letters_only.lower()  
    review_words = review_lower_case.split() 
    #Mostra stopwords da lingua portuguesa
    stop_words = stopwords.words("portuguese")
    meaningful_words = [x for x in review_words if x not in stop_words]
    
    if( (i)%2000 == 0 ):
        print("Cleaned %d of %d data (%d %%)." % ( i, data_size, ((i)/data_size)*100))
        
    return( " ".join(meaningful_words))

#Cabeçalho dos dados de treinamento
df_train.head()

#Limpando os dados, removendo stop words em portugues
for i in range(training_data_size):
    df_train['DADOS DE COPYRIGHT'][i] = clean_text_data(df_train['DADOS DE COPYRIGHT'][i], training_data_size)
print("Cleaning completed!")

print(df_train)


The countVectorizer prints the identified Unique words along with their indices. The numbers do not represent the count of the words, but **the position of the words** in the matrix.


In [29]:
#Representação vetorial das setenças/palavras 
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

#Import countVectorizer and create
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

#Print the position of the words in the matrix
vectorizer.fit(df_train['DADOS DE COPYRIGHT']) 
print("Vocabulary: ", vectorizer.vocabulary_)


Vocabulary:  {'sobre': 11897, 'obra': 8983, 'presente': 10168, 'disponibilizada': 4394, 'equipe': 5198, 'le': 7752, 'livros': 7955, 'diversos': 4443, 'parceiros': 9369, 'objetivo': 8976, 'oferecer': 9101, 'conte': 3118, 'uso': 12929, 'parcial': 9371, 'pesquisas': 9782, 'estudos': 5667, 'acad': 129, 'micos': 8435, 'bem': 1784, 'simples': 11838, 'teste': 12424, 'qualidade': 10503, 'fim': 6144, 'exclusivo': 5775, 'compra': 2809, 'futura': 6432, 'expressamente': 5868, 'proibida': 10320, 'totalmente': 12621, 'repud': 11035, 'avel': 1563, 'venda': 13069, 'aluguel': 685, 'quaisquer': 10502, 'comercial': 2738, 'disponibilizam': 4395, 'dominio': 4517, 'publico': 10435, 'propriedade': 10358, 'intelectual': 7409, 'forma': 6263, 'gratuita': 6699, 'acreditar': 295, 'conhecimento': 2996, 'educa': 4606, 'devem': 4241, 'ser': 11751, 'acess': 184, 'veis': 13034, 'livres': 7953, 'toda': 12509, 'qualquer': 10508, 'pessoa': 9783, 'voc': 13319, 'pode': 9906, 'encontrar': 4853, 'obras': 8984, 'site': 11870,

The numbers in bracket are the index of the value in the matrix (row, column) and 1 is the value(The number of times a term appeared in the document represented by the row of the matrix). For example, the two first rows represents the row 0, the next 8 brackets, represents the row 1.

In [None]:
#Count words and others things
# https://investigate.ai/text-analysis/counting-words-with-scikit-learns-countvectorizer/

count_words = vectorizer.fit_transform(df_train['DADOS DE COPYRIGHT'])
print("Contagem das palavras: \n", count_words)

#Nome de todas as palavras armazenadas
print("Todas as palavras armazenadas: \n", vectorizer.get_feature_names_out())

In [36]:
# Encode the Document
vector = vectorizer.transform(df_train['DADOS DE COPYRIGHT'])

# Summarizing the Encoded Texts
print("Encoded Document is:")
vector_array = vector.toarray()
print(vector_array)
print("Shape: ", vector_array.shape)

Encoded Document is:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape:  (14332, 13427)


In [None]:
# https://www.askpython.com/python/examples/plot-k-means-clusters-python
#Importing required modules
 
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
 
#Load Data
data = load_digits().data
pca = PCA(2)
 
#Transform the data
df = pca.fit_transform(data)
 
df.shape

In [40]:
# Regras de associação, clusterização com k-means
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

#Clustering
# https://stackoverflow.com/questions/54939424/plotting-vectorized-text-documents-in-matplotlib

from sklearn.cluster import KMeans
km = KMeans(
    n_clusters=3, 
    init='k-means++', 
    max_iter=500)

label = kmeans.fit_predict(vector_array)
 
print(label)
#km.fit(vector_array)

In [43]:
type(vector_array)

numpy.ndarray

In [42]:
#Plotting K-means clusters
import numpy as np
import matplotlib.pyplot as plt

u_labels = np.unique(label)
 
#plotting the results:
for i in u_labels:
    plt.scatter(df[label == i , 0] , df[label == i , 1] , label = i)
plt.legend()
plt.show()

NameError: ignored

In [None]:
#Dimensionality Reduction with PCA then plot the two most important principle components (the first two).
from sklearn.decomposition import PCA

# First: for every document we get its corresponding cluster
clusters = km.predict(vector_array)

# We train the PCA on the dense version of the tf-idf. 
pca = PCA(n_components=2)
two_dim = pca.fit_transform(vector_array.todense())

scatter_x = two_dim[:, 0] # first principle component
scatter_y = two_dim[:, 1] # second principle component













#----------------------
#Complex way to count words
matrix = vectorizer.fit_transform(df_train['DADOS DE COPYRIGHT'])
counts = pd.DataFrame(matrix.toarray(),
                      columns=vectorizer.get_feature_names())

# Show us the top 10 most common words
counts.T.sort_values(by=0, ascending=False).head(10)


# (db.scam?)


# regras de associação (clusterização com k-means?)
# http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

# contagem de frequência das palavras, manual. 


#visualize os vetores com a pca.



