# **CLASSIFICAÇÃO DE PRODUTOS DE UM E-COMMERCE POR CATEGORIA**


**OBJETIVO:**  


1. Utilizar etapas do método CRISP
2. Realizar pré-processamento de todas as features do Dataset fornecido
3. Desenvolver uma solução de machine learning que traga valor o cliente;
4. Criar um algoritmo capaz de classificar os produtos em diferentes categorias, baseado em dados descritivos de produtos digitais (no caso 5 categorias).

---


**DATASET USADO:** products_dataset.csv

# INSTALANDO REQUIREMENTS
---

In [1]:
#!pip install -r "C:\Projeto_Final_MineracaoDados\requirements.txt"

# BIBLIOTECAS A SEREM INSTALADAS
---

In [2]:
!pip install unidecode



# AQUISIÇÃO DOS DADOS
---

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(r"\Users\Rodrigo Mendes\Desktop\products_dataset.csv")

# EXPLORAÇÃO DOS DADOS
---

In [5]:
df.head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38507 entries, 0 to 38506
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         38507 non-null  int64  
 1   seller_id          38507 non-null  int64  
 2   query              38507 non-null  object 
 3   search_page        38507 non-null  int64  
 4   position           38507 non-null  int64  
 5   title              38507 non-null  object 
 6   concatenated_tags  38505 non-null  object 
 7   creation_date      38507 non-null  object 
 8   price              38507 non-null  float64
 9   weight             38449 non-null  float64
 10  express_delivery   38507 non-null  int64  
 11  minimum_quantity   38507 non-null  int64  
 12  view_counts        38507 non-null  int64  
 13  order_counts       18117 non-null  float64
 14  category           38507 non-null  object 
dtypes: float64(3), int64(7), object(5)
memory usage: 4.4+ MB


In [7]:
df.category.unique()

array(['Decoração', 'Papel e Cia', 'Outros', 'Bebê', 'Lembrancinhas',
       'Bijuterias e Jóias'], dtype=object)

In [8]:
df.shape

(38507, 15)

In [9]:
#PRÉ-PROCESSAMENTO DOS DADOS
---

SyntaxError: invalid syntax (<ipython-input-9-fad4fb920e28>, line 2)

## Transformar letras em minúsculas

In [10]:
df['query'] = df['query'].str.lower()
df['title'] = df['title'].str.lower()
df['concatenated_tags'] = df['concatenated_tags'].str.lower()

## Remoção de Acentos

In [11]:
import unidecode

df['title'] = df['title'].apply(lambda x: unidecode.unidecode(x))
df['query'] = df['query'].apply(lambda x: unidecode.unidecode(x))
df['concatenated_tags'] = df['concatenated_tags'].apply(lambda x: unidecode.unidecode(str(x)))

## Remoção de Símbolos

In [12]:
import re
#'s	(	)	,	?	[	]
df['title'] = df['title'].apply(lambda x: re.sub('\|\?|\.|\!|\/|\;|\:|\+|\-|\'|\(|\)|\,|\[|\]|\?|\_', '', x))
df['query'] = df['query'].apply(lambda x: re.sub('\|\?|\.|\!|\/|\;|\:|\+|\-|\'|\(|\)|\,|\[|\]|\?|\_', '', x))
df['concatenated_tags'] = df['concatenated_tags'].apply(lambda x: re.sub('\|\?|\.|\!|\/|\;|\:|\+|\-|\'|\(|\)|\,|\[|\]|\?|\_', '', str(x)))

## Remoção de Números

In [13]:
df['title'] = df['title'].apply(lambda x: re.sub(r'[0-9]+', '', x))
df['query'] = df['query'].apply(lambda x: re.sub(r'[0-9]+', '', x))
df['concatenated_tags'] = df['concatenated_tags'].apply(lambda x: re.sub(r'[0-9]+', '', str(x)))

## Remoção palavras de ligação (Stopwords)

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Rodrigo
[nltk_data]     Mendes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rodrigo
[nltk_data]     Mendes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def RemovePalavrasLigacao (texto):

  """Esta função realiza a remoção das palavras de ligação (Stopwords) de Texto (String)"""

  palavras_ligacao = set(stopwords.words('portuguese'))   
  texto_tokenizado = word_tokenize(texto) 
    
  texto_filtrado = [w for w in texto_tokenizado if not w in palavras_ligacao]   
  texto_filtrado = [] 
    
  for w in texto_tokenizado: 
      if w not in palavras_ligacao: 
          texto_filtrado.append(w) 
          
  return texto_filtrado

In [16]:
df['title'] = df['title'].apply(lambda x: RemovePalavrasLigacao(x))
df['query'] = df['query'].apply(lambda x: RemovePalavrasLigacao(x))
df['concatenated_tags'] = df['concatenated_tags'].apply(lambda x: RemovePalavrasLigacao(str(x)))

Salvando coluna de 'concatenated_tags' para uso posterior

In [17]:
col_concat = df['concatenated_tags'].copy()

##Juntando Palavras Processadas

In [18]:
df['title'] = df['title'].apply(lambda x: " ".join(x))
df['query'] = df['query'].apply(lambda x: " ".join(x))
df['concatenated_tags'] = df['concatenated_tags'].apply(lambda x: " ".join(x))

## Testando similaridades entre as strings das Colunas

In [19]:
from difflib import SequenceMatcher

In [20]:
def sml(x,y):
    return SequenceMatcher(None, x, y).ratio()

### Similaridade entre 'title' e 'query'

In [21]:
import numpy as np

In [22]:
lista_comparacao = np.array([])

for i in range(0,len(df.title)):
  a = sml(df['title'][i], df['query'][i])
  lista_comparacao = np.append(lista_comparacao,a)
df['sim_title_query'] = lista_comparacao

### Similaridade entre 'title' e 'concatenated_tags'

In [23]:
lista_comparacao = np.array([])

for i in range(0,len(df.title)):
  a = sml(df['title'][i], df['concatenated_tags'][i])
  lista_comparacao = np.append(lista_comparacao,a)
df['sim_title_concat'] = lista_comparacao

### Similaridade entre 'query' e 'concatenated_tags'

In [24]:
lista_comparacao = np.array([])

for i in range(0,len(df.title)):
  a = sml(df['query'][i], df['concatenated_tags'][i])
  lista_comparacao = np.append(lista_comparacao,a)
df['sim_query_concat'] = lista_comparacao

## Criando linhas a partir dos itens da coluna 'concatenated_tags'

In [25]:
df['concatenated_tags'] = col_concat

In [26]:
df = df.explode('concatenated_tags').reset_index().drop('index', axis=1)

## Transformando creation_date em Date Time e usando apenas ano

In [27]:
df['creation_year'] = df['creation_date'].str[:4]

## Correlação entre as colunas

In [28]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,product_id,seller_id,search_page,position,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,sim_title_query,sim_title_concat,sim_query_concat
product_id,1.0,-0.01,-0.0,-0.0,0.0,-0.01,0.0,0.01,0.02,0.04,-0.01,-0.04,-0.03
seller_id,-0.01,1.0,-0.02,-0.02,-0.0,0.01,-0.01,0.01,0.03,0.05,0.0,0.01,0.03
search_page,-0.0,-0.02,1.0,0.07,0.0,-0.03,0.03,0.05,-0.03,-0.02,-0.1,0.01,-0.03
position,-0.0,-0.02,0.07,1.0,0.01,-0.01,-0.0,0.01,-0.07,-0.07,-0.13,0.04,0.02
price,0.0,-0.0,0.0,0.01,1.0,0.24,-0.04,-0.11,-0.02,-0.05,-0.02,0.03,0.04
weight,-0.01,0.01,-0.03,-0.01,0.24,1.0,-0.04,-0.05,0.02,0.02,-0.05,-0.02,-0.01
express_delivery,0.0,-0.01,0.03,-0.0,-0.04,-0.04,1.0,0.1,-0.08,-0.08,0.07,0.0,0.0
minimum_quantity,0.01,0.01,0.05,0.01,-0.11,-0.05,0.1,1.0,0.03,0.04,-0.01,-0.06,-0.05
view_counts,0.02,0.03,-0.03,-0.07,-0.02,0.02,-0.08,0.03,1.0,0.67,-0.03,-0.32,-0.28
order_counts,0.04,0.05,-0.02,-0.07,-0.05,0.02,-0.08,0.04,0.67,1.0,-0.06,-0.32,-0.3


## Excluindo de colunas

In [29]:
df.drop(['query','title','creation_date'], axis=1, inplace=True)

## Preenchendo valores NaN  ""

In [30]:
df.fillna(0,inplace = True)

## Transformando colunas "concatenated_tags" e "creation_year" de object para string

In [31]:
df['concatenated_tags'] = df['concatenated_tags'].astype('str')
df['creation_year'] = df['creation_year'].astype('int')

## Selecionando dados inferiores a 2015

In [32]:
df2 = df.copy()
df2 = df2[df2['creation_year']<2015]

In [33]:
df2['creation_year'] = df2['creation_year'].astype('str')

## One Hot Encoding (Get Dummies)

In [34]:
col_dummies = ["concatenated_tags","creation_year"]

In [35]:
df2 = pd.get_dummies(df2, columns=col_dummies)  

## Separando dados de treino e teste

In [36]:
from sklearn.model_selection import train_test_split

label = df2["category"]
feat_cols = df2.columns.tolist()
feat_cols.remove("category")
feat = df2[feat_cols]

X_train, X_test, y_train, y_test = train_test_split(feat, label, random_state=1)

## Normalizando dados

In [37]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_ = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

## Balanceando os dados

In [38]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()
X_, y_ = sm.fit_resample(X_, y_train)

# MODELAGEM [XGBOOST]
---

## Criação do Modelo

In [None]:
import xgboost as xgb

clf_XGB = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=15,
    scale_pos_weight=100_000,
    use_label_encoder=True
)
clf_XGB.fit(X_, y_)



Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




## Validação do Modelo

In [None]:
from sklearn.metrics import plot_confusion_matrix
from matplotlib import pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(clf_XGB, X_test, y_test, normalize="true", ax=ax)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = clf_XGB.predict(X_test)
print("Report do Classificador Xgboost:\n")
print(classification_report(y_true = y_test,y_pred = y_pred,digits = 4))

# MODELAGEM [DECISION TREE]
---

## Criação do Modelo

In [None]:
from sklearn.tree import DecisionTreeClassifier

clf_DTR = DecisionTreeClassifier()
clf_DTR.fit(X_,y_)

## Validação do Modelo

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(clf_DTR, X_test, y_test, normalize="true", ax=ax)

In [None]:
y_pred = clf_DTR.predict(X_test)
print("Report do Classificador Decision Tree:\n")
print(classification_report(y_true = y_test,y_pred = y_pred,digits = 4))

#MODELAGEM [RANDOM FOREST]
---

## Criação do Modelo

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_RFC = RandomForestClassifier()
clf_RFC.fit(X_,y_)

## Validação do Modelo

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(clf_RFC, X_test, y_test, normalize="true", ax=ax)

In [None]:
y_pred = clf_RFC.predict(X_test)
print("Report do Classificador Random Forest:\n")
print(classification_report(y_true = y_test,y_pred = y_pred,digits = 4))

#MODELAGEM [SVM]
---

## Criação do Modelo

In [None]:
from sklearn.svm import SVC 

clf_SVM = SVC()
clf_SVM.fit(X_,y_)

## Validação do Modelo

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(clf_SVM, X_test, y_test, normalize="true", ax=ax)

In [None]:
y_pred = clf_SVM.predict(X_test)
print("Report do Classificador SVM:\n")
print(classification_report(y_true = y_test,y_pred = y_pred,digits = 4))

# MODELAGEM [LOGISTIC REGRESSION]
---

## Criação do Modelo

In [None]:
from sklearn.linear_model import LogisticRegression

clf_RLG = LogisticRegression()  
clf_RLG.fit(X_,y_) 

## Validação do Modelo

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(clf_RLG, X_test, y_test, normalize="true", ax=ax)

In [None]:
y_pred = clf_RLG.predict(X_test)
print("Report do Classificador Regressão Logística Múltipla:\n")
print(classification_report(y_true = y_test,y_pred = y_pred,digits = 4))

#ATUALIZAÇÃO DE REQUIREMENTS
---

In [None]:
#!python --version

In [None]:
#!pip freeze > "C:\Projeto_Final_MineracaoDados\requirements.txt"

In [None]:
df2.head()