In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Dados

In [2]:
df=pd.read_excel('teste_smarkio_lbs.xls', sheet_name='NLP')
print(df.shape)
df.head(10)

(518, 2)


Unnamed: 0,letra,artista
0,Jay-z Uh-uh-uh You ready b? Let's go get 'em. ...,Beyoncé
1,Your challengers are a young group from Housto...,Beyoncé
2,"Dum-da-de-da Do, do, do, do, do, do (Coming do...",Beyoncé
3,If I ain't got nothing I got you If I ain't go...,Beyoncé
4,Six inch heels She walked in the club like nob...,Beyoncé
5,(hello) hello How are you (oh) I just got to s...,Beyoncé
6,"Shoulders sideways, smack it, smack it in the ...",Beyoncé
7,"Clap, clap, clap like you don't care Ooh we b...",Beyoncé
8,"Shoulders sideways, smack it, smack it in the ...",Beyoncé
9,Do you think You could fall for a woman like m...,Beyoncé


# Pré-procesamento de texto
Preparando o dataset para a aplicação do algoritmo de naive Bayes

##### Exclusão de capitulares


In [3]:
df["letra"] = df["letra"].str.lower()
df["letra"].head(10)

0    jay-z uh-uh-uh you ready b? let's go get 'em. ...
1    your challengers are a young group from housto...
2    dum-da-de-da do, do, do, do, do, do (coming do...
3    if i ain't got nothing i got you if i ain't go...
4    six inch heels she walked in the club like nob...
5    (hello) hello how are you (oh) i just got to s...
6    shoulders sideways, smack it, smack it in the ...
7    clap, clap, clap like you don't care  ooh we b...
8    shoulders sideways, smack it, smack it in the ...
9    do you think you could fall for a woman like m...
Name: letra, dtype: object

##### Tokenização

In [4]:
#Contraindo apóstrofos e hífens para não serem separados na tokenização
df['letra'] = df['letra'].str.replace("'", "")
df['letra'] = df['letra'].str.replace("-", "")

In [5]:
tokenizer = nltk.RegexpTokenizer(r'[\w\']+')
df['letra'] = df['letra'].map(tokenizer.tokenize)
df['letra'].head(10)

0    [jayz, uhuhuh, you, ready, b, lets, go, get, e...
1    [your, challengers, are, a, young, group, from...
2    [dumdadeda, do, do, do, do, do, do, coming, do...
3    [if, i, aint, got, nothing, i, got, you, if, i...
4    [six, inch, heels, she, walked, in, the, club,...
5    [hello, hello, how, are, you, oh, i, just, got...
6    [shoulders, sideways, smack, it, smack, it, in...
7    [clap, clap, clap, like, you, dont, care, ooh,...
8    [shoulders, sideways, smack, it, smack, it, in...
9    [do, you, think, you, could, fall, for, a, wom...
Name: letra, dtype: object

##### Remoção de 'stopwords'

In [6]:
stopwords_en = stopwords.words('english')

In [7]:
df['letra']=df['letra'].apply(lambda x: [item for item in x if item not in stopwords_en])
df['letra'].head(10)

0    [jayz, uhuhuh, ready, b, lets, go, get, em, lo...
1    [challengers, young, group, houston, welcome, ...
2    [dumdadeda, coming, dripping, candy, ground, s...
3    [aint, got, nothing, got, aint, got, something...
4    [six, inch, heels, walked, club, like, nobodys...
5    [hello, hello, oh, got, say, hold, know, thing...
6    [shoulders, sideways, smack, smack, air, legs,...
7    [clap, clap, clap, like, dont, care, ooh, frea...
8    [shoulders, sideways, smack, smack, air, legs,...
9    [think, could, fall, woman, like, cause, find,...
Name: letra, dtype: object

##### 'Stemming' para a obtenção da raiz das palavras

In [8]:
stemmer=nltk.stem.SnowballStemmer('english')
df['letra'] = df['letra'].apply(lambda x: [stemmer.stem(y) for y in x])
df['letra'].head(10)

0    [jayz, uhuhuh, readi, b, let, go, get, em, loo...
1    [challeng, young, group, houston, welcom, beyo...
2    [dumdadeda, come, drip, candi, ground, stay, y...
3    [aint, got, noth, got, aint, got, someth, dont...
4    [six, inch, heel, walk, club, like, nobodi, bu...
5    [hello, hello, oh, got, say, hold, know, thing...
6    [shoulder, sideway, smack, smack, air, leg, mo...
7    [clap, clap, clap, like, dont, care, ooh, frea...
8    [shoulder, sideway, smack, smack, air, leg, mo...
9    [think, could, fall, woman, like, caus, find, ...
Name: letra, dtype: object

##### Vetorização
Gerando uma matriz TD-IDF

In [9]:
def preprocessing_text(textos):
    return textos
vectorizer = TfidfVectorizer(
    tokenizer=preprocessing_text,
    preprocessor=preprocessing_text,
    token_pattern=None) 


In [10]:
X = vectorizer.fit_transform(df['letra'])
y=df['artista'].values

##### Entrada X

In [11]:
pd.DataFrame(X.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6409,6410,6411,6412,6413,6414,6415,6416,6417,6418
0,0.0,0.055606,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.14249,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
514,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
515,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
516,0.0,0.000000,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Saída y

In [12]:
pd.DataFrame(y).head()

Unnamed: 0,0
0,Beyoncé
1,Beyoncé
2,Beyoncé
3,Beyoncé
4,Beyoncé


# 5. Crie um classificador, a partir da segunda aba - NLP do arquivo de dados, que permita identificar qual trecho de música corresponde às respectivas artistas listadas (Sugestão: Naive Bayes Classifier).

52.9% das letras são do rótulo mais frequente (Beyoncé), portanto, só fará sentido um classificador que acerte pelo menos 52.9% das previsões.

In [13]:
artistas=df['artista'].value_counts()/df.shape[0]
print(artistas)

Beyoncé    0.528958
Rihanna    0.471042
Name: artista, dtype: float64


Split dos dados

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=111)

### Naive Bayes

In [15]:
nb_classifier=BernoulliNB()
nb_classifier.fit(X_train, y_train)
nb_predict = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predict)
nb_cm = confusion_matrix(y_test, nb_predict)

In [16]:
print("Matriz de confusão: Observados (linhas) x Classificados (colunas)")

pd.DataFrame({'Beyoncé':nb_cm[0], 'Rihanna':nb_cm[1]}, 
            index=['Beyoncé', 'Rihanna'])


Matriz de confusão: Observados (linhas) x Classificados (colunas)


Unnamed: 0,Beyoncé,Rihanna
Beyoncé,39,6
Rihanna,46,65


In [17]:
print("Acurácia: " + str(np.round(nb_accuracy,4)))

Acurácia: 0.6667


### SVM

In [18]:
svm_classifier=SVC(random_state=111)
svm_classifier.fit(X_train, y_train)
svm_predict = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predict)
svm_cm = confusion_matrix(y_test, svm_predict)

In [19]:
print("Matriz de confusão: Observados (linhas) x Classificados (colunas)")

pd.DataFrame({'Beyoncé':svm_cm[0], 'Rihanna':svm_cm[1]}, 
            index=['Beyoncé', 'Rihanna'])


Matriz de confusão: Observados (linhas) x Classificados (colunas)


Unnamed: 0,Beyoncé,Rihanna
Beyoncé,74,37
Rihanna,11,34


In [20]:
print("Acurácia: " + str(np.round(svm_accuracy,4)))

Acurácia: 0.6923


### Random forest

In [21]:
rf_classifier=RandomForestClassifier(random_state=111)
rf_classifier.fit(X_train, y_train)
rf_predict = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predict)
rf_cm = confusion_matrix(y_test, rf_predict)

In [22]:
print("Matriz de confusão: Observados (linhas) x Classificados (colunas)")

pd.DataFrame({'Beyoncé':rf_cm[0], 'Rihanna':rf_cm[1]}, 
            index=['Beyoncé', 'Rihanna'])

Matriz de confusão: Observados (linhas) x Classificados (colunas)


Unnamed: 0,Beyoncé,Rihanna
Beyoncé,63,27
Rihanna,22,44


In [23]:
print("Acurácia: " + str(np.round(rf_accuracy,4)))

Acurácia: 0.6859


### Redes neurais

In [24]:
rn_classifier=MLPClassifier(random_state=111)
rn_classifier.fit(X_train, y_train)
rn_predict = rn_classifier.predict(X_test)
rn_accuracy = accuracy_score(y_test, rn_predict)
rn_cm = confusion_matrix(y_test, rn_predict)

In [25]:
print("Matriz de confusão: Observados (linhas) x Classificados (colunas)")

pd.DataFrame({'Beyoncé':rn_cm[0], 'Rihanna':rn_cm[1]}, 
            index=['Beyoncé', 'Rihanna'])

Matriz de confusão: Observados (linhas) x Classificados (colunas)


Unnamed: 0,Beyoncé,Rihanna
Beyoncé,59,23
Rihanna,26,48


In [26]:
print("Acurácia: " + str(np.round(rn_accuracy,4)))

Acurácia: 0.6859


## Comparação

O algoritmo de melhor acurácia, com 69.29% de acertos, foi o SVM. Tentaremos melhorá-lo "tunando" os hiperparâmetros.

In [27]:
pd.DataFrame({'Acurácia':[np.round(nb_accuracy,4),
              np.round(svm_accuracy,4),
              np.round(rf_accuracy,4),
              np.round(rn_accuracy,4)]
             },
            index=['Naive bayes', 'SVM', 'Random forest', 'Redes neurais'])

Unnamed: 0,Acurácia
Naive bayes,0.6667
SVM,0.6923
Random forest,0.6859
Redes neurais,0.6859


## Grid search para tuning dos parâmetros

In [28]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid'), 
              'C':[0.1, 0.3, 0.5, 1, 3, 10], 
              'gamma':['scale',0.1, 0.3, 0.5, 1, 3, 10]}
grid = GridSearchCV(svm_classifier, parameters)
grid_fit = grid.fit(X, y)

In [29]:
best_params = grid_fit.best_params_
print("Melhores parâmetros:\n" + str(best_params))

Melhores parâmetros:
{'C': 3, 'gamma': 0.3, 'kernel': 'sigmoid'}


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10000)
svm_classifier=SVC(C=3, gamma=0.3, kernel='sigmoid')
svm_classifier.fit(X_train, y_train)
svm_predict = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predict)
svm_cm = confusion_matrix(y_test, svm_predict)

In [31]:
print("Acurácia:" + str(np.round(svm_accuracy,4)))

Acurácia:0.7244


## Salvando o modelo

Após a melhora acima, podemos colocar em produção um modelo com 72.44% de acurácia.

In [32]:
import pickle
pkl_filename = "mod_final.pkl"

with open(pkl_filename, 'wb') as file:
    pickle.dump(svm_classifier, file)