In [18]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [16]:
# Path para o arquivo train.csv
data = pd.read_csv('bases/train.csv')

data[0:10]

Unnamed: 0,content_review,sentiment_analysis,domain_name,label
0,"I love the app, I have used it for years The o...",Positive,PhotosVideos,Usa
1,It would be better if they actually released t...,Positive,PhotosVideos,Mis
2,"Needs to be an option to pay monthly, or to op...",Negative,PhotosVideos,Mis
3,Please let there be a one week free trial I wo...,Positive,PhotosVideos,Mis
4,Can\'t connect to server. Failed big time. Ple...,Negative,PhotosVideos,Sup
5,I Just got my new phone and the app asked for ...,Negative,PhotosVideos,Mis
6,I've been having keep safe since it came out a...,Negative,PhotosVideos,"Sup,Usa"
7,It keeps saying it cannot connect to server. I...,Negative,PhotosVideos,Sup
8,Logged in from a new device as a premium membe...,Neutral,PhotosVideos,"Usa,Sup"
9,Or syncing with the laptop. Although the pic o...,Neutral,PhotosVideos,Sup


In [19]:
labels  = []

for i in data['label']:
    a = i.split(',')
    labels.append(a)

mlb = MultiLabelBinarizer(classes=("Dep", "Per", "Sup", "Usa", "Mis"))
labelsdf = pd.DataFrame(mlb.fit_transform(labels), columns=['Dep', 'Per', 'Sup', 'Usa', 'Mis']) 
data = data.assign(label_dep=labelsdf['Dep'].values, label_per=labelsdf['Per'].values, label_sup=labelsdf['Sup'].values, label_usa=labelsdf['Usa'].values, label_mis=labelsdf['Mis'].values)
data

Unnamed: 0,content_review,sentiment_analysis,domain_name,label,label_dep,label_per,label_sup,label_usa,label_mis
0,"I love the app, I have used it for years The o...",Positive,PhotosVideos,Usa,0,0,0,1,0
1,It would be better if they actually released t...,Positive,PhotosVideos,Mis,0,0,0,0,1
2,"Needs to be an option to pay monthly, or to op...",Negative,PhotosVideos,Mis,0,0,0,0,1
3,Please let there be a one week free trial I wo...,Positive,PhotosVideos,Mis,0,0,0,0,1
4,Can\'t connect to server. Failed big time. Ple...,Negative,PhotosVideos,Sup,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
5995,"I followed all the instructions airplane mode,...",Neutral,Lifestyle,Mis,0,0,0,0,1
5996,What a disappointment for 4.99 Ugh,Negative,Lifestyle,Mis,0,0,0,0,1
5997,I hate it ... this app does not work.,Negative,Lifestyle,Usa,0,0,0,1,0
5998,"Im at 28 weeks, and I was able to listen to my...",Positive,Lifestyle,Mis,0,0,0,0,1


In [21]:
len(data)

6000

In [22]:
labels = data.iloc[:, 4:].columns.values

for l in labels:
  print("Label: ", l)
  print(data[l].value_counts(), '\n')

Label:  label_dep
0    4967
1    1033
Name: label_dep, dtype: int64 

Label:  label_per
0    5885
1     115
Name: label_per, dtype: int64 

Label:  label_sup
0    5507
1     493
Name: label_sup, dtype: int64 

Label:  label_usa
0    4882
1    1118
Name: label_usa, dtype: int64 

Label:  label_mis
1    3631
0    2369
Name: label_mis, dtype: int64 



In [36]:
import re

def clean_text(text):
    
    text = text.encode('ascii', errors = 'ignore').decode() 
    text = text.lower() 
    text = re.sub(r'http\S+', ' ', text) 
    text = re.sub(r'#+', ' ', text)
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)
    text = re.sub(r"([A-Za-z]+)'s", r"\1 is", text)
    text = re.sub(r"what's", "what is ", text) 
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"won't", "will not ", text) 
    text = re.sub(r"\'ve", " have ", text) 
    text = re.sub(r"can't", "can not ", text) 
    text = re.sub(r"n't", " not ", text) 
    text = re.sub(r"isn't", "is not ", text) 
    text = re.sub(r"i'm", "i am ", text) 
    text = re.sub(r"\'re", " are ", text) 
    text = re.sub(r"\'d", " would ", text) 
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = text.strip(' ') 
    
    return text

In [37]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def remove_stopwords(text):
    
  stop_words = set(stopwords.words('english')) 

  word_tokens = word_tokenize(text) 

  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
      
  return filtered_sentence

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\steph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steph\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from tqdm import tqdm 

tqdm.pandas(desc='Limpando o texto')
data['text_tokens'] = data['content_review'].progress_apply(clean_text)

tqdm.pandas(desc='Removendo as stopwords e tokenizando o texto')
data['text_tokens'] = data['text_tokens'].progress_apply(remove_stopwords)

data[['content_review', 'text_tokens']].head()

Limpando o texto: 100%|█████████████████████████████████████████████████████████| 6000/6000 [00:00<00:00, 13213.99it/s]
Removendo as stopwords e tokenizando o texto: 100%|██████████████████████████████| 6000/6000 [00:03<00:00, 1677.96it/s]


Unnamed: 0,content_review,text_tokens
0,"I love the app, I have used it for years The o...","[love, app, used, years, problem, new, update,..."
1,It would be better if they actually released t...,"[would, better, actually, released, messages, ..."
2,"Needs to be an option to pay monthly, or to op...","[needs, option, pay, monthly, open, deleted, p..."
3,Please let there be a one week free trial I wo...,"[please, let, one, week, free, trial, would, g..."
4,Can\'t connect to server. Failed big time. Ple...,"[connect, server, failed, big, time, please, f..."


In [44]:
text_tokens = []
 
for vet in data['text_tokens'].values:
      text_tokens.extend(vet)
 
num_words = len((list(set(text_tokens))))

num_words

8551

In [45]:
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=num_words)
tok.fit_on_texts(data['text_tokens'].values)

data['X'] = tok.texts_to_sequences(data['text_tokens'])
 
data[['content_review', 'text_tokens', 'X']].head()

Unnamed: 0,content_review,text_tokens,X
0,"I love the app, I have used it for years The o...","[love, app, used, years, problem, new, update,...","[3, 1, 25, 76, 46, 20, 9, 485, 54, 538, 12, 16]"
1,It would be better if they actually released t...,"[would, better, actually, released, messages, ...","[5, 37, 187, 1527, 549, 194, 3378]"
2,"Needs to be an option to pay monthly, or to op...","[needs, option, pay, monthly, open, deleted, p...","[75, 118, 82, 716, 119, 231, 158, 97]"
3,Please let there be a one week free trial I wo...,"[please, let, one, week, free, trial, would, g...","[12, 107, 10, 232, 52, 2327, 5, 8, 378, 5, 6, ..."
4,Can\'t connect to server. Failed big time. Ple...,"[connect, server, failed, big, time, please, f...","[870, 1652, 1827, 356, 7, 12, 16, 46]"


In [46]:
data['num_words'] = data['text_tokens'].apply(lambda x : len(x))
 
max_num_words = data['num_words'].max()
 
data[['content_review', 'text_tokens', 'num_words']].nlargest(5, 'num_words')

Unnamed: 0,content_review,text_tokens,num_words
2884,"Pokmon go, is an awesome game, but apparently ...","[pokmon, go, awesome, game, apparently, nianti...",166
4454,I have to say that after using this app compar...,"[say, using, app, compared, multitude, weather...",107
2934,I appreciate the effort that has gone into thi...,"[appreciate, effort, gone, game, time, still, ...",105
2914,"The fourth is being led into by my third, and ...","[fourth, led, third, new, legendary, raids, le...",103
1278,The filters offered within this application ar...,"[filters, offered, within, application, really...",99


In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
X = pad_sequences(maxlen=max_num_words, sequences=data['X'], value=0, padding='post', truncating='post')
y = data[labels].values
 
print('Dimensão do X: {}'.format(X.shape))
print('Dimensão do y: {}'.format(y.shape))

Dimensão do X: (6000, 166)
Dimensão do y: (6000, 5)


In [48]:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42, test_size=0.2)
(X_tr.shape, y_tr.shape), (X_te.shape, y_te.shape)

(((4800, 166), (4800, 5)), ((1200, 166), (1200, 5)))

In [49]:
from skmultilearn.model_selection import iterative_train_test_split

np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=0.2)
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((4798, 166), (4798, 5)), ((1202, 166), (1202, 5)))

In [53]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import accuracy_score

BR = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

BR.fit(X_train, y_train)

y_pred = BR.predict(X_test)

print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.5507487520798668


In [54]:
BR.fit(X_tr, y_tr)

y_pr = BR.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.5491666666666667


In [55]:
from skmultilearn.problem_transform import ClassifierChain

CC = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])

CC.fit(X_train, y_train)

y_pred = CC.predict(X_test)

print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.6031613976705491


In [56]:
CC.fit(X_tr, y_tr)

y_pr = CC.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.5916666666666667


In [57]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

LP = LabelPowerset(classifier=RandomForestClassifier(n_estimators=100), require_dense=[False, True])

LP.fit(X_train, y_train)

y_pred = LP.predict(X_test)

print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.6039933444259568


In [58]:
LP.fit(X_tr, y_tr)

y_pr = LP.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.6075
