In [1]:
%run imports.ipynb
%run utils.ipynb

In [2]:
data = pd.read_csv('bases/train.csv')

In [3]:
data.head()

Unnamed: 0,content_review,sentiment_analysis,domain_name,label
0,"I love the app, I have used it for years The o...",Positive,PhotosVideos,Usa
1,It would be better if they actually released t...,Positive,PhotosVideos,Mis
2,"Needs to be an option to pay monthly, or to op...",Negative,PhotosVideos,Mis
3,Please let there be a one week free trial I wo...,Positive,PhotosVideos,Mis
4,Can\'t connect to server. Failed big time. Ple...,Negative,PhotosVideos,Sup


In [9]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20, field=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.field = field

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row[str(self.field)]).split(), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row[str(self.field)]).split())
                                     for index, row in df_x.iterrows()]))

In [6]:
data['clean_text'] = data['content_review'].apply(clean_text)

In [8]:
labels  = []

for i in data['label']:
    a = i.split(',')
    labels.append(a)

mlb = MultiLabelBinarizer(classes=("Dep", "Per", "Sup", "Usa", "Mis"))
labelsdf = pd.DataFrame(mlb.fit_transform(labels), columns=['Dep', 'Per', 'Sup', 'Usa', 'Mis']) 
data = data.assign(label_dep=labelsdf['Dep'].values, label_per=labelsdf['Per'].values, label_sup=labelsdf['Sup'].values, label_usa=labelsdf['Usa'].values, label_mis=labelsdf['Mis'].values)
data

Unnamed: 0,content_review,sentiment_analysis,domain_name,label,clean_text,label_dep,label_per,label_sup,label_usa,label_mis
0,"I love the app, I have used it for years The o...",Positive,PhotosVideos,Usa,love app year problem new updat won allow add ...,0,0,0,1,0
1,It would be better if they actually released t...,Positive,PhotosVideos,Mis,better actual releas messag instead beta,0,0,0,0,1
2,"Needs to be an option to pay monthly, or to op...",Negative,PhotosVideos,Mis,need option pai monthli open delet pictur,0,0,0,0,1
3,Please let there be a one week free trial I wo...,Positive,PhotosVideos,Mis,let week free trial premium like try,0,0,0,0,1
4,Can\'t connect to server. Failed big time. Ple...,Negative,PhotosVideos,Sup,connect server fail big time fix problem,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
5995,"I followed all the instructions airplane mode,...",Neutral,Lifestyle,Mis,follow instruct airplan mode listen sampl,0,0,0,0,1
5996,What a disappointment for 4.99 Ugh,Negative,Lifestyle,Mis,disappoint ugh,0,0,0,0,1
5997,I hate it ... this app does not work.,Negative,Lifestyle,Usa,hate app work,0,0,0,1,0
5998,"Im at 28 weeks, and I was able to listen to my...",Positive,Lifestyle,Mis,week abl listen babi heart beat need follow in...,0,0,0,0,1


In [13]:
from sklearn.model_selection import train_test_split

X = data['clean_text']
y = data.iloc[:, 5:10].values

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2)

In [14]:
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import FeatureUnion

fu = FeatureUnion(transformer_list=[('title_doc2vec',Doc2VecTransformer(field='content_review'))])

binary_rel_model = BinaryRelevance(RandomForestClassifier(n_jobs=-1))
multi_label_rf_br_model = Pipeline(steps=[
                           ('feature_union', fu),
                           ('binary_relevance', binary_rel_model)
                        ])

import sklearn.metrics as metrics

def hamming_loss(multi_label_model_pipeline,train_x, train_y, test_x, test_y):
    predictions_test_y = multi_label_model_pipeline.predict(test_x)
    return metrics.hamming_loss(y_true=test_y, y_pred=predictions_test_y)

In [15]:
multi_label_rf_br_model.fit(train_x, train_y)
print('Hamming loss for test data :', hamming_loss(multi_label_rf_br_model,train_x,train_y,test_x,test_y))

AttributeError: 'Series' object has no attribute 'iterrows'