In [1]:
%run imports.ipynb
%run utils.ipynb

In [2]:
data = pd.read_csv('bases/train.csv')

In [4]:
data.head()

Unnamed: 0,content_review,sentiment_analysis,domain_name,label
0,"I love the app, I have used it for years The o...",Positive,PhotosVideos,Usa
1,It would be better if they actually released t...,Positive,PhotosVideos,Mis
2,"Needs to be an option to pay monthly, or to op...",Negative,PhotosVideos,Mis
3,Please let there be a one week free trial I wo...,Positive,PhotosVideos,Mis
4,Can\'t connect to server. Failed big time. Ple...,Negative,PhotosVideos,Sup


In [5]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [7]:
data.iloc[0,0]

"I love the app, I have used it for years The only problem is that with the new update, it won't allow me to add videos. Please fix it."

In [9]:
clean_text(data.iloc[0,0])

'love app year problem new updat won allow add video fix'

In [29]:
df_x = data[['content_review']]
df_y = data[['label']]

from sklearn.preprocessing import MultiLabelBinarizer

y = []
for index, row in df_y.iterrows():
    y.append(set(row['label'].split(',')))
    
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [12]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20, field=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.field = field

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row[str(self.field)]).split(), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row[str(self.field)]).split())
                                     for index, row in df_x.iterrows()]))

In [13]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df_x, encoded_y)


In [15]:
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier


from sklearn.pipeline import FeatureUnion

fu = FeatureUnion(transformer_list=[('title_doc2vec',Doc2VecTransformer(field='content_review'))])

binary_rel_model = BinaryRelevance(RandomForestClassifier(n_jobs=-1))
multi_label_rf_br_model = Pipeline(steps=[
                           ('feature_union', fu),
                           ('binary_relevance', binary_rel_model)
                        ])

In [16]:
import sklearn.metrics as metrics

def hamming_loss(multi_label_model_pipeline,train_x, train_y, test_x, test_y):
    predictions_test_y = multi_label_model_pipeline.predict(test_x)
    return metrics.hamming_loss(y_true=test_y, y_pred=predictions_test_y)

In [26]:
multi_label_rf_br_model.fit(train_x, train_y)
print('Hamming loss for test data :', hamming_loss(multi_label_rf_br_model,train_x,train_y,test_x,test_y))

100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2257699.52it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1127703.17it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2258509.99it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1502736.31it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1503813.88it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2257429.49it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2257429.49it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2258239.77it/s]
100%|███████████████████████████████████

Hamming loss for test data : 0.16986666666666667


In [27]:
from skmultilearn.problem_transform import ClassifierChain

classifier_chain_model = ClassifierChain(RandomForestClassifier(n_jobs=-1))
multi_label_rf_cc_model = Pipeline(steps=[
                           ('feature_union', fu),
                           ('classifier_chain', classifier_chain_model)
                        ])
multi_label_rf_cc_model.fit(train_x, train_y)
print('Hamming loss for test data :', hamming_loss(multi_label_rf_cc_model,train_x,train_y,test_x,test_y))

100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2430697.75it/s]
100%|██████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 902691.09it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1127568.43it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 2257159.53it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1811881.35it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1501182.53it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1504053.55it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4500/4500 [00:00<00:00, 1128242.45it/s]
100%|███████████████████████████████████

Hamming loss for test data : 0.16866666666666666


In [22]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt 

def plot_roc_curve(x=None, y=None, classes=[],title=None):
    
    lw=2
    plt.figure(figsize=(12,6))
    for _class in classes:
        
         class_index = np.where(mlb.classes_ == _class)[0][0]
         probs = binary_rel_model.classifiers_[class_index].predict_proba(fu.transform(x))[:,1]   
         model_fpr, model_tpr, _ = roc_curve(y[:,class_index], probs)
         roc_auc = auc(model_fpr, model_tpr)
         plt.plot(model_fpr, model_tpr,
             lw=lw, label='ROC curve -' + _class + '- (area = %0.2f)' % roc_auc)


    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

In [24]:
#plot_roc_curve(x=test_x, y=test_y, classes=['normal-distribution','data-visualization','estimation'], 
               #title='ROC curve')