# Task 3:
Multiple Mixture components using labeled data: You will relax the assumption made in the
first 2 experiments. You will consider that a single news article can belong to several subtopics
and experiment with a Naive Bayes classifier using multiple mixture components on the labeled
dataset.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
train = pd.read_csv('../data/train_cleaned_stemed_reuters.csv', delimiter = ',')
test = pd.read_csv('../data/test_cleaned_stemed_reuters.csv', delimiter = ',')
train 

Unnamed: 0.1,Unnamed: 0,X,Y
0,0,bahia cocoa review shower continu throughout w...,['cocoa']
1,1,comput termin system lt cpml complet sale comp...,['acq']
2,2,n z trade bank deposit growth rise slightli ne...,['money-supply']
3,3,nation amus up viacom lt via bid viacom intern...,['acq']
4,4,roger lt rog see st qtr net significantli roge...,['earn']
...,...,...,...
7764,7764,u k money market shortag forecast revis bank e...,"['interest', 'money-fx']"
7765,7765,knight ridder inc lt krn set quarterli qtli di...,['earn']
7766,7766,technitrol inc lt tnl set quarterli qtli div c...,['earn']
7767,7767,nationwid cellular servic inc lt ncel th qtr s...,['earn']


In [3]:
train['Y'] = train.Y.apply(lambda x: re.sub('\s+', '', x))
train['Y'] = train.Y.apply(lambda x: re.sub('\'', '', x))
train['Y'] = train.Y.apply(lambda x: x[1:-1].split(','))

train["Y"][7764]

['interest', 'money-fx']

In [4]:
test['Y'] = test.Y.apply(lambda x: re.sub('\s+', '', x))
test['Y'] = test.Y.apply(lambda x: re.sub('\'', '', x))
test['Y'] = test.Y.apply(lambda x: x[1:-1].split(','))

In [5]:
train_X = np.array(train["X"])
train_Y = np.array(train["Y"])
train_X.shape

(7769,)

In [6]:
test_X = np.array(test["X"])
test_Y = np.array(test["Y"])
test_X[:2]

array(['asian export fear damag u japan rift mount trade friction u japan rais fear among mani asia export nation row could inflict far reach econom damag businessmen offici said told reuter correspond asian capit u move japan might boost protectionist sentiment u lead curb american import product export said conflict would hurt long run short term tokyo loss might gain u said impos mln dlr tariff import japanes electron good april retali japan alleg failur stick pact sell semiconductor world market cost unoffici japanes estim put impact tariff billion dlr spokesmen major electron firm said would virtual halt export product hit new tax abl busi said spokesman lead japanes electron firm matsushita electr industri co ltd lt mc tariff remain place length time beyond month mean complet eros export good subject tariff u said tom murtha stock analyst tokyo offic broker lt jame capel co taiwan businessmen offici also worri awar serious u threat japan serv warn us said senior taiwanes trade of

In [7]:
vectorizer = TfidfVectorizer()
train_data_vectors = vectorizer.fit_transform(train_X)
train_data_vectors.shape

(7769, 18034)

In [8]:
test_data_vectors = vectorizer.transform(test_X)
test_data_vectors.shape

(3019, 18034)

In [9]:
bayes_classifier = MultinomialNB(alpha=0.01)
bayes_classifier

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [10]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
new_y = mlb.fit_transform(train_Y)
new_y

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
new_y.shape

(7769, 90)

In [12]:
for n in new_y:
    if sum(n) > 1:
        print(n)
        break

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]


In [13]:
# bayes_classifier.fit(train_data_vectors,new_y)

from sklearn.multioutput import MultiOutputClassifier
multi_target = MultiOutputClassifier(bayes_classifier, n_jobs=-1)
multi_target.fit(train_data_vectors,new_y)


MultiOutputClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                              fit_prior=True),
                      n_jobs=-1)

In [14]:
test_predict = multi_target.predict(test_data_vectors)
test_predict

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
test_predict[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0])

In [16]:
new_y_test = mlb.transform(test_Y)
new_y_test[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0])

In [17]:
acc = accuracy_score(new_y_test, test_predict)
acc

0.6840013249420338

In [18]:
from sklearn import metrics
print(metrics.classification_report(new_y_test, test_predict))

              precision    recall  f1-score   support

           0       0.96      0.81      0.88       719
           1       1.00      0.35      0.52        23
           2       0.89      0.57      0.70        14
           3       0.85      0.37      0.51        30
           4       0.50      0.11      0.18        18
           5       0.00      0.00      0.00         1
           6       0.93      0.78      0.85        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.93      0.89      0.91        28
          10       0.60      0.50      0.55        18
          11       0.00      0.00      0.00         1
          12       0.52      0.71      0.60        56
          13       0.88      0.35      0.50        20
          14       0.00      0.00      0.00         2
          15       0.55      0.21      0.31        28
          16       0.00      0.00      0.00         1
          17       0.87    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [19]:
plt.figure(figsize=(14,14))
sns.set()
sns.heatmap(metrics.confusion_matrix(new_y_test, test_predict) , annot=True, fmt="d", linewidths=.5)
plt.title("Confusion Matrix")

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

<Figure size 1008x1008 with 0 Axes>