In [1]:
## importing all the required libraries to perform the classification task
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## importing the nltk library to remove the stop words
import nltk
from nltk.corpus import stopwords

In [3]:
## assigning the the columns to a variable yelptrainx
yelptrainx = pd.read_csv('yelpmultilabelX.csv')

In [4]:
## assigning the labels to a variable yelptrainy
yelptrainy = pd.read_csv('yelpmultilabelY.csv')

In [5]:
##assigning the text reviews to X
X = yelptrainx['text']

In [6]:
##shape of yelptrainx
yelptrainx.shape

(4645, 10)

In [7]:
## Function to remove punctuations and stop words from each of the review
import string
def text_process(text):
        nopunc = [char for char in text if char not in string.punctuation]
        nopunc = ''.join(nopunc)
        return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [8]:
## importing count vectorizer to convert the list of tokens obtained from the above text_process function
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
## an instance is fitted to convert all the text reviews into vector format
bow_transformer = CountVectorizer(analyzer=text_process).fit(X)

In [10]:
## size of the vocabulary stored in the vectorizer
len(bow_transformer.vocabulary_)

29725

In [11]:
## transforming our X dataframe into a sparse matrix
X = bow_transformer.transform(X)

In [12]:
## shape of sparse matrix
print('Shape of Sparse Matrix: ', X.shape)

Shape of Sparse Matrix:  (4645, 29725)


In [13]:
## Amount of non zero occurances
print('Amount of Non-Zero occurrences: ', X.nnz)

Amount of Non-Zero occurrences:  321139


In [14]:
## density of X
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))

In [15]:
print('Density: {}'.format((density)))

Density: 0.23258701715854246


In [16]:
## importing train test split
from sklearn.model_selection import train_test_split

In [17]:
## assingning the yelptrainy to variable y
y=yelptrainy

In [18]:
## splitting the X and y into train and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [20]:
## importing a multilabel label power set library 
from skmultilearn.problem_transform import LabelPowerset

In [21]:
## import gaussian naive bayes
from sklearn.naive_bayes import GaussianNB

In [22]:
## building a classifier using Gaussian NB in Label Powerset
classifier = LabelPowerset(GaussianNB())

In [23]:
##fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=GaussianNB(priors=None), require_dense=[True, True])

In [24]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [25]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.35306781485468247

In [26]:
## import KNN
from sklearn.neighbors import KNeighborsClassifier

In [27]:
## building a classifier using KNN in Label powerset
classifier = LabelPowerset(KNeighborsClassifier())

In [28]:
##fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       require_dense=[True, True])

In [29]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [30]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.4940796555435953

In [31]:
## import Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [32]:
## building a classifier using Multinomial Naive Bayes in CLabel powerset
classifier = LabelPowerset(MultinomialNB())

In [33]:
##fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       require_dense=[True, True])

In [34]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [35]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.5263724434876211

In [36]:
## import Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [37]:
## importing random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [39]:
## building a classifier using Random Forest Classifier in label powerset
classifier = LabelPowerset(RandomForestClassifier())

In [40]:
##fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       require_dense=[True, True])

In [41]:
## predicting the label against the test data
predictions = classifier.predict(X_test)

In [42]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.5974165769644779

In [None]:
## building a classifier using Deciscion tree using label powerset
classifier = LabelPowerset(DecisionTreeClassifier())

In [None]:
##fitting the classifier
classifier.fit(X_train, y_train)

In [48]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [49]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.9181916038751345

In [50]:
## import Linear SVC
from sklearn.svm import LinearSVC

In [51]:
## building a classifier using Linear SVC Classifier in classifier chain
classifier = LabelPowerset(LinearSVC())

In [52]:
## fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       require_dense=[True, True])

In [53]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [54]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.759956942949408

In [55]:
## import Extra tree classifier
from sklearn.tree import ExtraTreeClassifier

In [56]:
## building a classifier using Extra tree Classifier in Label powerset
classifier = LabelPowerset(ExtraTreeClassifier())

In [57]:
## fitting the classifier
classifier.fit(X_train, y_train)

LabelPowerset(classifier=ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, random_state=None,
          splitter='random'),
       require_dense=[True, True])

In [58]:
## predicting the label against the test data 
predictions = classifier.predict(X_test)

In [59]:
## importing the accuracy score and determining the accuracy by testing the predicted label with test data original label
accuracy_score(y_test,predictions)

0.4294940796555436

In [61]:
## fitting the extra trees classifier in label powerset and predicting the label and determining the accuracy
from sklearn.ensemble import ExtraTreesClassifier

classifier = LabelPowerset(ExtraTreesClassifier())

classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

accuracy_score(y_test,predictions)


0.5059203444564048

In [17]:
## fitting the MLP classifier in label powerset and predicting the label and determining the accuracy
from sklearn.neural_network import MLPClassifier
classifier = LabelPowerset(MLPClassifier())
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.5629709364908504