### load and preprocess

In [1]:
import pickle
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
def preprocess(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', ' ', text)
        text = re.sub('[\W]+', ' ', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)

In [3]:
data = pd.read_csv("../Data/IAB/iab_text_tiers.csv")
stop = stopwords.words('english')

In [4]:
data = data[data['with_bs4'].notna()]
data = data[data['with_justext'].notna()]
data = data[data['with_bs4'] != "exceeded"]
data = data[data['with_justext'] != "exceeded"]
data['with_bs4'] = data['with_bs4'].apply(preprocess)
data['with_justext'] = data['with_justext'].apply(preprocess)

In [5]:
data = data.reset_index()

In [6]:
data.head()

Unnamed: 0.1,index,Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext
0,0,0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,automotive definition of automotive by merria...,these example sentences are selected automati...
1,2,2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,automotive definition of automotive at dictio...,general motors warned that a global semicondu...
2,3,3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,automotive industry wikipedia automotive indu...,the automotive industry began in the 1860s wi...
3,12,12,2,1,Auto Body Styles,3,https://www.autoevolution.com/news/2021-dacia-...,"2021 Dacia Logan Reimagined With Coupe, Pickup, …","As for the Maximum Capacity Vehicle, cargo vol...",Automotive,Auto Body Styles,-,-,2021 dacia logan reimagined with coupe pickup...,2021 dacia logan reimagined with coupe pickup...
4,15,15,2,1,Auto Body Styles,6,https://autopartsfair.com/exterior_parts/,Auto Body Parts Store - Exterior Body Parts for …,"Choose, compare & buy from a wide range of aut...",Automotive,Auto Body Styles,-,-,auto body parts store exterior body parts for...,discount used auto parts store this website i...


### using with_bs4 column + NB to train the classifier

In [7]:
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 5)
text_count = tfidf.fit_transform(data['with_bs4'])
y = np.asarray(data[data.columns[9:13]])

In [8]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [9]:
x_train[0], y_train[0]

(<1x48659 sparse matrix of type '<class 'numpy.float64'>'
 	with 450 stored elements in Compressed Sparse Row format>,
 array([1, 0, 0, ..., 0, 0, 0]))

In [10]:
mlb.classes_

array(['-', '3-D Graphics', 'Abkhaz', ..., 'Zulu', 'eSports',
       'the Democratic Republic of the Congo'], dtype=object)

In [11]:
model = MultinomialNB(alpha = 0.1)
model = OneVsRestClassifier(model)
model.fit(x_train, y_train)



OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1))

In [12]:
y_pred = model.predict(x_test)

In [13]:
print('accuracy_score_KNN', metrics.accuracy_score(y_pred, y_test))
print('recall_macro_score', metrics.recall_score(y_pred, y_test, average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(y_pred, y_test, average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(y_pred, y_test, average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(y_pred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(y_pred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(y_pred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(y_pred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(y_pred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(y_pred, y_test,average = 'weighted', zero_division = 1))

accuracy_score_KNN 0.005813953488372093
recall_macro_score 0.015627826685731032
recall_micro_score 0.32972680551798755
recall_weighted_score 0.32972680551798755
f1_macro_score 0.011565109282088853
f1_micro_score 0.3880929640241961
f1_weighted_score 0.3241680524447632
precision_macro_score 0.13182277115711555
precision_micro_score 0.47156673114119924
precision_weighted_score 0.3980819385480484


In [14]:
i = 4313
s = data.iloc[i].with_bs4
print(i, s)
s = tfidf.transform([s])
mlb.inverse_transform(model.predict(s))

4313  10 casual style tips for men who want to look sharp skip to content restart your style main menu home blog men s style guide effortless outfits grooming about contact 10 casual style tips for guys who want to look sharp 144 comments by robert 4367shares share tweet pin annoying isn t it you want to dress better but most style advice revolves around suiting up or just around whatever s trendy this season but that s not what you re looking for you just want to make a better first impression on people you meet in everyday life you just want to look good in your clothes without looking too flashy you just want to look like a better dressed version of you and that version of you still likes to keep it casual so what you re really looking for is some casual style tips for guys who want to look sharp outside of a suit and tie well you ve come to the right place let s begin 1 stop dressing like a boy dress like a grown up many men approach their casual style from the wrong angle they aim

[('-', 'Style & Fashion')]

### using with_justext column  + NB to train the classifier

In [15]:
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 5)
text_count = tfidf.fit_transform(data['with_justext'])
y = np.asarray(data[data.columns[9:13]])

In [16]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [17]:
x_train[0], y_train[0]

(<1x30955 sparse matrix of type '<class 'numpy.float64'>'
 	with 411 stored elements in Compressed Sparse Row format>,
 array([1, 0, 0, ..., 0, 0, 0]))

In [18]:
len(mlb.classes_)

1177

In [19]:
model = MultinomialNB(alpha = 0.1)
model = OneVsRestClassifier(model)
model.fit(x_train, y_train)



OneVsRestClassifier(estimator=MultinomialNB(alpha=0.1))

In [20]:
y_pred = model.predict(x_test)

In [21]:
print('accuracy_score_KNN', metrics.accuracy_score(y_pred, y_test))
print('recall_macro_score', metrics.recall_score(y_pred, y_test, average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(y_pred, y_test, average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(y_pred, y_test, average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(y_pred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(y_pred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(y_pred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(y_pred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(y_pred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(y_pred, y_test,average = 'weighted', zero_division = 1))

accuracy_score_KNN 0.007602862254025045
recall_macro_score 0.9943962619243805
recall_micro_score 0.8860042482888837
recall_weighted_score 0.8860042482888837
f1_macro_score 0.13422977523656038
f1_micro_score 0.6260840560373583
f1_weighted_score 0.8604080138736665
precision_macro_score 0.1314882021268148
precision_micro_score 0.4840747904577692
precision_weighted_score 0.8546483491913421
