In [1]:
import pickle
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [2]:
def preprocess(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', ' ', text)
        text = re.sub('[\W]+', ' ', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)

In [3]:
data = pd.read_csv("../Data/IAB/iab_text_tiers.csv")
stop = stopwords.words('english')

In [4]:
data = data[data['with_bs4'].notna()]
data = data[data['with_justext'].notna()]
data = data[data['with_bs4'] != "exceeded"]
data = data[data['with_justext'] != "exceeded"]
data['with_bs4'] = data['with_bs4'].apply(preprocess)
data['with_justext'] = data['with_justext'].apply(preprocess)

In [5]:
data = data.reset_index()

In [6]:
data.head()

Unnamed: 0.1,index,Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext
0,0,0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,automotive definition of automotive by merria...,these example sentences are selected automati...
1,2,2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,automotive definition of automotive at dictio...,general motors warned that a global semicondu...
2,3,3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,automotive industry wikipedia automotive indu...,the automotive industry began in the 1860s wi...
3,12,12,2,1,Auto Body Styles,3,https://www.autoevolution.com/news/2021-dacia-...,"2021 Dacia Logan Reimagined With Coupe, Pickup, …","As for the Maximum Capacity Vehicle, cargo vol...",Automotive,Auto Body Styles,-,-,2021 dacia logan reimagined with coupe pickup...,2021 dacia logan reimagined with coupe pickup...
4,15,15,2,1,Auto Body Styles,6,https://autopartsfair.com/exterior_parts/,Auto Body Parts Store - Exterior Body Parts for …,"Choose, compare & buy from a wide range of aut...",Automotive,Auto Body Styles,-,-,auto body parts store exterior body parts for...,discount used auto parts store this website i...


### using with_bs4 column to train the classifier

In [7]:
tfidf = TfidfVectorizer()
text_count = tfidf.fit_transform(data['with_bs4'])
y = np.asarray(data[data.columns[9:13]])

In [8]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [9]:
len(mlb.classes_)

1177

In [10]:
knnClf = KNeighborsClassifier()
knnClf.fit(x_train, y_train)
knnpred = knnClf.predict(x_test)

In [11]:
print('accuracy_score_KNN', metrics.accuracy_score(knnpred, y_test))
print('recall_macro_score', metrics.recall_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(knnpred, y_test,average = 'weighted', zero_division = 1))

accuracy_score_KNN 0.3300536672629696
recall_macro_score 0.9259824019645818
recall_micro_score 0.8782608695652174
recall_weighted_score 0.8782608695652174
f1_macro_score 0.45082797272703645
f1_micro_score 0.7563876651982379
f1_weighted_score 0.8359921651500484
precision_macro_score 0.44160518563590745
precision_micro_score 0.6642166344294004
precision_weighted_score 0.8260345197355742


### using with_justext column to train the classifier

In [12]:
tfidf = TfidfVectorizer()
text_count = tfidf.fit_transform(data['with_justext'])
y = np.asarray(data[data.columns[9:13]])

In [13]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [14]:
len(mlb.classes_)

1177

In [None]:
knnClf = KNeighborsClassifier()
knnClf.fit(x_train, y_train)
knnpred = knnClf.predict(x_test)

In [None]:
print('accuracy_score_KNN', metrics.accuracy_score(knnpred, y_test))
print('recall_macro_score', metrics.recall_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(knnpred, y_test,average = 'weighted', zero_division = 1))