### load and preprocess

In [1]:
import pickle
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [2]:
def preprocess(text):
    if isinstance((text), (str)):
        text = re.sub('<[^>]*>', ' ', text)
        text = re.sub('[\W]+', ' ', text.lower())
        return text
    if isinstance((text), (list)):
        return_list = []
        for i in range(len(text)):
            temp_text = re.sub('<[^>]*>', '', text[i])
            temp_text = re.sub('[\W]+', '', temp_text.lower())
            return_list.append(temp_text)
        return(return_list)

In [3]:
data = pd.read_csv("../Data/IAB/iab_text_tiers.csv")
stop = stopwords.words('english')

In [4]:
df = data[data['with_bs4'].notna()]
df = df[df['with_bs4'] != "exceeded"]
df['with_bs4'] = df['with_bs4'].apply(preprocess)
df = df[df['with_justext'].notna()]
df = df[df['with_justext'] != "exceeded"]
df['with_bs4'] = df['with_justext'].apply(preprocess)
df = df.reset_index()
print(len(df))

8943


In [5]:
df.head()

Unnamed: 0.1,index,Unnamed: 0,Unique ID,Parent,Name,Position,URL,Title,Snippet,Tier 1,Tier 2,Tier 3,Tier 4,with_bs4,with_justext
0,0,0,1,-,Automotive,1,https://www.merriam-webster.com/dictionary/aut...,Automotive | Definition of Automotive by …,Automotive definition is - self-propelled. How...,Automotive,-,-,-,these example sentences are selected automati...,These example sentences are selected automat...
1,2,2,1,-,Automotive,3,https://www.dictionary.com/browse/automotive,Automotive | Definition of Automotive at …,"Automotive definition, pertaining to the desig...",Automotive,-,-,-,general motors warned that a global semicondu...,General Motors warned that a global semicond...
2,3,3,1,-,Automotive,4,https://en.wikipedia.org/wiki/Automotive_industry,Automotive industry - Wikipedia,The automotive industry comprises a wide range...,Automotive,-,-,-,the automotive industry began in the 1860s wi...,The automotive industry began in the 1860s w...
3,12,12,2,1,Auto Body Styles,3,https://www.autoevolution.com/news/2021-dacia-...,"2021 Dacia Logan Reimagined With Coupe, Pickup, …","As for the Maximum Capacity Vehicle, cargo vol...",Automotive,Auto Body Styles,-,-,2021 dacia logan reimagined with coupe pickup...,2021 Dacia Logan Reimagined With Coupe Picku...
4,15,15,2,1,Auto Body Styles,6,https://autopartsfair.com/exterior_parts/,Auto Body Parts Store - Exterior Body Parts for …,"Choose, compare & buy from a wide range of aut...",Automotive,Auto Body Styles,-,-,discount used auto parts store this website i...,Discount Used Auto Parts Store This website...


### using with_bs4 column to train the classifier

In [6]:
tfidf = TfidfVectorizer()
text_count = tfidf.fit_transform(df['with_bs4'])
y = np.asarray(df[df.columns[9:13]])

In [7]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [8]:
len(mlb.classes_)

1177

In [9]:
knnClf = KNeighborsClassifier()
knnClf.fit(x_train, y_train)
knnpred = knnClf.predict(x_test)

In [10]:
print('accuracy_score_KNN', metrics.accuracy_score(knnpred, y_test))
print('recall_macro_score', metrics.recall_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(knnpred, y_test,average = 'weighted', zero_division = 1))

accuracy_score_KNN 0.3488372093023256
recall_macro_score 0.9148924830327165
recall_micro_score 0.8734241048915784
recall_weighted_score 0.8734241048915784
f1_macro_score 0.4641459427330578
f1_micro_score 0.7583187390542907
f1_weighted_score 0.8329658657790768
precision_macro_score 0.46137423088345414
precision_micro_score 0.6700193423597679
precision_weighted_score 0.8250198351480282


In [11]:
s = "Jump to navigation NEWSLIVE TV Home APPMAGAZINE HOMEMY FEEDVIDEOSMALAYALAMINDIAGAMINGFACT CHECKQUIZMOVIESHEALTHTECHSPORTSDIU NewsLifestyleYoga can help you keep heart diseases at bay Yoga can help you keep heart diseases at bay Yoga can become a way of life to help you deal with heart diseases. Priyanka Sharma New Delhi June 21, 2018UPDATED: June 21, 2018 18:59 IST Chair posture is proving beneficial for patients who can’t sit on the floor or have disabilities Chair posture is proving beneficial for patients who can’t sit on the floor or have disabilities Even in matters of the heart, literally speaking, yoga has an impact. A study by the All India Institute of Medical Sciences (AIIMS) has established that a person can regulate her/his heartbeats through yoga. AIIMS, in what it claimed to be the first study of its kind to have scientifically established the benefits of yoga, revealed that breathing in a particular rhythm has many positive effects on cardiovascular health. It also helps in overcoming anger, fear, stress and hypertension. Dr KK Deepak, author of the study and head of the Department of Physiology at AIIMS, told Mail Today: A slow yogic breathing can curtail the feeling of stress, fear, anger, tension and also regulate diabetes. The study has found that there is a correlation between heartbeat and blood pressure. Yogic breathing can synchronise the two which is a big deal.  The results of the study, funded by Indian Council for Medical Research (ICMR) and the Ministry of AYUSH, have been published in the latest issue of Indian Journal of Medical Research. When we do deep-breathing, blood pressure comes into the normal range. This indicates that there is a strong relationship between heart rate and blood pressure. If a persons heart rate falls, suddenly the BP will go up and if the BP comes down, the heart rate shoots up. Thus, creating a balance between the two is very important - which can be done by yogic breathing,  Dr Deepak said. In another study conducted by RML Hospital, doctors have found that practising asanas led to significant improvement in patients suffering from mental health illnesses, as compared with those who did other forms of exercise. We made two groups of patients yoga and non-yoga. The yoga group patients cognitive behaviour improved tremendously as compared to the patients doing other exercises, said (Prof) Smita N. Deshpande, Head the Department of Psychiatry & De-Addiction at RML Hospital. Chair Yoga for health Offering relief to patients who cannot sit on floor, Delhi's Sir Ganga Ram Hospital has introduced a new yoga initiative called 'chair yoga' in which the patients can remain seated on a chair and perform health postures. Dr Soina Rawat, director at the department of executive health check up at the hospital, said,A large number of patients were facing problems in doing yoga because of various disabilities. They wanted to practice yoga but couldnt sit on the floor. So we started the concept of chair yoga at our hospital. It is really helpful and their health status has improved. Binda Shukla, 50, is one patient who has been practicing chair yoga for several months. She suffers from upper body stiffness, and has cervical and cholesterol problems. There is definitely relief ever since I started doing meditation and yoga. I feel fresh when I perform deep breathing asanas and am taking less medication now. Even those who have knee joint related complications are comfortable with chair yoga, says Dr Shukla To see whether yoga can improve cognitive functions of patients with severe mental disorder as compared to cardiac controls and to compare improvements in patients with schizophrenia. Time period: Samples of last two years in the study included patients with depression, bipolar disorder, schizophrenia and cardiac controls. Patients were aged between 18-60. Another study was conducted to ascertain randomised control trail on schizophrenia on yoga and non-yoga groups. Exercises: Yoga training included chanting of Om, warm up exercises, breathing (pranayama), various yogic asanas Results: There was an improvement in the speed parameters of most of the cognitive domains among patients with schizophrenia after yoga, compared with the schizophrenia non-yoga group patients. The healthy parameters sustained for six months at least. Tags :Follow YogaFollow International Yoga Day POST A COMMENT READ THIS BJP chief JP Nadda Top BJP leaders take to social media to condemn tweets on farmers' stir by Rihanna, Greta The efficacy of double masking: What health experts have to say Silenced Minority? Airtel, Jio, Vi best prepaid plans with streaming and data benefits under Rs 500 RECOMMENDED WATCH RIGHT NOW 01:00 Watch: Arvind Kejriwal takes first dose of Covid vaccine 02:27 Bengal assembly polls: Mamata Banerjee to file nomination from Nandigram on March 11 00:49 Bengali singer Aditi Munshi joins Trinamool Congress 02:57 Karnataka minister Ramesh Jarkiholi resigns after sex CD scandal 02:01 Good news: Prisoners beautify jail compound in UP's Maharajganj TOP TAKES EC directs petrol pumps to remove hoardings showing PM Modi's photos01:10 EC directs petrol pumps to remove hoardings showing PM Modi's photos BJP calls Rahul Gandhi's push-up challenge violation of code of conduct, writes to EC00:46 BJP calls Rahul Gandhi's push-up challenge violation of code of conduct, writes to EC Delhi Police foils plan to kill 2 Delhi riots accused in Tihar Jail02:30 Delhi Police foils plan to kill 2 Delhi riots accused in Tihar Jail INDIATODAY.IN Download App Andriod AppIOS AppSmartTv App Copyright © 2021 Living Media India Limited. For reprint rights: Syndications Today Covid-19 pandemic: 5 things one should know about heart failure Covid-19 pandemic: 5 things one should know about heart failure KCR's Backward Classes Challenge KCR's Backward Classes Challenge What genes tell us about the risk of developing cancer What genes tell us about the risk of developing cancer Redefining heart health with cutting edge technologies Redefining heart health with cutting edge technologies How yoga can help you to get rid of stress headache How yoga can help you to get rid of stress headache Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Do's and dont's to keep in mind while opting for IVF treatment: Here's all you need to know Do's and dont's to keep in mind while opting for IVF treatment: Here's all you need to know Be a style icon with these groovy women handbags Be a style icon with these groovy women handbags India's Avant-Garde India's Avant-Garde"
s = preprocess(s)
s = tfidf.transform([s])
mlb.inverse_transform(knnClf.predict(s))

[('-',
  'Diseases and Conditions',
  'Heart and Cardiovascular Diseases',
  'Medical Health')]

### using with_justext column to train the classifier

In [12]:
tfidf = TfidfVectorizer()
text_count = tfidf.fit_transform(df['with_justext'])
y = np.asarray(df[df.columns[9:13]])

In [13]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(text_count, y, test_size = 0.25, random_state = 5)

In [14]:
len(mlb.classes_)

1177

In [15]:
knnClf = KNeighborsClassifier()
knnClf.fit(x_train, y_train)
knnpred = knnClf.predict(x_test)

In [16]:
print('accuracy_score_KNN', metrics.accuracy_score(knnpred, y_test))
print('recall_macro_score', metrics.recall_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('recall_micro_score', metrics.recall_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('recall_weighted_score', metrics.recall_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('f1_macro_score', metrics.f1_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('f1_micro_score', metrics.f1_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('f1_weighted_score', metrics.f1_score(knnpred, y_test,average = 'weighted', zero_division = 1))
print('precision_macro_score', metrics.precision_score(knnpred, y_test,average = 'macro', zero_division = 1))
print('precision_micro_score', metrics.precision_score(knnpred, y_test,average = 'micro', zero_division = 1))
print('precision_weighted_score', metrics.precision_score(knnpred, y_test,average = 'weighted', zero_division = 1))

accuracy_score_KNN 0.3492844364937388
recall_macro_score 0.9158501800775894
recall_micro_score 0.8742857142857143
recall_weighted_score 0.8742857142857143
f1_macro_score 0.46520555220221826
f1_micro_score 0.7591390003648304
f1_weighted_score 0.8340425572171611
precision_macro_score 0.4617068376447794
precision_micro_score 0.6707930367504835
precision_weighted_score 0.8257457167114176


In [17]:
s = "Jump to navigation NEWSLIVE TV Home APPMAGAZINE HOMEMY FEEDVIDEOSMALAYALAMINDIAGAMINGFACT CHECKQUIZMOVIESHEALTHTECHSPORTSDIU NewsLifestyleYoga can help you keep heart diseases at bay Yoga can help you keep heart diseases at bay Yoga can become a way of life to help you deal with heart diseases. Priyanka Sharma New Delhi June 21, 2018UPDATED: June 21, 2018 18:59 IST Chair posture is proving beneficial for patients who can’t sit on the floor or have disabilities Chair posture is proving beneficial for patients who can’t sit on the floor or have disabilities Even in matters of the heart, literally speaking, yoga has an impact. A study by the All India Institute of Medical Sciences (AIIMS) has established that a person can regulate her/his heartbeats through yoga. AIIMS, in what it claimed to be the first study of its kind to have scientifically established the benefits of yoga, revealed that breathing in a particular rhythm has many positive effects on cardiovascular health. It also helps in overcoming anger, fear, stress and hypertension. Dr KK Deepak, author of the study and head of the Department of Physiology at AIIMS, told Mail Today: A slow yogic breathing can curtail the feeling of stress, fear, anger, tension and also regulate diabetes. The study has found that there is a correlation between heartbeat and blood pressure. Yogic breathing can synchronise the two which is a big deal.  The results of the study, funded by Indian Council for Medical Research (ICMR) and the Ministry of AYUSH, have been published in the latest issue of Indian Journal of Medical Research. When we do deep-breathing, blood pressure comes into the normal range. This indicates that there is a strong relationship between heart rate and blood pressure. If a persons heart rate falls, suddenly the BP will go up and if the BP comes down, the heart rate shoots up. Thus, creating a balance between the two is very important - which can be done by yogic breathing,  Dr Deepak said. In another study conducted by RML Hospital, doctors have found that practising asanas led to significant improvement in patients suffering from mental health illnesses, as compared with those who did other forms of exercise. We made two groups of patients yoga and non-yoga. The yoga group patients cognitive behaviour improved tremendously as compared to the patients doing other exercises, said (Prof) Smita N. Deshpande, Head the Department of Psychiatry & De-Addiction at RML Hospital. Chair Yoga for health Offering relief to patients who cannot sit on floor, Delhi's Sir Ganga Ram Hospital has introduced a new yoga initiative called 'chair yoga' in which the patients can remain seated on a chair and perform health postures. Dr Soina Rawat, director at the department of executive health check up at the hospital, said,A large number of patients were facing problems in doing yoga because of various disabilities. They wanted to practice yoga but couldnt sit on the floor. So we started the concept of chair yoga at our hospital. It is really helpful and their health status has improved. Binda Shukla, 50, is one patient who has been practicing chair yoga for several months. She suffers from upper body stiffness, and has cervical and cholesterol problems. There is definitely relief ever since I started doing meditation and yoga. I feel fresh when I perform deep breathing asanas and am taking less medication now. Even those who have knee joint related complications are comfortable with chair yoga, says Dr Shukla To see whether yoga can improve cognitive functions of patients with severe mental disorder as compared to cardiac controls and to compare improvements in patients with schizophrenia. Time period: Samples of last two years in the study included patients with depression, bipolar disorder, schizophrenia and cardiac controls. Patients were aged between 18-60. Another study was conducted to ascertain randomised control trail on schizophrenia on yoga and non-yoga groups. Exercises: Yoga training included chanting of Om, warm up exercises, breathing (pranayama), various yogic asanas Results: There was an improvement in the speed parameters of most of the cognitive domains among patients with schizophrenia after yoga, compared with the schizophrenia non-yoga group patients. The healthy parameters sustained for six months at least. Tags :Follow YogaFollow International Yoga Day POST A COMMENT READ THIS BJP chief JP Nadda Top BJP leaders take to social media to condemn tweets on farmers' stir by Rihanna, Greta The efficacy of double masking: What health experts have to say Silenced Minority? Airtel, Jio, Vi best prepaid plans with streaming and data benefits under Rs 500 RECOMMENDED WATCH RIGHT NOW 01:00 Watch: Arvind Kejriwal takes first dose of Covid vaccine 02:27 Bengal assembly polls: Mamata Banerjee to file nomination from Nandigram on March 11 00:49 Bengali singer Aditi Munshi joins Trinamool Congress 02:57 Karnataka minister Ramesh Jarkiholi resigns after sex CD scandal 02:01 Good news: Prisoners beautify jail compound in UP's Maharajganj TOP TAKES EC directs petrol pumps to remove hoardings showing PM Modi's photos01:10 EC directs petrol pumps to remove hoardings showing PM Modi's photos BJP calls Rahul Gandhi's push-up challenge violation of code of conduct, writes to EC00:46 BJP calls Rahul Gandhi's push-up challenge violation of code of conduct, writes to EC Delhi Police foils plan to kill 2 Delhi riots accused in Tihar Jail02:30 Delhi Police foils plan to kill 2 Delhi riots accused in Tihar Jail INDIATODAY.IN Download App Andriod AppIOS AppSmartTv App Copyright © 2021 Living Media India Limited. For reprint rights: Syndications Today Covid-19 pandemic: 5 things one should know about heart failure Covid-19 pandemic: 5 things one should know about heart failure KCR's Backward Classes Challenge KCR's Backward Classes Challenge What genes tell us about the risk of developing cancer What genes tell us about the risk of developing cancer Redefining heart health with cutting edge technologies Redefining heart health with cutting edge technologies How yoga can help you to get rid of stress headache How yoga can help you to get rid of stress headache Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Designer sarees to mark your distinctive impression Do's and dont's to keep in mind while opting for IVF treatment: Here's all you need to know Do's and dont's to keep in mind while opting for IVF treatment: Here's all you need to know Be a style icon with these groovy women handbags Be a style icon with these groovy women handbags India's Avant-Garde India's Avant-Garde"
s = preprocess(s)
s = tfidf.transform([s])
mlb.inverse_transform(knnClf.predict(s))

[('-',
  'Diseases and Conditions',
  'Heart and Cardiovascular Diseases',
  'Medical Health')]

In [20]:
f = open('../Data/IAB/IAB_binarizer_2.p', 'wb')
pickle.dump(mlb, f)
f.close()

In [21]:
f = open('../Data/IAB/IAB_vectorizer_2.p', 'wb')
pickle.dump(tfidf, f)
f.close()

In [22]:
f = open('../Data/IAB/IAB_classifier_2.p', 'wb')
pickle.dump(knnClf, f)
f.close()

In [24]:
import sklearn
sklearn.__version__

'0.23.2'