In [1]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.corpus import stopwords
from newsplease import NewsPlease
from bs4 import BeautifulSoup
from nltk.tree import Tree
from string import digits
from nltk import tokenize
from requests import get
import pandas as pd
import nltk, string
import sys



url = 'https://timesofindia.indiatimes.com/topic/hiv/'
remove_digits = str.maketrans('', '', digits)

# Souping function
def Soup(url):
    response = get(url)
    return BeautifulSoup(response.text, 'lxml')

# Scraping total no.of pages contians in TimesOfIndia website about HIV articles
def pagecount(url):
    soup_obj = Soup(url)
    return int(soup_obj.select('a.look')[-1].text)

# Extract links from each page to scrap about HIV articles
def Get_links(url):
    links = []
    for count in range(pagecount(url)):
        soup_obj = Soup(url + str(count))
        block = ['https://timesofindia.indiatimes.com/topic' + item.a['href'] for item in soup_obj.select('.content')]
        links.extend(block)
    links = list(set(links))
    return links

# Prepares a data frame with the required attributes
def Df_builder(links):
    titles, publish_dates, authors, urls, text = [], [], [], [], []
    for index, item in enumerate(links):
        # Appending Article Content
        try:
            sys.stdout.write("\r" + str(index) + " : " +"Articles has Extracted" + "\r")
            sys.stdout.flush()
            article = NewsPlease.from_url(item)
            text.append(article.text)
        except:
            continue
    data_frame = pd.DataFrame({"Text" : text})
    return data_frame

# Extracting short phrases from text
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk

# Filtering text from removing the punctuation, digits, stopwords from it
def get_filtered_text(text):
    lowers = text.lower()
    ent = get_continuous_chunks(text)
    k = []
    for e in ent:
        for ea in e.lower().split():
            k.append(ea)
    resultwords  = [i for i in lowers.split() if i not in k]
    res = ' '.join(resultwords)
    res = res.translate(remove_digits)
    table = str.maketrans({key: None for key in string.punctuation}) 
    no_punctuation = res.translate(table)
    tokens = nltk.word_tokenize(no_punctuation)
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    return filtered

Wall time: 6.37 s


In [2]:
%%time
links = Get_links(url)

Wall time: 29.6 s


In [3]:
df = Df_builder(links)

289 : Articles has Extracted



398 : Articles has Extracted

In [4]:
%%time
data = df.dropna()
corpus = list(data['Text'])
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, use_idf=True, tokenizer=get_filtered_text)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus) #fit the vectorizer to synopses

Wall time: 2min 38s


In [5]:
# matrix shape
print(tfidf_matrix.shape)

(391, 58)


In [6]:
# Bag of Words feature extraction
print(tfidf_vectorizer.get_feature_names())

['aids', 'also', 'among', 'blood', 'cases', 'centre', 'control', 'could', 'day', 'department', 'director', 'disease', 'district', 'dr', 'even', 'family', 'first', 'found', 'get', 'government', 'health', 'hospital', 'however', 'including', 'india', 'infected', 'infection', 'last', 'like', 'living', 'made', 'many', 'medical', 'national', 'new', 'number', 'one', 'patients', 'people', 'positive', 'said', 'since', 'society', 'state', 'take', 'tested', 'three', 'time', 'treatment', 'two', 'virus', 'woman', 'would', 'year', 'years', '’', '“', '”']
