In [0]:
import requests
from bs4 import BeautifulSoup

In [0]:
def get_headlines():
    """Scrape Hindustan Times listing pages and build labelled headline dicts.

    The "it-s-viral" listing (first page plus ``?pageno=2`` .. ``14``) is
    scraped with label 1 and the "latest-news" listing (first page plus
    ``?pageno=2`` .. ``54``) with label 0. Any headline present in both
    listings is removed from the non-viral dict. The size of each dict is
    printed.

    Returns:
        tuple[dict, dict]: ``(viral, non_viral)``, each mapping a headline
        title (str) to its integer label.

    Raises:
        requests.RequestException: if a page cannot be fetched (including
            when the request times out).
    """
    # Without a timeout a single stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    def extract_hindustantimes(WebUrl, lis, value):
        """Store every headline title found at ``WebUrl`` in dict ``lis`` with label ``value``."""
        page = requests.get(WebUrl, timeout=request_timeout)
        soup = BeautifulSoup(page.text, "html.parser")
        for heading in soup.find_all('div', {'class': 'media-heading headingfour'}):
            for anchor in heading.find_all('a'):
                title = anchor.get('title')
                # .get() returns None for anchors without a title attribute;
                # a None/empty key would break the text processing downstream.
                if title:
                    lis[title] = value

    def scrape_listing(base_url, last_page, label):
        """Collect headlines from the first page and pages 2..last_page of a listing."""
        headlines = {}
        extract_hindustantimes(base_url, headlines, label)
        for page_no in range(2, last_page + 1):
            extract_hindustantimes(base_url + '?pageno=' + str(page_no), headlines, label)
        return headlines

    # Getting all the viral headlines
    viral = scrape_listing('https://www.hindustantimes.com/it-s-viral/', 14, 1)
    print('Number of viral news headlines: ',len(viral))

    # Getting all the non-viral headlines
    non_viral = scrape_listing('https://www.hindustantimes.com/latest-news/', 54, 0)

    # Remove the viral news headlines from the non_viral dictionary.
    # Dict membership is already a hash lookup, so no intermediate set is needed.
    non_viral = {
        title: label for title, label in non_viral.items() if title not in viral
    }
    print('Number of non-viral news headlines: ',len(non_viral))

    return viral,non_viral

In [3]:
# Run the scraper; this performs live HTTP requests, so the counts printed
# below depend on what the site serves at execution time.
viral, non_viral = get_headlines()

Number of viral news headlines:  221
Number of non-viral news headlines:  750


In [0]:
import numpy as np
import pandas as pd

# Stack the viral and non-viral (headline, label) pairs into one table.
# DataFrame.append was removed in pandas 2.0, so the frames are combined with
# pd.concat; the original row indices are kept, exactly as append did.
data = pd.concat(
    [
        pd.DataFrame(list(viral.items()), columns = ['headlines','label']),
        pd.DataFrame(list(non_viral.items()), columns = ['headlines','label']),
    ]
)

In [5]:
# Shuffling the dataset (fixed random_state keeps the shuffle repeatable)
data = data.sample(frac=1,  random_state=10).reset_index(drop=True)
# The last 100 shuffled rows are held out as the test set; the rest is training data
train = data[:-100]
train.to_csv('train.csv', index = False)
test = data[-100:].reset_index(drop = True)
# Persist the held-out rows (not `train`) so test.csv matches the in-memory test set
test.to_csv('test.csv', index = False)
test

Unnamed: 0,headlines,label
0,Wuhan’s L-strain may be behind Gujarat’s high ...,0
1,It would be ‘golden opportunity’ to learn from...,0
2,Employees of private firms in Gurugram may hav...,0
3,Education department braces for challenges in ...,0
4,"Plane lands in the middle of busy highway, peo...",1
...,...,...
95,"‘Need to strengthen economic activities, comba...",0
96,Will India Couture Week 2020 be a reality?,0
97,"We needed a break, but not in this way: Rinku ...",0
98,‘You and I will meet again’: Mumbai Police use...,1


In [6]:
import nltk
# Download NLTK's 'popular' data collection (see the download log in this cell's output)
nltk.download('popular')
import string
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer

# Set of English stop words.
# NOTE(review): stop_wds is not referenced by any other cell visible here — confirm it is still needed.
stop_wds = set(stopwords.words('english'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [0]:
def preprocess_text(corpus):
    """Normalise sentences and drop words that occur only once in ``corpus``.

    Each sentence has its punctuation removed (``string.punctuation`` plus the
    curly quotes ‘ and ’), is lower-cased, tokenised with
    ``wordpunct_tokenize`` and lemmatised with ``WordNetLemmatizer``. Lemmas
    whose total count across the whole corpus is 1 are then filtered out to
    reduce noise.

    Args:
        corpus: iterable of str, one sentence (headline) per element.

    Returns:
        list[str]: one space-joined string of the retained lemmas per input
        sentence, in the same order as ``corpus``.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the punctuation-deleting table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation + "‘’")

    word_freq = {}
    sentences_tokenized = []
    for sentence in corpus:
        tokens = wordpunct_tokenize(sentence.translate(strip_punctuation).lower())
        # Grouped similar words together to decrease noise in the data
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        for lemma in lemmas:
            # Direct dict lookup; no per-token set of the keys is needed.
            word_freq[lemma] = word_freq.get(lemma, 0) + 1
        sentences_tokenized.append(lemmas)

    # Keep only the words seen more than once; a set is enough because the
    # counts are used solely for this membership filter.
    frequent_words = {word for word, count in word_freq.items() if count > 1}

    return [
        " ".join(word for word in lemmas if word in frequent_words)
        for lemmas in sentences_tokenized
    ]
        
# Clean the train and test headlines.
# NOTE(review): each call computes its own word frequencies, so the
# "occurs more than once" filter for the test set is based on the test
# headlines alone rather than on the training vocabulary — confirm this is intended.
train_processed_sentences = preprocess_text(train['headlines'])
test_processed_sentences = preprocess_text(test['headlines'])

The processed headlines are vectorized with TF-IDF: the vectorizer is fitted on the training headlines only and then applied, unchanged, to the test headlines.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings
vect = TfidfVectorizer()
# Fit on the training sentences only, then reuse the fitted vectorizer for the
# test sentences so both matrices share the same feature columns
train_final = vect.fit_transform(train_processed_sentences)
test_final = vect.transform(test_processed_sentences)


In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows as a validation set; the fixed
# random_state makes the split repeatable across runs
X_train, X_val, y_train, y_val = train_test_split(train_final, train['label'], test_size=0.2, random_state=42)

In [10]:
!pip install catboost



A CatBoost classifier was chosen because it gave the best performance: validation accuracy = 87.4% and F1 score = 0.725.

In [11]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

# CatBoost classifier evaluated with F1; iteration count, tree depth and
# learning rate are fixed here
model = CatBoostClassifier(eval_metric='F1', iterations=1000, max_depth=6, learning_rate=0.03)
# Commented-out alternative model (LightGBM), left for reference:
# model = LGBMClassifier(n_estimators=1100, eval_metric = 'F1')
# Train on the training split and pass the validation split as the evaluation set
model.fit(X_train, y_train, eval_set=[(X_val,y_val)], verbose=0)
# Predicted class labels for the validation split, used for the F1 score below
y_pred = model.predict(X_val)
print('Validation Accuracy : ',  model.score(X_val,y_val))
print('F1 score : ', f1_score(y_val,y_pred))

Validation Accuracy :  0.8742857142857143
F1 score :  0.7250000000000001


In [0]:
# Class probabilities for the held-out test headlines; column 1 is used as
# the probability of the "viral" label
predictions = model.predict_proba(test_final)

# Assemble the report in one step: headline text, predicted virality as a
# percentage, and the true label rendered as Yes/No
result = pd.DataFrame(
    {
        'headlines': test['headlines'],
        'virality probability': predictions[:, 1] * 100,
        'actual virality': [
            "No" if label != 1 else 'Yes' for label in test['label']
        ],
    }
)

In [13]:
# Preview the first rows of the results table
result.head()

Unnamed: 0,headlines,virality probability,actual virality
0,Wuhan’s L-strain may be behind Gujarat’s high ...,3.25253,No
1,It would be ‘golden opportunity’ to learn from...,10.054348,No
2,Employees of private firms in Gurugram may hav...,1.946367,No
3,Education department braces for challenges in ...,16.057338,No
4,"Plane lands in the middle of busy highway, peo...",85.690024,Yes


In [0]:
# Save the per-headline results table to CSV without the index column
result.to_csv('test_results.csv', index=False)