# A Million News Headlines

**This dataset** contains news headlines published over a period of 15 years.

Sourced from the reputable Australian news source ABC (Australian Broadcasting Corp.)

Agency Site: http://www.abc.net.au/

## Data Description
- Format: CSV ; Single File

- publish_date: Date of publishing for the article in yyyyMMdd format
- headline_text: Text of the headline in ASCII, English, lowercase
- Start Date: 2003-02-19 End Date: 2017-12-31

Total Records: 1,103,663

Feed-Code: w3-event-abcaus; Si.gh.rank: SND

## Import Module

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm_notebook as tqdm

from tqdm import tqdm
import os

## Data Import

In [None]:
# Load the ABC headlines CSV into a DataFrame and preview its first rows.
data = pd.read_csv("../input/million-headlines/abcnews-date-text.csv")
data.head()

In [None]:
# Chronologically ordered copy of the headlines (ascending publish_date,
# missing dates last).
data_sort = data.sort_values(
    by="publish_date",
    axis=0,
    ascending=True,
    kind="quicksort",
    na_position="last",
)
data_sort.head()

## Data Processing

In [None]:
from bs4 import BeautifulSoup
# Strip URLs and HTML markup from a piece of text.
def html_tag(phrase):
    """Return *phrase* with URLs deleted and HTML reduced to plain text.

    Every run of non-whitespace characters that starts with "http" is
    removed first; the remainder is parsed with BeautifulSoup (lxml parser)
    and only its text content is returned.
    """
    without_urls = re.sub(r"http\S+", "", phrase)
    soup = BeautifulSoup(without_urls, 'lxml')
    return soup.get_text()

In [None]:
import re
# Removing words that contain digits: https://stackoverflow.com/a/18082370/4084039
# Removing special characters: https://stackoverflow.com/a/5843547/4084039
def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result.

    The substitutions are plain regex replacements applied one after
    another, so e.g. every "'s" becomes " is" (possessives included).
    """
    # Order matters: the irregular forms ("won't", "can't") must be rewritten
    # before the generic "n't" rule gets a chance to match them.
    rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
# Stop-word list based on https://gist.github.com/sebleier/554280
# 'no', 'nor' and 'not' are deliberately left out of the list so that
# negations survive the cleaning step.
# 'br' is added because "<br /><br />" ends up as "br br" once the
# non-letter characters have been stripped; a "<br/>" tag would already have
# been removed by the HTML-stripping step.
stopwords = {
    'br',
    # pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles, conjunctions, prepositions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    # adverbs and quantifiers
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments and leftovers
    's', 't', 'can', 'will', 'just', 'don', "don't",
    'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [None]:
# Clean every headline: strip URLs/HTML, expand contractions, delete tokens
# that contain a digit, keep letters only, lowercase, and drop stop words.
#
# The patterns are raw strings (the previous "\S*\d\S*" literal relied on
# invalid string escapes, which newer Python versions warn about) and are
# compiled once because they do not change between headlines.
digit_token_re = re.compile(r"\S*\d\S*")
non_letter_re = re.compile(r"[^A-Za-z]+")

processed_text = []
for headline in tqdm(data["headline_text"].values):
    cleaned = html_tag(headline)
    cleaned = decontracted(cleaned)
    cleaned = digit_token_re.sub("", cleaned)
    cleaned = non_letter_re.sub(" ", cleaned)
    lowered = (word.lower() for word in cleaned.split())
    processed_text.append(" ".join(word for word in lowered if word not in stopwords))

In [None]:
# Store the cleaned headlines next to the raw text and preview the result.
data["Clean_text"] = processed_text
data.head()

# Word-Cloud

In [None]:
def word_cloud(cluster_num):
    """Plot a word cloud of the cleaned headlines belonging to one cluster.

    Reads the module-level ``final_data`` frame: rows whose "labels" value
    equals *cluster_num* are selected and their "Clean_text" is rendered.

    Parameters
    ----------
    cluster_num
        Cluster label to visualise (compared against ``final_data["labels"]``).
    """
    headlines = final_data["Clean_text"][final_data["labels"] == cluster_num]
    # Join with a space: joining with "" glued the last word of one headline
    # to the first word of the next and produced bogus words in the cloud.
    text = ' '.join(headlines)
    wordcloud = WordCloud(background_color="white").generate(text)
    print(f"Cluster Number: {cluster_num}")
    plt.figure(figsize=(9,6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

## Bag-of-Words (BOW)

In [None]:
from wordcloud import WordCloud
from sklearn.cluster import KMeans

# Work on the first 100,000 headlines only. An explicit copy is taken because
# a "labels" column is added to (and dropped from) this frame later on; doing
# that on a bare slice of `data` is chained assignment and triggers pandas'
# SettingWithCopyWarning.
final_data = data[0:100000].copy()

# Bag-of-words counts over unigrams and bigrams of the cleaned headlines.
bow = CountVectorizer(ngram_range=(1,2))
bow_vector = bow.fit_transform(final_data["Clean_text"])

# Record the K-Means inertia for k = 2..9 (input for the elbow plot).
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    k_mean= KMeans(n_clusters=i,n_init=10)
    k_mean.fit(bow_vector)
    inertia.append(k_mean.inertia_)

In [None]:
# Elbow plot: K-Means inertia against the number of clusters (BOW features).
plt.figure(figsize=(9,6))
sns.set_style(style="whitegrid")
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so lineplot(clusters, inertia) raises TypeError there.
sns.lineplot(x=clusters, y=inertia)
plt.xlabel("No of clusters",fontsize=12)
plt.ylabel("Loss",fontsize=12)
plt.title("Error plot for various no of clusters",fontsize=14)
plt.show()

In [None]:
# Fit K-Means with k=6 on the BOW matrix and tag each headline with its
# cluster label.
k_mean = KMeans(n_clusters=6, n_init=10).fit(bow_vector)
final_data["labels"] = k_mean.labels_

In [None]:
def NWordCloud(n_clusters):
    """Draw one word cloud per cluster label in a two-column grid.

    Reads the module-level ``final_data`` frame; for every label in
    ``0 .. n_clusters - 1`` the "Clean_text" of the rows carrying that
    "labels" value is rendered into its own subplot.

    Parameters
    ----------
    n_clusters : int
        Number of cluster labels to draw.
    """
    print("Number of clusters: {}".format(n_clusters))

    plt.figure(figsize=(15,15))

    # Two clouds per row -> ceil(n_clusters / 2) rows. The former
    # `type(n_clusters/2)==float` test was always true in Python 3, which
    # reserved a superfluous empty row for even cluster counts.
    rows = (n_clusters + 1) // 2
    for label in range(n_clusters):
        headlines = final_data["Clean_text"][final_data["labels"] == label]
        # Join with a space so that neighbouring headlines do not get their
        # boundary words fused into one bogus token.
        text = ' '.join(headlines)
        wordcloud = WordCloud(background_color="white").generate(text)
        plt.subplot(rows, 2, label + 1)
        plt.imshow(wordcloud)
        # Inside the loop so every panel loses its axes, not just the last.
        plt.axis("off")
    plt.show()

In [None]:
# Word clouds for the six clusters found on the BOW features.
NWordCloud(6)

## TFIDF

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf-idf value increases proportionally to the number of times a word appears in the document and is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general. Nowadays, tf-idf is one of the most popular term-weighting schemes; 83% of text-based recommender systems in the domain of digital libraries use tf-idf.

In [None]:
# TF-IDF features of the cleaned headlines.
tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit_transform(final_data["Clean_text"])

# Record the K-Means inertia for k = 2..9 (input for the elbow plot).
clusters = list(range(2, 10))
inertia = []
for i in tqdm(clusters):
    k_mean = KMeans(n_clusters=i, n_init=10).fit(tfidf_vector)
    inertia.append(k_mean.inertia_)

In [None]:
# Elbow plot: K-Means inertia against the number of clusters (TF-IDF features).
plt.figure(figsize=(9,6))
sns.set_style(style="whitegrid")
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so lineplot(clusters, inertia) raises TypeError there.
sns.lineplot(x=clusters, y=inertia)
plt.xlabel("No of clusters",fontsize=12)
plt.ylabel("Loss",fontsize=12)
plt.title("Error plot for various no of clusters",fontsize=14)
plt.show()

In [None]:
# Replace any labels from a previous clustering with the TF-IDF K-Means
# (k=8) labels.
# errors='ignore' keeps the cell runnable when no "labels" column exists yet
# (e.g. the earlier clustering cells were skipped or this cell is re-run);
# rebinding the result instead of inplace=True avoids mutating what may be a
# slice of `data`. The redundant axis=1 is gone since `columns=` already
# names the axis.
final_data = final_data.drop(columns='labels', errors='ignore')
k_mean = KMeans(n_clusters=8,n_init=10)
k_mean.fit(tfidf_vector)
final_data["labels"] = k_mean.labels_

# Sorted list of the cluster ids that were actually assigned.
unique_labels = sorted(final_data["labels"].unique())

In [None]:
# Word clouds for the eight clusters found on the TF-IDF features.
NWordCloud(8)

## Average Word2Vec

In [None]:
# Tokenise the cleaned headlines on whitespace.
list_of_sentance = [sentance.split() for sentance in tqdm(final_data["Clean_text"])]
x = final_data["Clean_text"]
print(f"Shape of X Train : {x.shape}")

# Train 50-dimensional Word2Vec embeddings on the headlines.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` / `wv.key_to_index`. Left unchanged here to
# match the gensim version this notebook was written against - confirm.
w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)
# Set for O(1) membership tests; scanning the `w2v_words` list for every
# token of every headline was needlessly slow.
w2v_vocab = set(w2v_words)

# Represent each headline by the mean vector of its in-vocabulary words
# (all zeros when none of its words is in the vocabulary).
w2v_vector = []
for sent in tqdm(list_of_sentance):
    known_words = [word for word in sent if word in w2v_vocab]
    word_vector = np.zeros(50)
    for word in known_words:
        word_vector += w2v_model.wv[word]
    if known_words:
        word_vector /= len(known_words)
    w2v_vector.append(word_vector)
print(f"Length of w2v_vector: {len(w2v_vector)}")

# Record the K-Means inertia for k = 2..9 (input for the elbow plot).
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    k_mean= KMeans(n_clusters=i,n_init=10)
    k_mean.fit(w2v_vector)
    inertia.append(k_mean.inertia_)

In [None]:
# Elbow plot: K-Means inertia against the number of clusters (average Word2Vec).
plt.figure(figsize=(9,6))
sns.set_style(style="whitegrid")
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so lineplot(clusters, inertia) raises TypeError there.
sns.lineplot(x=clusters, y=inertia)
plt.xlabel("No of clusters",fontsize=12)
plt.ylabel("Loss",fontsize=12)
plt.title("Error plot for various no of clusters",fontsize=14)
plt.show()

In [None]:
# Replace any labels from a previous clustering with the average-Word2Vec
# K-Means (k=5) labels.
# errors='ignore' keeps the cell runnable when no "labels" column exists yet;
# rebinding the result instead of inplace=True avoids mutating what may be a
# slice of `data`. The redundant axis=1 is gone since `columns=` already
# names the axis.
final_data = final_data.drop(columns='labels', errors='ignore')
k_mean = KMeans(n_clusters=5,n_init=10)
k_mean.fit(w2v_vector)
final_data["labels"] = k_mean.labels_
# Sorted list of the cluster ids that were actually assigned.
unique_labels = sorted(final_data["labels"].unique())

In [None]:
# Word clouds for the five clusters found on the average Word2Vec vectors.
NWordCloud(5)

## TFIDF Weighted W2V

In [None]:
# Tokenise the cleaned headlines on whitespace.
list_of_sentance = [sentance.split() for sentance in tqdm(final_data["Clean_text"])]
x = final_data["Clean_text"]
print(f"Shape of final_data: {x.shape}")

# IDF weight of every term in the TF-IDF vocabulary.
model = TfidfVectorizer()
model.fit(final_data["Clean_text"])
tfidf_feat = model.get_feature_names()
dictionary = dict(zip(tfidf_feat, list(model.idf_)))

# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` / `wv.key_to_index`. Left unchanged here to
# match the gensim version this notebook was written against - confirm.
w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
w2v_words = list(w2v_model.wv.vocab)

# Words that have both an embedding and an IDF weight. One set lookup
# replaces two linear scans of Python lists per token.
weighted_vocab = set(w2v_words).intersection(dictionary)

# Represent each headline by the TF-IDF-weighted mean of its word vectors
# (all zeros when none of its words carries a weight).
tfidf_w2v_vector = []
for sent in tqdm(list_of_sentance):
    sent_vec = np.zeros(50)
    weight_sum = 0
    for word in sent:
        if word in weighted_vocab:
            vec = w2v_model.wv[word]
            # idf * term frequency of the word within this headline
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_w2v_vector.append(sent_vec)
print(f"Length of tfidf_w2v_vector {len(tfidf_w2v_vector)}")

# Record the K-Means inertia for k = 2..9 (input for the elbow plot).
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    k_mean= KMeans(n_clusters=i,n_init=10)
    k_mean.fit(tfidf_w2v_vector)
    inertia.append(k_mean.inertia_)

In [None]:
# Elbow plot: K-Means inertia against the number of clusters
# (TF-IDF-weighted Word2Vec).
plt.figure(figsize=(9,6))
sns.set_style(style="whitegrid")
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so lineplot(clusters, inertia) raises TypeError there.
sns.lineplot(x=clusters, y=inertia)
plt.xlabel("No of clusters",fontsize=12)
plt.ylabel("Loss",fontsize=12)
plt.title("Error plot for various no of clusters",fontsize=14)
plt.show()

In [None]:
# Fit K-Means with k=5 on the TF-IDF-weighted Word2Vec vectors and overwrite
# the "labels" column with the resulting cluster ids.
k_mean = KMeans(n_clusters=5, n_init=10).fit(tfidf_w2v_vector)
final_data["labels"] = k_mean.labels_
# Sorted list of the cluster ids that were actually assigned.
unique_labels = sorted(final_data["labels"].unique())

In [None]:
# Word clouds for the five clusters found on the TF-IDF-weighted Word2Vec vectors.
NWordCloud(5)