In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
import collections
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import gensim
import sklearn.feature_extraction.text as sktext
import re


In [None]:
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize

# English stop words as a set, used for membership tests throughout the notebook.
# NLTK raises LookupError when the 'stopwords' corpus is not installed locally,
# so download it on demand and retry instead of failing on a fresh environment.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

In [None]:
# Read at most the first 50,000 rows of the headlines CSV and preview them.
HEADLINES_CSV = '/kaggle/input/million-headlines/abcnews-date-text.csv'
MAX_ROWS = 50000

data = pd.read_csv(HEADLINES_CSV, nrows=MAX_ROWS)
data.head()

In [None]:
# Dataset info: one summary row per column of `data`, holding the Python type
# of the column's first value, its null / non-null counts and the number of
# distinct values.
#
# The first value is read positionally with .iloc[0]; a plain [0] is a label
# lookup and raises KeyError whenever the index does not contain the label 0.
column_summary = [
    [col, type(data[col].iloc[0]), data[col].isnull().sum(), data[col].nunique()]
    for col in data.columns
]

df = pd.DataFrame(
    data=column_summary,
    columns=['column_name', 'Datatype', 'null_count', 'unique_count'],
)
# Derived column, placed directly before 'null_count'.
df.insert(2, 'non_null_count', len(data) - df['null_count'])
df

In [None]:
# Per-headline meta-features, each written to `data` as a new column.
# Every extractor receives the raw cell value and coerces it with str(), so
# non-string cells are measured by their string form.
def _headline_words(headline):
    """Split the string form of a headline on whitespace."""
    return str(headline).split()


_METAFEATURE_EXTRACTORS = {
    'word_count': lambda h: len(_headline_words(h)),
    'unique_word_count': lambda h: len(set(_headline_words(h))),
    'stop_word_count': lambda h: sum(1 for w in str(h).lower().split() if w in STOPWORDS),
    'mean_word_length': lambda h: np.mean([len(w) for w in _headline_words(h)]),
    'char_count': lambda h: len(str(h)),
    'punctuation_count': lambda h: sum(1 for c in str(h) if c in string.punctuation),
}

# Dict order is insertion order, so columns are added in the order listed above.
for feature_name, extractor in _METAFEATURE_EXTRACTORS.items():
    data[feature_name] = data['headline_text'].apply(extractor)

In [None]:
# Column names of the per-headline meta-features computed on `data`.
METAFEATURES = [
    'word_count',
    'unique_word_count',
    'stop_word_count',
    'mean_word_length',
    'char_count',
    'punctuation_count',
]


In [None]:
# One 10-bin histogram per meta-feature column; the return value of .hist()
# is kept in `p`.
p = data[METAFEATURES].hist(
    bins=10,
    grid=False,
    figsize=(20, 30),
)

In [None]:
def generate_top_ngrams(data, n=None, N=100):
    """Return the N most frequent word n-grams found in a collection of texts.

    Parameters
    ----------
    data : iterable of str
        Documents to count n-grams in (for example a Series of headlines).
    n : int, optional
        Size of the n-grams. ``None`` (the default) is treated as 1, i.e.
        unigrams; passing ``None`` through to ``ngram_range`` is rejected by
        CountVectorizer.
    N : int, default 100
        Maximum number of rows in the result.

    Returns
    -------
    pandas.DataFrame
        Column ``0`` holds the n-gram and column ``1`` its total count over
        all documents, sorted by count in descending order.
    """
    if n is None:
        n = 1

    vec = sktext.CountVectorizer(ngram_range=(n, n))
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(data)
    # Summing a sparse matrix yields a 1 x V np.matrix; flatten it to a plain
    # 1-D array so the counts can be indexed by vocabulary position.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()

    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)

    return pd.DataFrame(words_freq[:N])

In [None]:
# Unigrams: most frequent single words, n-gram on the y axis and count on the x axis.
data_unigrams = generate_top_ngrams(data['headline_text'], n=1)

fig, axis = plt.subplots(figsize=(20, 40), dpi=100)
sns.barplot(x=data_unigrams[1], y=data_unigrams[0], ax=axis)
axis.set_title('Top 100 unigrams in news headlines')
plt.show()

In [None]:
# Bigrams: most frequent two-word sequences, n-gram on the y axis and count on the x axis.
data_bigrams = generate_top_ngrams(data['headline_text'], n=2)

fig, axis = plt.subplots(figsize=(20, 40), dpi=100)
sns.barplot(x=data_bigrams[1], y=data_bigrams[0], ax=axis)
axis.set_title('Top 100 bigrams in news headlines')
plt.show()

In [None]:
# Trigrams: most frequent three-word sequences, n-gram on the y axis and count on the x axis.
data_trigrams = generate_top_ngrams(data['headline_text'], n=3)

fig, axis = plt.subplots(ncols=1, nrows=1, figsize=(20, 40), dpi=100)
sns.barplot(y=data_trigrams[0], x=data_trigrams[1], ax=axis)
# The chart shows trigrams, so the title must say so.
axis.set_title('Top 100 trigrams in news headlines')
plt.show()

In [None]:
# Flatten all headlines into one list of whitespace-separated tokens.
# .str.split() leaves missing (or non-string) entries as NaN, which cannot be
# iterated, so those rows are dropped before flattening.
corpus = [
    word
    for tokens in data['headline_text'].str.split().dropna()
    for word in tokens
]


In [None]:
# Token frequencies over the whole corpus, most frequent first.
counter = collections.Counter(corpus)
most_common_words = counter.most_common()

# From the 50 most frequent tokens, keep only those that are not stop words.
non_stop_pairs = [
    (word, count)
    for word, count in most_common_words[:50]
    if word not in STOPWORDS
]
words = [pair[0] for pair in non_stop_pairs]
counts = [pair[1] for pair in non_stop_pairs]

fig, axis = plt.subplots(figsize=(20, 10), dpi=100)
sns.barplot(x=counts, y=words, ax=axis)
axis.set_title('Most common words in corpus (except stopwords)')
plt.show()

In [None]:
# From the 50 most frequent tokens, keep only the stop words.
stop_pairs = [
    (word, count)
    for word, count in most_common_words[:50]
    if word in STOPWORDS
]
x = [pair[0] for pair in stop_pairs]
y = [pair[1] for pair in stop_pairs]

# Counts go on the horizontal axis, words on the vertical axis.
fig, axis = plt.subplots(figsize=(20, 10), dpi=100)
sns.barplot(x=y, y=x, ax=axis)
axis.set_title('Most common stop words in corpus')
plt.show()

In [None]:
from wordcloud import WordCloud

# Word cloud settings, collected in one place and unpacked into the constructor.
wordcloud_options = {
    'background_color': 'white',
    'stopwords': set(STOPWORDS),  # pass a copy of the stop-word set
    'max_words': 100,
    'max_font_size': 100,
    'scale': 3,
    'random_state': 1,
}
wordcloud = WordCloud(**wordcloud_options)

In [None]:
# Word cloud of the raw corpus.
# generate() takes plain text. str(corpus) would hand it the list's repr
# (brackets, commas and quote characters around every token), so the tokens
# are joined with single spaces instead.
wordcloud = wordcloud.generate(' '.join(corpus))
fig = plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
def _preprocess_text(text):
    """Tokenise, filter and lemmatise each document in ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents to process; each item is passed to ``word_tokenize``.

    Returns
    -------
    list of list of str
        One list per input document, holding the lemmatised tokens that are
        not in ``STOPWORDS`` and are longer than two characters.
    """
    lem = WordNetLemmatizer()
    processed = []
    for news in text:
        # Stop words are removed on the raw tokens, before lemmatisation.
        words = [w for w in word_tokenize(news) if w not in STOPWORDS]
        processed.append([lem.lemmatize(w) for w in words if len(w) > 2])
    return processed

# _preprocess_text iterates over documents, so give it the headlines themselves
# rather than the flattened word list (which made every "document" one word).
# Missing headlines are dropped because word_tokenize needs a string.
corpus_processed = _preprocess_text(data['headline_text'].dropna().astype(str))

In [None]:
# Word cloud of the preprocessed corpus.
# corpus_processed is a list of token lists; flatten it and join the tokens
# with spaces so generate() receives plain text, not the repr of a nested list.
processed_text = ' '.join(
    token
    for tokens in corpus_processed
    for token in tokens
)
wordcloud = wordcloud.generate(processed_text)
fig = plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()