In [6]:
import pandas as pd, numpy as np
import re, nltk
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords 
import warnings
warnings.filterwarnings('ignore')

# Loading the dataset
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text
# Analyse the data
# ..................
# tokenising the sentences
stp_wrds = stopwords.words('english')
# clean the text
def clean_review(input_text):
    out_text = re.sub(r'@\w+', '', input_text)
    out_text = re.sub(r'https?://[A-Za-z0-9./]+', '', out_text)
    out_text = re.sub('www?://[A-Za-z0-9./]+','',out_text)
    out_text = out_text.lower()
    return out_text
def removeStopWords(text):
    stopwords_list = set(nltk.corpus.stopwords.words('english') + list(punctuation))
    whitelist = ["n't", "not", "no"]
    words = text.split()
    clean_words = [words for word in words if (word not in stopwords_list or word in whitelist)]
    out = ''.join(map(str, clean_words))
    return out 
def stemText(text):
    porter = PorterStemmer()
    words = text.split()
    stemmed_words = [porter.stem(word) for word in words]
    return " ".join(stemmed_words)
# Define functions to sanitize the data
# 2. Check the hygiene of data and sanitize it.
# ............................
documents['clean_text'] = documents['headline_text'].apply(clean_review).apply(removeStopWords).apply(stemText)

# Vectorize the data and create a Document Term Matrix (DTM)
# 3. Create DTM using following parameters:
count_vect = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, ngram_range = (1,2), stop_words='english' )    
vect = count_vect.fit_transform(documents['clean_text'])
# ...........................

# Fit a LDA model on Document Term Matrix created above
# 4. Fit a LDA model with 5 components
# n_components = 5
# ...........................
lat_mod = LatentDirichletAllocation(n_components = 5)
model = lat_mod.fit_transform(vect)
# Vizualize the topics generated
# 5. Create a report
# **************************
import mglearn as mg  
sorting = np.argsort(lat_mod.components_)[:,::-1] 
features = np.array(count_vect.get_feature_names()) 
topics = mg.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=25)
print(topics)


topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
court         new           man           killed        police        
man           health        interview     home          govt          
australia     world         accused       school        water         
council       south         says          boost         nsw           
crash         hospital      trial         year          sa            
charged       woman         open          help          qld           
day           cup           police        melbourne     calls         
win           north         plans         market        sydney        
set           minister      plan          power         death         
missing       attack        final         farmers       rural         
election      dead          test          house         mp            
wa            west          budget        adelaide      talks         
wins  