# Model 2

In [1]:
# imports
import sys
import os
import numpy as np
import pandas as pd
import ujson
from sklearn.model_selection import train_test_split
from nltk import FreqDist
from nltk.util import ngrams
import re
import spacy
import math

import gensim
from gensim import corpora
from gensim.models import CoherenceModel

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

## Model

## Dataset

For this experiment we used a controlled dataset of just blogs entries from one blog site: https://blog.americanchemistry.com/. The American Chemistry Blog has content related to to Chemical regulation, Energy, Sustainability. 

In [2]:
# Data file
data_file ='../../../Datasets/selected/npr.org.xlsx'
source_data = pd.read_excel(data_file)

print("Shape:",source_data.shape)
source_data.head()

Shape: (7152, 4)


Unnamed: 0,article_date,article_title,article_content,article_url
0,2019-09-14,Air Ambulances Woo Rural Consumers With Member...,Visitors and park rangers at historic Fort Sco...,https://www.npr.org/sections/health-shots/2019...
1,2019-09-09,Esketamine Nasal Spray Eases Depression Sympto...,"The depression drug esketamine, marketed as Sp...",https://www.npr.org/sections/health-shots/2019...
2,2019-09-16,'Tip Of The Iceberg' ‚Äî 1 In 16 Women Reports...,A survey of women ages 18 to 44 found that for...,https://www.npr.org/sections/health-shots/2019...
3,2019-09-13,How A Proposed 3-Digit Suicide Hotline Could H...,"With suicides on the rise, the government want...",https://www.npr.org/sections/health-shots/2019...
4,2019-09-12,Untreated Hearing Loss Linked To Loneliness An...,Untreated hearing loss increases the risks of ...,https://www.npr.org/sections/health-shots/2019...


In [3]:
source_data['word_count'] = source_data['article_content'].str.split().str.len()

# View some metrics of data
print("Number of Blogs:",f'{source_data.shape[0]:,}')
print("Minimum Article Date:",min(source_data['article_date']).strftime("%b %d %Y"))
print("Maximum Article Date:",max(source_data['article_date']).strftime("%b %d %Y"))
print("Minimum Word Count:",min(source_data['word_count']))
print("Maximum Word Count:",f'{max(source_data["word_count"]):,}')

Number of Blogs: 7,152
Minimum Article Date: Jan 20 2012
Maximum Article Date: Sep 16 2019
Minimum Word Count: 1
Maximum Word Count: 4,773


## Data Preprocessing

The data preprocessing steps that we will follow inorder to feed the data to the topic model are:
- Combine Title with Blog Content
- Remove line breaks
- Remove Special Characters
- Remove small words < 3 letters
- Convert text to lowercase
- Remove stop words
- Tokenize
- Lemmatization
- Remove custom stop words

In [4]:
# Custom stop words
custom_stopwords_file ='../lookups/custom_stopwords.txt'
custom_stopwords_df = pd.read_csv(custom_stopwords_file, header=None)
print("Shape:",custom_stopwords_df.shape)
custom_stopwords = custom_stopwords_df[0].tolist()

Shape: (1138, 1)


In [5]:
# Utilities to perfrom data cleaning and preparation

nlp = spacy.load('en', disable=['parser', 'ner'])

# function to remove stopwords
def remove_stopwords(rev):
    rev_new = " ".join([i for i in rev if i not in stop_words])
    return rev_new

def lemmatization(texts, tags=['NOUN', 'ADJ']):
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output

# function to remove custom stopwords
def remove_custom_stopwords(texts):
    output = []
    for sent in texts:
        output.append([word for word in sent if word not in custom_stopwords])
    return output

In [6]:
# Merge title with content
source_data['text'] = source_data['article_title'] + " " + source_data["article_content"]

# Convert column to str
source_data['text'] = source_data['text'].apply(str)

# Replace line breaks
article_text = source_data['text'].str.replace("\n", " ")

# remove unwanted characters, numbers and symbols
article_text = article_text.str.replace("[^a-zA-Z#]", " ")

# remove short words (length < 3)
article_text = article_text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

# make entire text lowercase
article_text = [r.lower() for r in article_text]

# remove stopwords from the text
article_text = [remove_stopwords(r.split()) for r in article_text]

# Tokenize
tokenized_text = pd.Series(article_text).apply(lambda x: x.split())
# Lemmatize
tokenized_text = lemmatization(tokenized_text)
# Remove custom stopwords
tokenized_text = remove_custom_stopwords(tokenized_text)

flattened_text = []
for i in range(len(tokenized_text)):
    flattened_text.append(' '.join(tokenized_text[i]))

source_data['text'] = flattened_text

# Update word count
source_data['word_count'] = source_data['text'].str.split().str.len()

# Remove word count < 15
source_data = source_data[source_data['word_count'] > 14]
source_data = source_data.reset_index()

In [7]:
print("Shape:",source_data.shape)
source_data.head()

Shape: (7081, 7)


Unnamed: 0,index,article_date,article_title,article_content,article_url,word_count,text
0,0,2019-09-14,Air Ambulances Woo Rural Consumers With Member...,Visitors and park rangers at historic Fort Sco...,https://www.npr.org/sections/health-shots/2019...,174,ambulance rural membership visitor helicopter ...
1,1,2019-09-09,Esketamine Nasal Spray Eases Depression Sympto...,"The depression drug esketamine, marketed as Sp...",https://www.npr.org/sections/health-shots/2019...,61,spray depression symptom suicidal depression d...
2,2,2019-09-16,'Tip Of The Iceberg' ‚Äî 1 In 16 Women Reports...,A survey of women ages 18 to 44 found that for...,https://www.npr.org/sections/health-shots/2019...,105,sexual encounter rape sexual encounter sexual ...
3,3,2019-09-13,How A Proposed 3-Digit Suicide Hotline Could H...,"With suicides on the rise, the government want...",https://www.npr.org/sections/health-shots/2019...,96,digit suicide hotline suicide crisis hotline d...
4,4,2019-09-12,Untreated Hearing Loss Linked To Loneliness An...,Untreated hearing loss increases the risks of ...,https://www.npr.org/sections/health-shots/2019...,163,untreated loneliness isolation isolation isola...


In [8]:
def word_counts(all_words,cutoff=2):
    #all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    fdist = FreqDist(all_words)
    
    df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})
    df = df[df['count'] > cutoff]
    df = df.reset_index()
    
    frequency_table = {}
    for index, row in df.iterrows():
        frequency_table[row['word']] = row['count']
    
    return frequency_table

In [9]:
frequency_table = word_counts(source_data["text"].loc[0],cutoff=2)
frequency_table

{'ambulance': 11,
 'rural': 4,
 'membership': 18,
 'helicopter': 10,
 'town': 4,
 'air': 16,
 'subscription': 5,
 'insurance': 8,
 'customer': 3}

In [16]:
def evaluate_blog_results(row,frequency_table):
    text = row["text"]
    for keyword in frequency_table:
        text = text.replace(' '+keyword+' ', ' \x1b[1;03;31;46m'+ keyword + '\x1b[0m ')
    
    print(text)

In [20]:
# View some results
frequency_table = word_counts(source_data["text"].loc[1],cutoff=2)
evaluate_blog_results(source_data.loc[1],frequency_table)

spray [1;03;31;46mdepression[0m [1;03;31;46msymptom[0m [1;03;31;46msuicidal[0m [1;03;31;46mdepression[0m [1;03;31;46mdrug[0m [1;03;31;46mesketamine[0m spravato quick relief [1;03;31;46msuicide[0m [1;03;31;46mesketamine[0m [1;03;31;46mdepression[0m [1;03;31;46msymptom[0m [1;03;31;46msuicidal[0m [1;03;31;46mdrug[0m drug [1;03;31;46mdepressed[0m [1;03;31;46mesketamine[0m [1;03;31;46msuicidal[0m few [1;03;31;46msymptom[0m [1;03;31;46mdepression[0m [1;03;31;46mdrug[0m neuropsychopharmacology [1;03;31;46msuicide[0m crisis [1;03;31;46mesketamine[0m population clinical trial [1;03;31;46mdepressed[0m [1;03;31;46mdrug[0m [1;03;31;46mdepression[0m [1;03;31;46mdrug[0m [1;03;31;46mdepressed[0m envision [1;03;31;46msuicidal[0m [1;03;31;46mdrug[0m oral antidepressant hough [1;03;31;46mesketamine[0m [1;03;31;46mdepression[0m [1;03;31;46mdrug[0m [1;03;31;46mdepression[0m [1;03;31;46mdrug[0m [1;03;31;46msuicidal[0m [1;03;31;46msuicide[0

In [21]:
# View some results
frequency_table = word_counts(source_data["text"].loc[3],cutoff=2)
evaluate_blog_results(source_data.loc[3],frequency_table)

digit [1;03;31;46msuicide[0m [1;03;31;46mhotline[0m [1;03;31;46msuicide[0m [1;03;31;46mcrisis[0m [1;03;31;46mhotline[0m [1;03;31;46mdigit[0m [1;03;31;46msuicide[0m [1;03;31;46mcrisis[0m [1;03;31;46mhotline[0m [1;03;31;46mdigit[0m digit [1;03;31;46mhotline[0m [1;03;31;46mmental[0m [1;03;31;46mcrisis[0m [1;03;31;46mdigit[0m [1;03;31;46mmental[0m advocate excited [1;03;31;46mcrisis[0m costly [1;03;31;46mcrisis[0m cognitive shutdown blank extreme stress [1;03;31;46mhotline[0m facilitate dire [1;03;31;46msuicide[0m suicidal lifeline [1;03;31;46mdigit[0m [1;03;31;46mhotline[0m brain fcc congressional abuse affair lifeline [1;03;31;46mdigit[0m dialing [1;03;31;46mmental[0m behavioral brain stigma [1;03;31;46mmental[0m illness [1;03;31;46mmental[0m [1;03;31;46mcrisis[0m [1;03;31;46msuicide[0m stigma [1;03;31;46mhotline[0m normalize seek encourage suicidal extreme depression anxiety [1;03;31;46mcrisis[0m escalate overwhelmed [1;03;31;46

In [22]:
# View some results
frequency_table = word_counts(source_data["text"].loc[4],cutoff=2)
evaluate_blog_results(source_data.loc[4],frequency_table)

untreated [1;03;31;46mloneliness[0m [1;03;31;46misolation[0m isolation [1;03;31;46misolation[0m microwave appliance repair confusion frustrating struggle lonely battle expensive hassle impairment sock exercise newsletter shrivel church sermon frequent rally son endless speak reputation standoffish neighbor sit apartment horrible [1;03;31;46mloneliness[0m epidemic plaguing [1;03;31;46mhearing[0m [1;03;31;46mloneliness[0m decibel drop perception odd lonely dutch apartment [1;03;31;46mhearing[0m [1;03;31;46mloneliness[0m intensify detrimental hazardous smoking cigarette [1;03;31;46mloneliness[0m depression [1;03;31;46mdementia[0m death [1;03;31;46mhearing[0m know [1;03;31;46mhearing[0m stigma difficulty harmless routine [1;03;31;46mhearing[0m ago harm [1;03;31;46mloneliness[0m enormous consequence blood pressure elevated stress hormone immune feeling [1;03;31;46misolation[0m [1;03;31;46mdementia[0m death [1;03;31;46mdementia[0m depression surgery untangli

In [1]:
import sqlite3
sqlite3.connect('tti_data.db')

<sqlite3.Connection at 0x106721c70>