## Poets, meet Natural Language Processing

### Extract the data

In [1]:
# Dependencies to read the SQLite database
import pandas as pd
import sqlite3
from pprint import pprint
import numpy as np

In [2]:
def create_dataframe(poet):
    """ Load the data from database into a dataframe """
    df = pd.read_sql_query(f"SELECT * FROM {poet};", conn)
    return df

In [3]:
# Connect to the poetry database
conn = sqlite3.connect("db/Poetry.db")

In [4]:
# Create a list of unique poets
poet_list = ["Frost", "Yeats", "Kipling"]

# Iterate through the list to create a list of dataframes
poems_df = [create_dataframe(poet) for poet in poet_list]

In [5]:
# Preview the dataframes
poems_df[1].head()

Unnamed: 0,index,title,link,lines,poet
0,0,Under Ben Bulben,https://www.poetryfoundation.org/poems/43298/u...,"I\n Swear by what the Sages spoke,\n Round the...",William Butler Yeats
1,1,A Coat,https://www.poetryfoundation.org/poetrymagazin...,"I made my song a coat,Covered with embroiderie...",William Butler Yeats
2,2,A Dialogue of Self and Soul,https://www.poetryfoundation.org/poems/43294/a...,IMy Soul. I summon to the winding ancient stai...,William Butler Yeats
3,3,A Drinking Song,https://www.poetryfoundation.org/poems/50337/a...,"Wine comes in at the mouth,\n And love comes i...",William Butler Yeats
4,4,A Meditation in Time of War,https://www.poetryfoundation.org/poems/57318/a...,"For one throb of the artery, ,While on that ol...",William Butler Yeats


In [6]:
# Disconnect from the poetry database
conn.close()

In [7]:
# Define a function that selects the relevant columns
def BuildDataframe(df):
    """ Create a new dataframe containing metadata """
    
    df1 = df[["title", "poet"]] # Select the relevant columns from the dataframe
    
    lines = df["lines"].values.tolist() # Convert the lines column into a list of strings
    df1["lines"] = [x.replace("\n", " ") for x in lines] # Remove special characters and white spaces 
    df1["lines"] = df1["lines"].str.lower() # Put all letters in lower case
    
    return df1

In [8]:
# For each dataframe, choose the relevant columns
poems_df1 = [BuildDataframe(df) for df in poems_df] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
# Preview each dataframe
# [0] = Frost, [1] = Yeats, [2] = Kipling
poems_df1[2].head()

Unnamed: 0,title,poet,lines
0,Prelude,Rudyard Kipling,(to departmental ditties)i have eaten your bre...
1,A General Summary,Rudyard Kipling,we are very slightly changed from the semi-ape...
2,Army Headquarters,Rudyard Kipling,old is the song that i sing old as my unpaid b...
3,"Study of an Elevation, in Indian Ink",Rudyard Kipling,this ditty is a string of lies. buthow the deu...
4,Delilah,Rudyard Kipling,"we have another viceroy now, those days are de..."


### Getting metadata about the poems

In [10]:
# Get information about publication year (various sources)
pubyear_Frost = [1913, 1916, 1928, 1914, 1916, 1916, 1923, 1923, 1928, 1923,
           1923, 1923, 1920, 1914, 1923, 1913, 1914, 1913, 1917, 1923,
           1916, 1913, 1923, 1923, 1923, 1914, 1942, 1923, 1916, 1914,
           1916, 1918, 1916, 1923, 1913, 1914, 1920]

pubyear_Yeats = [1938, 1914, 1933, 1916, 1921, 1919, 1904, 1913, 1919, 1933,
                 1932, 1889, 1916, 1898, 1927, 1938, 1904, 1916, 1921, 1915,
                 1938, 1909, 1928, 1916, 1916, 1899, 1939, 1916, 1899, 1916,
                 1899, 1917, 1892, 1914, 1917, 1889, 1921, 1889, 1899, 1892,
                 1928, 1917, 1914, 1889, 1892, 1892, 1892, 1933, 1914, 1933,
                 1917, 1914, 1933, 1912, 1919, 1935, 1917, 1914, 1934, 1934,
                 1934, 1916, 1916, 1935, 1916, 1919, 1912, 1919, 1914, 1916,
                 1916, 1912, 1919, 1916, 1916, 1914, 1912, 1934, 1914, 1912,
                 1914, 1916]

pubyear_Kipling = [1922] * 416 + [1919, 1922, 1920, 1902, 1904, 1895, 1904, 1917, 1895, 1916, 
                                  1920, 1919, 1922, 1921, 1922, 1919, 1902, 1922, 1904, 1895,
                                  1917, 1920, 1895, 1922, 1896, 1895, 1922, 1895, 1917, 1917,
                                  1920, 1915, 1922, 1922]

pubyears_list = [pubyear_Frost, pubyear_Yeats, pubyear_Kipling] 

In [27]:
len(pubyear_Kipling)

450

In [11]:
# Create lists of titles, lines, and poets for each dataframe
titles_list = []
lines_list = []
poets_list = []

for df in poems_df1:
    titles = df["title"].values.tolist()
    lines = df["lines"].values.tolist()
    poets = df["poet"].values.tolist()
    
    titles_list.append(titles)
    lines_list.append(lines)
    poets_list.append(poets)

In [12]:
# Get length of the entire poem for each poem in each dataframe
lengths_list = []
for lines in lines_list:
    poem_length = [len(line.split()) for line in lines]
    lengths_list.append(poem_length)

In [13]:
# List of unique words; how many unique words per poem?
unique_words = [[list(set(line.split())) for line in lines_list[x]] for x in range(0, len(lines_list))]
uniqueLength_list = [[len(y) for y in unique_words[x]] for x in range(0, len(unique_words))]

In [14]:
# Lexical diversity: proportion of unique words among all the words in the poem
# NB: No filtering has been done to remove stop words, punctuations, etc.
lexical_diversity_list = []
for x in range(0, len(lengths_list)):
    lex_divs = []
    for i in range(0, len(lengths_list[x])):
        lex_div = round(uniqueLength_list[x][i]/lengths_list[x][i], 4)
        lex_divs.append(lex_div)
    lexical_diversity_list.append(lex_divs)

### Transform the data

In [15]:
# Dependencies
import re, string

import nltk
# nltk.download("punkt")
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#### Tokenise, Remove Stop Words, Lemmatise
Reference for lemmatisation: https://marcobonzanini.com/2015/01/26/stemming-lemmatisation-and-pos-tagging-with-python-and-nltk/

In [16]:
# Remove stop words from the list
stops = stopwords.words("english")
exclude = list(set(string.punctuation)) + ["’", "—", "‘"]

# Lemmatise the words in each list to retain their roots
lemmatiser = WordNetLemmatizer()

In [17]:
def tokeniser(poem):
    """ Processes the poem into tokens and removes stop words, numbers, and punctuations """
    words_list = []
    preprocessed_text = []
    
    words = word_tokenize(poem) # Create a list of words
    words2 = [word for word in words if word not in stops] # Filter the keywords
    words2 = [word for word in words2 if word not in exclude] # Filter out punctuations
    words3 = [lemmatiser.lemmatize(word, pos = "v") for word in words2] # Lemmatise each word
    words_list.append(words3) # Add the filtered list of words (representing each poem)
    words4 = " ".join(words3) # Convert the list of strings back to one string
    preprocessed_text.append(words4) # Add the string (representing each poem) to the list
    
    return words_list, preprocessed_text

In [18]:
# Tokenise each poem in each dataframe
tokenised_poems = [[tokeniser(poem) for poem in lines] for lines in lines_list]        

In [19]:
# Create lists of tokens and of filtered poems
df_tokens = []
df_filtered = []
for x in range(len(tokenised_poems)):
    token_lists = []
    filtered_lists = []
    
    for y in range (len(tokenised_poems[x])):
        token_list = tokenised_poems[x][y][0][0]
        token_lists.append(token_list)
        
        filtered_list = tokenised_poems[x][y][1][0]
        filtered_lists.append(filtered_list)
        
    df_tokens.append(token_lists)    
    df_filtered.append(filtered_lists) 

In [20]:
# Create a function that counts the number of words in each poem
def word_count(word_list):
    return len(word_list)

In [21]:
# Determine the length of each filtered poem
df_filtered_length = []

for df in df_tokens:
    lengths = []
    for poem in df:
        length = word_count(poem)
        lengths.append(length)
    df_filtered_length.append(lengths)    

In [23]:
# Add two new columns to each dataframe for tokens and filtered text
for x in range(len(poems_df1)):
    poems_df1[x]["tokens"] = df_tokens[x]
    poems_df1[x]["filteredPoem"] = df_filtered[x]
    poems_df1[x]["fullLength"] = lengths_list[x]
    poems_df1[x]["filteredLength"] = df_filtered_length[x]
    
# Preview the dataframe
poems_df1[0].head()

Unnamed: 0,title,poet,lines,tokens,filteredPoem,fullLength,filteredLength,publicationYear
0,October,Robert Frost,"o hushed october morning mild, thy leaves have...","[hush, october, morning, mild, thy, leave, rip...",hush october morning mild thy leave ripen fall...,128,80,1913
1,"‘Out, Out—’",Robert Frost,the buzz saw snarled and rattled in the yard a...,"[buzz, saw, snarl, rattle, yard, make, dust, d...",buzz saw snarl rattle yard make dust drop stov...,295,150,1916
2,Acquainted with the Night,Robert Frost,i have been one acquainted with the night. i h...,"[one, acquaint, night, walk, rain—and, back, r...",one acquaint night walk rain—and back rain out...,108,54,1928
3,After Apple-Picking,Robert Frost,my long two-pointed ladder's sticking through ...,"[long, two-pointed, ladder, 's, stick, tree, t...",long two-pointed ladder 's stick tree toward h...,283,142,1914
4,Birches,Robert Frost,when i see birches bend to left and right acro...,"[see, birch, bend, leave, right, across, line,...",see birch bend leave right across line straigh...,503,252,1916


In [None]:
# Number of poems
for x in range(len(poems_df1)):
    print (f"There are {poems_df1[x].shape[0]} poems written by {poet_list[x]} in the {x}th dataframe.")

In [None]:
# Longest and shortest poems
for x in range(len(poems_df1)):
    print(f"Author: {poet_list[x]}")
    for i in range(0, len(poems_df1[x])):
        if poems_df1[x]["fullLength"][i] == poems_df1[x]["fullLength"].max():
            print(f'Longest poem: {poems_df1[x]["title"][i]}; Full poem length: {poems_df1[x]["fullLength"][i]} words')
        if poems_df1[x]["fullLength"][i] == poems_df1[x]["fullLength"].min():
            print(f'Shortest poem: {poems_df1[x]["title"][i]}; Full poem length: {poems_df1[x]["fullLength"][i]} words')      
    print("=====\n")        

### Word importance
Source: https://stevenloria.com/tf-idf/

In [None]:
# Dependencies
import math
from textblob import TextBlob as tb

In [None]:
# Create a function that calculates term frequency
def tf(word, poem):
    return poem.words.count(word) / len(poem.words)

# Create a function that determines the number of documents that contain a certain word
def n_docs(word, poemlist):
    return sum(1 for poem in poemlist if word in poem.words)

# Create a function that determines the inverse document frequency (IDF)
# IDF = how common a word is among all the documents in poemlist
def idf(word, poemlist):
    return math.log(len(poemlist) / (1 + n_docs(word, poemlist)))

def tdidf(word, poem, poemlist):
    return tf(word, poem) * idf(word, poemlist)

In [None]:
# Create the poemlist from df["lines"]
poemlist = [tb(poem) for poem in df1["filteredPoem"]]
poemlist

In [None]:
# Create an empty list to be filled with text blobs from cleaning poemlist
poemlist2 = []

# Loop through the poemlist
for i in range(0, len(poemlist)):
    
    # Remove words that are shorter than 3 characters
    new_string = ' '.join([w for w in str(poemlist[i]).split() if len(w) > 3])
    
    # Replace emm dash with space
    new_string2 = new_string.replace("—", " ")
    
    # Convert string to text blob
    new_string2 = tb(new_string2)
    
    # Append the text blob to the list of text blobs
    poemlist2.append(new_string2)
    
poemlist2

In [None]:
# Calculate the most important words
impt_words = []
for i, poem in enumerate(poemlist2):
    scores = {word: tdidf(word, poem, poemlist2) for word in poem.words}
    sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse = True)
    
    for word, score in sorted_words[:5]:
        impt_words.append((i, word, round(score, 5)))

In [None]:
impt_words

In [None]:
# Create a dataframe of important words per poem
df2 = pd.DataFrame(impt_words, columns = ["PoemNo", "Word", "TF-IDF"])
df2["Poet"] = "Robert Frost"
df2.head()

In [None]:
# Save df2 as a sqlite database table (for Javascript use later)
conn = sqlite3.connect("db/Poetry.db")

# Create a database table from the dataframe
df2.to_sql("tfidf", conn, if_exists = "replace", index = False)

# Preview the database table
pd.read_sql_query("select * from tfidf;", conn).head()

In [None]:
conn.close()

In [None]:
# Add titles for each poem in df2
titles = []
for i in range(0, len(df)):
    for p in df2.PoemNo:
        if i == p:
            title = df["title"][i]
            titles.append(title) 

df2["PoemTitle"] = titles

# Preview
df2.head()

In [None]:
# Group the important words by poem title
df3 = pd.DataFrame(df2.groupby(["PoemTitle", "Word"])["TF-IDF"].mean())
df3

### Visualise the important words

In [None]:
# Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "whitegrid")
import numpy as np

from ipywidgets import widgets, interactive

In [None]:
# Create a widget containing poem titles (sorted alphabetically)
titles = list(df.title)
titles.sort()

poem_title = widgets.Dropdown(options = ["Choose a poem..."] + titles, value = "Choose a poem...", 
                              description = "Title:", disabled = False)

In [None]:
# Create a filter based on title
def plot_it(poem_title):
    if poem_title != "Choose a poem...":
        df3 = df2[df2["PoemTitle"] == poem_title]
        
        plt.figure(figsize = (10, 6))
        sns.set(font_scale = 1.5)
        graph = sns.barplot(y = "Word", x = "TF-IDF", data = df3, palette = "Blues_d")

In [None]:
# Plot the data by poem title
interactive(plot_it, poem_title = poem_title)

### Sentiment Analysis - Metadata

In [None]:
# Predict sentiments based on textblobs
sentiment_polarity = [round(poem.sentiment.polarity, 3) \
                      for poem in poemlist2]
sentiment_cat = ["positive" if sp > 0
                 else "negative" if sp < 0
                 else "neutral"
                 for sp in sentiment_polarity]

In [None]:
metadata = pd.DataFrame({"PoemNo": index,
                         "Poet": poets,
                         "Title": poems,
                         "Content": lines,
                         "Length": poem_length,
                         "Sentiment": sentiment_cat,
                         "Pubn_Year": pubyear,
                         "Lexical_Diversity": lex_div}, 
                        columns = ["PoemNo", "Poet", "Title", "Length", 
                                   "Content", "Sentiment", "Pubn_Year", "Lexical_Diversity"])
metadata.head()

In [None]:
# Save df2 as a sqlite database table (for Javascript use later)
conn = sqlite3.connect("db/Poetry.db")

# Create a database table from the dataframe
metadata.to_sql("metadata", conn, if_exists = "replace", index = False)

# Preview the database table
pd.read_sql_query("select * from metadata;", conn).head()

### Topic Modelling
Sources: 
1. https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/
2. https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [None]:
# Gensim 
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel

warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [None]:
tokens = df1["tokens"].tolist()

In [None]:
# Build the bigram and the trigram model
bigram = gensim.models.Phrases(tokens, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[tokens], threshold=100) 
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
def process_words(texts, stop_words = stops, allowed_postags = ["NOUN", "ADJ", "ADV"]):
    """ Remove stop words, create bigrams and trigrams, lemmatise """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]    
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    
    texts_out = []
    
    nlp = spacy.load("en", disable = ["parser", "ner"])
    
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        
        # remove stop words (again)         
        texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] \
                     for doc in texts_out]
        
        # remove words shorter than three letters       
        texts_out = [[word for word in lst if len(word) > 2] for lst in texts_out]

    return texts_out

In [None]:
filtered_text = process_words(tokens)

In [None]:
filtered_text

In [None]:
# Create dictionary
id2words = corpora.Dictionary(filtered_text)

# Create corpus term frequency (convert dictionary to bag-of-words)
corpus = [id2words.doc2bow(text) for text in filtered_text]

#### How many topics?

In [None]:
# Create a range of number of topics
num_topics = list(range(1, 21))
num_topics

In [None]:
# Define a function that calculates the coherence score 
def coherence_score(num_topics):
    """ Create a LDA model """
    lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                            id2word = id2words,
                                            num_topics = num_topics,
                                            random_state = 100,
                                            update_every = 1,
                                            chunksize = 100,
                                            passes = 20,
                                            alpha = "auto",
                                            per_word_topics = True)
    
    """ Calculate the coherence score """
    coherence_model_lda = CoherenceModel(model = lda_model, 
                                         texts = filtered_text, 
                                         dictionary = id2words,
                                         coherence = 'c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    
    return coherence_lda

In [None]:
# Calculate the coherence score of each number of topics
coh_score = [coherence_score(x) for x in num_topics]
coh_score

In [None]:
# Plot number of topics vs coherence score
# Find the highest coherence score before the trend flattens out
plt.plot(num_topics, coh_score, "bo-")
plt.xlabel("Number of topics")
plt.ylabel("Coherence score")

In [None]:
# Building the LDA model using the chosen number of topics
final_number = 6

lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                            id2word = id2words,
                                            num_topics = final_number,
                                            random_state = 100,
                                            update_every = 1,
                                            chunksize = 100,
                                            passes = 20,
                                            alpha = "auto",
                                            per_word_topics = True)

# Compute Perplexity
print(f"Perplexity: {lda_model.log_perplexity(corpus)}")

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model = lda_model, 
                                     texts = filtered_text, 
                                     dictionary = id2words, 
                                     coherence = 'c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

In [None]:
# Keywords for the top 10 topics
doc_lda = lda_model[corpus]
pprint(lda_model.print_topics())

### Most important words per topic

In [None]:
# Dependencies
import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Create graphs of most important words per topic
# Based on the LDA model

pyLDAvis.enable_notebook()
panel = pyLDAvis.gensim.prepare(lda_model, corpus, id2words)
panel

In [None]:
# Save the graph as a html page
pyLDAvis.save_html(panel, "lda.html")

### Dominant Topic in each poem

In [None]:
def format_topics_sentences(doc_lda, ldamodel = lda_model, texts = tokens):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(doc_lda):
        row = row_list[0] if ldamodel.per_word_topics else row_list 
        row = sorted(row, key = lambda x: (x[1]), reverse = True)
        
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), \
                                                                  topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(doc_lda, ldamodel = lda_model, texts = tokens)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

### Data Visualisation

In [None]:
# Length of text per tokenised poem
doc_lens = [len(d) for d in df_dominant_topic.Text]

plt.figure(figsize = (10,3), dpi = 160)
plt.hist(doc_lens, bins = 1000, color='navy')
plt.text(200, 1.75, "Mean   : " + str(round(np.mean(doc_lens))))
plt.text(200, 1.60, "Median : " + str(round(np.median(doc_lens))))
plt.text(200, 1.45, "Stdev   : " + str(round(np.std(doc_lens))))
plt.text(200, 1.30, "1%ile    : " + str(round(np.quantile(doc_lens, q = 0.01))))
plt.text(200, 1.15, "99%ile  : " + str(round(np.quantile(doc_lens, q = 0.99))))

plt.gca().set(xlim = (0, 300), ylabel = 'Number of Documents', xlabel = 'Document Word Count')
plt.tick_params(size = 8)
plt.xticks(np.linspace(0, 300, 9))
plt.title('Distribution of Document Word Counts', fontdict = dict(size = 10))
plt.show()

In [None]:
import seaborn as sns
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

fig, axes = plt.subplots(3,2, figsize = (16,14), dpi = 160, sharex = True, sharey = True)

for i, ax in enumerate(axes.flatten()):    
    df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i, :]
    doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, bins = 1000, color = cols[i])
    ax.tick_params(axis = 'y', labelcolor = cols[i], color = cols[i])
    sns.kdeplot(doc_lens, color = "black", shade = False, ax = ax.twinx())
    ax.set(xlim = (0, 300), xlabel = 'Document Word Count')
    ax.set_ylabel('Number of Documents', color = cols[i])
    ax.set_title('Topic: '+str(i), fontdict = dict(size = 8, color = cols[i]))

fig.tight_layout()
fig.subplots_adjust(top = 0.90)
plt.xticks(np.linspace(0, 300, 9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize = 12)
plt.show()

In [None]:
# Wordcloud of Top N words in each topic
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords = stops,
                  background_color = 'white',
                  width = 2500,
                  height = 1800,
                  max_words = 10,
                  colormap = 'tab10',
                  color_func = lambda *args, **kwargs: cols[i],
                  prefer_horizontal = 1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(3, 2, figsize = (10,10), sharex = True, sharey = True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size = 300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size = 16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace = 0, hspace = 0)
plt.axis('off')
plt.margins(x = 0, y = 0)
plt.tight_layout()
plt.show()

In [None]:
from collections import Counter
topics = lda_model.show_topics(formatted = False)
data_flat = [w for w_list in filtered_text for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(3, 2, figsize = (16,10), sharey = True, dpi = 160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    ax.bar(x = 'word', 
           height = "word_count", 
           data = df.loc[df.topic_id == i, :], 
           color = cols[i], 
           width = 0.5, 
           alpha = 0.3, 
           label = 'Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x = 'word', 
                height = "importance", 
                data = df.loc[df.topic_id == i, :], 
                color = cols[i], width = 0.2,
                label ='Weights')
    ax.set_ylabel('Word Count', color = cols[i])
    ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 50)
    ax.set_title('Topic: ' + str(i), color = cols[i], fontsize = 16)
    ax.tick_params(axis = 'y', left = False)
    ax.set_xticklabels(df.loc[df.topic_id == i, 'word'], 
                       rotation = 30, 
                       horizontalalignment = 'right')
    ax.legend(loc ='upper left')
    ax_twin.legend(loc = 'upper right')

fig.tight_layout(w_pad = 2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize = 14, y = 1.05)    
plt.show()

In [None]:
# Word colouring of N poems
from matplotlib.patches import Rectangle

def sentences_chart(lda_model = lda_model, corpus = corpus, start = 0, end = 38):
    corp = corpus[start:end]
    mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]

    fig, axes = plt.subplots(end-start, 1, figsize=(20, (end-start)*0.95), dpi=160)       
    axes[0].axis('off')
    for i, ax in enumerate(axes):
        if i > 0:
            corp_cur = corp[i-1] 
            topic_percs, wordid_topics, wordid_phivalues = lda_model[corp_cur]
            word_dominanttopic = [(lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics]    
            ax.text(0.01, 0.5, "Doc " + str(i-1) + ": ", verticalalignment = 'center',
                    fontsize = 16, color = 'black', transform = ax.transAxes, fontweight = 700)

            # Draw Rectangle
            topic_percs_sorted = sorted(topic_percs, key = lambda x: (x[1]), reverse = True)
            ax.add_patch(Rectangle((0.0, 0.05), 0.99, 0.90, fill = None, alpha = 1, 
                                   color = mycolors[topic_percs_sorted[0][0]], linewidth = 2))

            word_pos = 0.06
            for j, (word, topics) in enumerate(word_dominanttopic):
                if j < 14:
                    ax.text(word_pos, 0.5, word,
                            horizontalalignment = 'left',
                            verticalalignment = 'center',
                            fontsize = 16, 
                            color = mycolors[topics],
                            transform = ax.transAxes, 
                            fontweight = 700)
                    word_pos += .009 * len(word)  # to move the word for the next iter
                    ax.axis('off')
            ax.text(word_pos, 0.5, '. . .',
                    horizontalalignment = 'left',
                    verticalalignment = 'center',
                    fontsize = 16, 
                    color = 'black',
                    transform = ax.transAxes)       

    plt.subplots_adjust(wspace = 0, hspace = 0)
    plt.suptitle('Topic Coloring for Poems: ' + str(start) + ' to ' + str(end-2), \
                 fontsize = 14, y = 0.95, fontweight = 700)
    plt.tight_layout()
    plt.show()

sentences_chart() 

In [None]:
# Define a function that identifies and quantifies the dominant topics
def topics_per_document(model, corpus, start = 0, end = 1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse = True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    
    return(dominant_topics, topic_percentages)

In [None]:
dominant_topics, topic_percentages = topics_per_document(model = lda_model, corpus = corpus, end = -1)

In [None]:
# Distribution of Dominant Topics in Each Document
df = pd.DataFrame(dominant_topics, columns = ['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name = 'count').reset_index()

# Total Topic Distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name = 'count').reset_index()

# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda_model.show_topics(formatted = False) 
                                 for j, (topic, wt) in enumerate(topics) if j < 3]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns = ['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level =0,inplace = True)

In [None]:
from matplotlib.ticker import FuncFormatter

# Plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (10, 4), dpi = 120, sharex = True)

# Topic Distribution by Dominant Topics
ax1.bar(x = 'Dominant_Topic', 
        height = 'count', 
        data = df_dominant_topic_in_each_doc, 
        width = .5, color = 'firebrick')
ax1.set_xticks(range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__()))
tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + \
                               df_top3words.loc[df_top3words.topic_id == x, 'words'].values[0])
ax1.xaxis.set_major_formatter(tick_formatter)
ax1.set_title('Number of Documents by Dominant Topic', fontdict=dict(size = 10))
ax1.set_ylabel('Number of Documents')
ax1.set_ylim(0, 10)

# Topic Distribution by Topic Weights
ax2.bar(x = 'index', 
        height = 'count', 
        data = df_topic_weightage_by_doc, 
        width = .5, color = 'steelblue')
ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.set_title('Number of Documents by Topic Weightage', fontdict = dict(size = 10))

plt.show()

### Create a keyword network based on term frequency and TF-IDF
(use the "to_gephi.csv" and "to_gephi2.csv" files in Gephi for visualisation)
source: https://pythondata.com/text-analytics-visualization/

In [None]:
# Define a function that extracts the most common words per poem
def get_keywords(token_list, num):
    return Counter(token_list).most_common(num)

In [None]:
# Convert the filtered poems into strings
poemlist3 = [str(poem) for poem in poemlist2]
token_list = [word_tokenize(poem) for poem in poemlist3]

In [None]:
# titles = df1["title"].values.tolist()

df4 = pd.DataFrame({"title": poems, 
                    "poet": poets,
                    "filteredPoem": poemlist3})
df4.head()

In [None]:
# Use the function to extract the top 5 words per poem
keywords = [get_keywords(tokens, 5) for tokens in token_list]

# Extract the list of keywords 
unzipped = [zip(*kw)for kw in keywords]
kw = [list(x)[0] for x in unzipped]

# Convert the list of keywords to a string
kw2 = [",".join(str(y) for y in x) for x in kw]

# Add the list of keywords to the dataframe
df4["keywords_TF"] = kw2
df4.head()

In [None]:
# Add keywords based on TF-IDF
impt_words2 = df3.reset_index().groupby("PoemTitle")["Word"].apply(list)
df4["keywords_TF-IDF"] = [",".join(str(y) for y in x) for x in impt_words2]
df4.head()

In [None]:
# Dataframe of keywords according to term frequency
keywordsTF = []
for i, r in df4.iterrows():
    keywords = r["keywords_TF"].split(",")
    for kw in keywords:
        keywordsTF.append((kw.strip(""), r["keywords_TF"]))
kwTF_df = pd.DataFrame(keywordsTF).rename(columns = {0: "keyword", 1: "keywords"})
kwTF_df.head()

In [None]:
# Dataframe of keywords according to TF-IDF
keywordsTFIDF = []
for i, r in df4.iterrows():
    keywords = r["keywords_TF-IDF"].split(",")
    for kw in keywords:
        keywordsTFIDF.append((kw.strip(""), r["keywords_TF-IDF"]))
kwTFIDF_df = pd.DataFrame(keywordsTFIDF).rename(columns = {0: "keyword", 1: "keywords"})
kwTFIDF_df.head()

#### TF

In [None]:
# Convert rows to lists
docsTF = kwTF_df["keywords"].tolist()
namesTF = kwTF_df["keyword"].tolist()

docs_list = [i.split(",")for i in docsTF]

In [None]:
# Create an ordered dictionary of keyword and frequency of co-occurrence
from collections import OrderedDict
occurrences = OrderedDict((name, OrderedDict((name, 0) for name in namesTF)) for name in namesTF)

for i in docs_list:
    for x in range(len(i)):
        for item in i[:x] + i[x + 1:]:
            occurrences[i[x]][item] += 1

# Create a dataframe of co-occurrences
co_occur_df = pd.DataFrame.from_dict(occurrences)         
co_occur_df.head()

In [None]:
co_occur_df.to_csv("to_gephi.csv", sep = ",")

#### TF-IDF

In [None]:
# Convert rows to lists
docsTFIDF = kwTFIDF_df["keywords"].tolist()
namesTFIDF = kwTFIDF_df["keyword"].tolist()

docs_list = [i.split(",")for i in docsTFIDF]

In [None]:
# Create an ordered dictionary of keyword and frequency of co-occurrence
from collections import OrderedDict
occurrences2 = OrderedDict((name, OrderedDict((name, 0) for name in namesTFIDF)) for name in namesTFIDF)

for i in docs_list:
    for x in range(len(i)):
        for item in i[:x] + i[x + 1:]:
            occurrences2[i[x]][item] += 1

# Create a dataframe of co-occurrences
co_occur_df2 = pd.DataFrame.from_dict(occurrences2)         
co_occur_df2.head()

In [None]:
co_occur_df2.to_csv("to_gephi2.csv", sep = ",")