In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import requests
from bs4 import BeautifulSoup

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def get_scripts(show,start=0,limit=None):
    url = f'https://www.springfieldspringfield.co.uk/episode_scripts.php?tv-show={show}'
    
    page = requests.get(url)
    print(page)
    bs = BeautifulSoup(page.content)
    
    episode_titles_bs = bs.find_all(attrs={'class':'season-episode-title'})
    
    hrefs = ['https://www.springfieldspringfield.co.uk/'+x['href'] for x in episode_titles_bs]
    
    if limit:
        scripts = []
        failed_hrefs = []
        for url in hrefs[start:start+limit]:
            try:
                page = requests.get(url)
                #print(page)
                bs = BeautifulSoup(page.content)

                scripts.append(bs.find(attrs={'class':'scrolling-script-container'}))
            except:
                failed_hrefs.append(url)
    else:
        scripts = []
        failed_hrefs = []
        for url in hrefs:
            try:
                page = requests.get(url)
                #print(page)
                bs = BeautifulSoup(page.content)

                scripts.append(bs.find(attrs={'class':'scrolling-script-container'}))
            except:
                failed_hrefs.append(url)
        
        
    print(failed_hrefs)
    scripts = [x.get_text() for x in scripts]
    if scripts==None:
        return 'Failure'
    else:
        scripts = [x.replace('\r',' ').replace('\n', ' ').replace('\t', ' ').strip() for x in scripts]
    
    with open(f'{show}.txt', 'w') as filehandle:
        for listitem in scripts:
            filehandle.write('%s\n' % listitem)
    
        return scripts

In [3]:
scripts = get_scripts('cheers')
type(scripts)

<Response [200]>
[]


list

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:
data_words = [x for x in list(sent_to_words(scripts))]

In [13]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['PROPN'])

In [14]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [15]:
# Define Search Param
search_params = {'n_components': [5, 6, 7], 
                 'learning_decay': [.7, .8, .9],
                 'max_iter':[10,20,30],
                 'n_jobs':[-1],}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [16]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'max_iter': 30, 'n_components': 5, 'n_jobs': -1}
Best Log Likelihood Score:  -35262.595759908356
Model Perplexity:  40.65193140401971


In [17]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(scripts))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0,0.0,0.0,0.99,0.0,3
Doc1,0,0.0,0.0,0.99,0.0,3
Doc2,0,0.0,0.0,0.99,0.0,3
Doc3,0,0.0,0.36,0.63,0.0,3
Doc4,0,0.0,0.0,0.99,0.0,3
Doc5,0,0.41,0.0,0.58,0.0,3
Doc6,0,0.0,0.18,0.25,0.57,4
Doc7,0,0.0,0.39,0.61,0.0,3
Doc8,0,0.0,0.08,0.91,0.0,3
Doc9,0,0.0,0.0,0.99,0.0,3


In [18]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,145
1,1,56
2,4,24
3,0,24
4,2,15


In [19]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
