# Topic modeling with OpenStax text

OpenStax textbooks have structured title, learning objctives, indexed terms (created by humans). Thus, they provide a way to check how different topic-modeling NLP techniques perform.

Once we are able to develop good models based on these, we can test them on the MITx content.

In [1]:
import os, re, json, time
import nltk
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

#gensim
import gensim
from gensim import corpora, similarities
from gensim.utils import simple_preprocess
from gensim.models import LdaModel, CoherenceModel, phrases

# spacy for lemmatization
#import spacy

#beautifulsoup
from bs4 import BeautifulSoup

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# NLTK stuff
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
custom_stop_words = ['km', 'cm', 'kg', 'meter', 'must', 'would', 'problem', 'explain', 'solve', 'used', 'use',
                    'using']
stop_words.extend(custom_stop_words)
stop_words = list(set(stop_words))

### Parameters + cleaning

Put the parameters for home directory and where the cleaned up text files should go

In [3]:
#html file path and clean text output path:
home_dir = 'openstax'
inp_path = 'univ_phys_vol1'
oup_path = 'clean_texts'

'''
#Get all the directory names:
dir_list_ = os.listdir('{}/{}'.format(home_dir, inp_path))

#Get only directories -- for example, remove cases like '.DS_store'
dir_list1 = [dir_ for dir_ in dir_list_ if os.path.isdir('{}/{}/{}'.format(home_dir, inp_path, dir_)) ]

dir_list = [ dir_ for dir_ in dir_list1 if len(dir_.split('.'))==1 ]
len(dir_list)
#file_ = '{}m58375/index.cnxml.html'.format(path_)

#the dataframe definitions:
df_cols = ['dir_','article_id', 'title', 'lrn_obj', 'index_terms', 'text']
df_rows = []


for dir_ in dir_list:
    #print(dir_)
    #specify file to read
    file_ = '{}/{}/{}/index.cnxml.html'.format(home_dir, inp_path, dir_)
    
    #Get the 'soup'
    soup = BeautifulSoup(open(file_), "html.parser")

    #get article title, create id etc.
    article_id = soup.find_all('title')[0].text.lower().replace(' ', '_')
    title = soup.find_all('title')[0].text.lower()
    #print(article_id, title)    
    
    #Get learning objectives:
    abstract = soup.find_all('div', {'data-type' : 'abstract'})
    try:
        los = [elem.text for elem in abstract[0].find_all('li')]
        #print(los)
    except Exception as e:
        los = []

    #get index terms:
    terms = [term.text for term in soup.find_all('span', {'data-type' : 'term'}) ]    
    
    #get all text:
    #all_text = [ p_.text for p_ in soup.find_all('p') ]
    #alltext = ' '.join(all_text[:]).replace('[link]','')
    #alltext = soup.text.replace('[link]','')
    #alltext = alltext.replace('\n', ' ')
    omit_text = [elem.text for elem in soup.find_all(['mi','a', 'mn', 'mtext', 'mo'])]
    omit_text.append('\n')
    alltext = ' '.join([elem for elem in soup.find_all(text=True) if elem not in omit_text ][:])
    alltext = re.sub("\\d", " ", alltext)
   

    #Exclue 'introduction' chapters, create the row for the dataframe, and output just the text:
    with open('{}/{}/{}.txt'.format(home_dir, oup_path, dir_),'w+') as ofile:
        ofile.write(alltext)
        
    if title!='introduction':
        df_rows.append([dir_, article_id, title, los, terms, alltext])
    

#create the dataframe:
corp_df = pd.DataFrame.from_records(df_rows, columns=df_cols)

#Write to the csv file:
corp_df.to_csv('{}/phys_text_df.csv'.format(home_dir), index=None)
'''
#Read from the csv file:
corp_df = pd.read_csv('{}/phys_text_df.csv'.format(home_dir))
corp_df.tail()


Unnamed: 0,dir_,article_id,title,lrn_obj,index_terms,text
102,m58310,power,power,['Relate the work done during a time interval ...,"['average power', 'instantaneous power', 'power']","Power Power By the end of this section, you wi..."
103,m58343,elasticity_and_plasticity,elasticity and plasticity,['Explain the limit where a deformation of mat...,"['elastic', 'elastic limit', 'proportionality'...",Elasticity and Plasticity Elasticity and Plast...
104,m58375,sound_waves,sound waves,['Explain the difference between sound and hea...,"['sound', 'Hearing', 'compressions', 'rarefact...",Sound Waves Sound Waves By the end of this sec...
105,m58381,the_doppler_effect,the doppler effect,['Explain the change in observed frequency as ...,"['Doppler effect', 'Doppler shift', 'red shift']",The Doppler Effect The Doppler Effect By the e...
106,m58372,interference_of_waves,interference of waves,['Explain how mechanical waves are reflected a...,"['fixed boundary condition', 'free boundary co...",Interference of Waves Interference of Waves By...


### Get a list of all index terms

In [4]:
#Get all index terms
all_terms_list = corp_df.index_terms.tolist()
all_terms = []

for item in all_terms_list:
    all_terms.extend(item)
    
#all_terms

## Text pre-processing:

This would require us to 
1. tokenize
2. remove stop words
3. lemmatize

In the second iteration, we'll also use ngrams

### Create tokens to train the LDA model:

* We try bigrams, trigrams, and optionally, single-word tokens. Including single words seems to weaken the model. But by excluding them we are leaving out a lot of important keywords.

* It also seems important to take the token frequency into account...

In [5]:
#tokenize alt:

def tokenize_with_ngrams(text):
    #create ngrams
    bigrams = [ item[0]+'_'+item[1] for item in ngrams(text,2)]
    #bigrams

    trigrams = [ item[0]+'_'+item[1]+'_'+item[2] for item in ngrams(text,3)]
    #trigrams

    
    fdist2 = nltk.FreqDist(bigrams)
    fdist3 = nltk.FreqDist(trigrams)  
    
    all_ngrams = []    
    
    all_ngrams.extend([ elem for elem in fdist3.most_common() if elem[1]>2 ]) #3grams    
    all_ngrams.extend([ elem for elem in fdist2.most_common() if elem[1]>2 ]) #2grams        
    
    #print(text)
    #Now try replacing the text with the substrings:
    
    joined_text = '_'.join(text[:])
    #joined_text = ' '.join(text[:])
    
    corp_ng = []
    for elem in all_ngrams: #starting with the 4grams, work down.max()
        #joined_text = joined_text.replace(' '.join(elem[0].split('_')), '') #modify the 
        #text to remove the ngrams

        freq = joined_text.count(elem[0]) #count how many occurrences of the ngram exist,
        if freq > 0:
            corp_ng.append((elem[0], freq)) #count how many times replaced
            joined_text = joined_text.replace('_'+elem[0], '') #modify the text to remove the ngrams

    #Find max_freq of all ngrams:
    #print(all_ngrams)
    if len(all_ngrams)>0:
        max_freq = np.array([elem[1] for elem in all_ngrams]).max()
    else:
        max_freq = 0
    #print('max_freq', max_freq)
    #get the tokes from ngrams:
    tokens = []
    for item in corp_ng:
        tokens.extend(item[1]*[item[0]])
        
    ####################################################    
    #Uncomment the section below to include single words

    #'''
    single_words_ = joined_text.split('_') #Finally, split the remaining text at '_' to get individual words
    fdist1 = nltk.FreqDist(single_words_)
    for elem in fdist1.most_common():
        if elem[1]>5:
            corp_ng.append(elem)
            mul_fac = max_freq if elem[1] > max_freq else elem[1]
            tokens.extend(mul_fac*[elem[0]])
            
    #''' 
    ####################################################    
    
    #if considering only unique tokens
    return list(set(tokens))# corp_ng, joined_text

    #if considering token frequency
    #return tokens#, corp_ng, joined_text


def lemmatize_input(text):
    return [ lem.lemmatize(w) for w in text ] #This is just using NLTK lemamtization: change to spacy later?


def remove_stopwords(text):
    return [ word for word in text if word not in stop_words ]


def tokenize_input(text):
    return gensim.utils.simple_preprocess(str(text), deacc=True) #tokenize and remove punctuations:


def all_preproc(text):
    return tokenize_with_ngrams(lemmatize_input(remove_stopwords(tokenize_input(text))))

In [6]:
corp_df['tokenized'] = corp_df['text'].apply(all_preproc)# + corp_df['text'].apply(all_preproc)
'''
corp_df['token_gram432'] = corp_df.apply(lambda row: row['tokenized'][0], axis=1) #only n-gram tokens
corp_df['token_gram1'] = corp_df.apply(lambda row: row['tokenized'][1], axis=1) #only single-word tokens
corp_df['token_all'] = corp_df.apply(lambda row: row['token_gram432'] + row['token_gram1'], axis=1) #all tokens as a single list
'''
corp_df.tail()

Unnamed: 0,dir_,article_id,title,lrn_obj,index_terms,text,tokenized
102,m58310,power,power,['Relate the work done during a time interval ...,"['average power', 'instantaneous power', 'power']","Power Power By the end of this section, you wi...","[one_known_constant, energy, constant_speed, k..."
103,m58343,elasticity_and_plasticity,elasticity and plasticity,['Explain the limit where a deformation of mat...,"['elastic', 'elastic limit', 'proportionality'...",Elasticity and Plasticity Elasticity and Plast...,"[coefficient_static_friction, cross_sectional_..."
104,m58375,sound_waves,sound waves,['Explain the difference between sound and hea...,"['sound', 'Hearing', 'compressions', 'rarefact...",Sound Waves Sound Waves By the end of this sec...,"[air_molecule_oscillate, produce_sound_wave, e..."
105,m58381,the_doppler_effect,the doppler effect,['Explain the change in observed frequency as ...,"['Doppler effect', 'Doppler shift', 'red shift']",The Doppler Effect The Doppler Effect By the e...,"[toward_source, constant_speed, source_sends_s..."
106,m58372,interference_of_waves,interference of waves,['Explain how mechanical waves are reflected a...,"['fixed boundary condition', 'free boundary co...",Interference of Waves Interference of Waves By...,"[phase_shift, incident_wave, two_wave, cannot_..."


In [28]:
#Look at the individual words
#corp_df.iat[103,9]
corp_df['tokenized'][0]#[1000:]

['constant',
 'axis_rotation',
 'vector_sum',
 'individual_angular_momentum',
 'make',
 'derivative_angular_momentum',
 'meteor',
 'instant',
 'angular_momentum_rigid',
 'magnitude_angular_momentum',
 'point_particle',
 'take_time',
 'vector_notation',
 'force',
 'direction_angular_momentum',
 'angular_velocity',
 'particle_position',
 'angular_momentum_vector',
 'cross_product',
 'rate_change',
 'perpendicular_plane',
 'since',
 'point',
 'momentum_vector',
 'lever_arm',
 'straight_line',
 'robot_arm_mar',
 'particle',
 'position_vector',
 'angular_momentum_particle',
 'meteor_origin',
 'radius_vector',
 'mar_rock',
 'shown',
 'arm',
 'along_path',
 'chosen_origin',
 'angular_momentum_mass',
 'system',
 'forceps',
 'arm_rotating',
 'origin',
 'radius',
 'along_axis',
 'angular_momentum_zero',
 'angular_momentum',
 'expression_angular',
 'origin_particle',
 'torque_particle',
 'particle_particle',
 'angular_momentum_robot',
 'particle_designated_origin',
 'three_particle',
 'time_deriv

### Get word distribution

In [9]:
all_tokens_list = corp_df.tokenized.tolist()
all_words = []
for tokens in all_tokens_list:
    all_words.extend(tokens)

len(set(all_words)) #11968 unique "words", 134971 total "words" (includes ngrams)
fdist = nltk.FreqDist(all_words)


### Train the LDA model

In [34]:
#Other stuff not used now...
#for item in fdist:
#    print(item, fdist[item])
#top_words = fdist.most_common() #this would include all words distribution:
#top_words[:100]
#dictionary = corpora.Dictionary(corp_df['tokenize'])

In [29]:
def train_lda(df, num_topics=20):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize
    """
    num_topics = num_topics
    chunksize = 300
    qty = 'tokenized'
    dictionary = corpora.Dictionary(df[qty])
    corpus = [dictionary.doc2bow(doc) for doc in df[qty]]
    
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha='auto', eta='auto', chunksize=chunksize, minimum_probability=0.01, passes=10, 
                  random_state=1)
    t2 = time.time()
    #print("Time to train LDA model on ", len(df), "articles: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

#### CAUTION

The cell below takes ~10 minutes to run.

In [11]:
'''
lda_metrics  = []
lda_cols = ['num_topics', 'perplexity_score', 'coherence_score']
for i in range(0, 9):
    num_ = 10+i*5
    dictionary,corpus,lda_model = train_lda(corp_df, num_topics=num_)
    # Compute Perplexity
    #print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 
    #a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=corp_df[qty], 
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    #print('\nCoherence Score: ', coherence_lda)
    lda_metrics.append([num_, lda_model.log_perplexity(corpus), coherence_lda])
    
lda_metrics_df = pd.DataFrame.from_records(lda_metrics, columns=lda_cols)
lda_metrics_df
'''

"\nlda_metrics  = []\nlda_cols = ['num_topics', 'perplexity_score', 'coherence_score']\nfor i in range(0, 9):\n    num_ = 10+i*5\n    dictionary,corpus,lda_model = train_lda(corp_df, num_topics=num_)\n    # Compute Perplexity\n    #print('\nPerplexity: ', lda_model.log_perplexity(corpus)) \n    #a measure of how good the model is. lower the better.\n\n    # Compute Coherence Score\n    coherence_model_lda = CoherenceModel(model=lda_model, texts=corp_df[qty], \n                                         dictionary=dictionary, coherence='c_v')\n    coherence_lda = coherence_model_lda.get_coherence()\n    #print('\nCoherence Score: ', coherence_lda)\n    lda_metrics.append([num_, lda_model.log_perplexity(corpus), coherence_lda])\n    \nlda_metrics_df = pd.DataFrame.from_records(lda_metrics, columns=lda_cols)\nlda_metrics_df\n"

In [30]:
#Check with 50 topics:

num_ = 50
dictionary,corpus,lda_model = train_lda(corp_df, num_topics=num_)
# Compute Perplexity
#print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 
#a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corp_df['tokenized'], 
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)
#lda_metrics.append([num_, lda_model.log_perplexity(corpus), coherence_lda])


In [21]:
'''
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corp_df['tokenized'], 
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
'''

"\n# Compute Perplexity\nprint('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.\n\n# Compute Coherence Score\ncoherence_model_lda = CoherenceModel(model=lda_model, texts=corp_df['tokenized'], \n                                     dictionary=dictionary, coherence='c_v')\ncoherence_lda = coherence_model_lda.get_coherence()\nprint('\nCoherence Score: ', coherence_lda)\n"

In [31]:
print(num_, lda_model.log_perplexity(corpus), coherence_lda)

50 -9.158577383450593 0.49671109497918503


In [32]:
# show_topics method shows the the top num_words contributing to num_topics number of random topics
#lda.show_topics()
#for item in lda[corpus]:
#    print(item)
lda_model.show_topics()

[(15,
  '0.000*"two" + 0.000*"force" + 0.000*"example" + 0.000*"object" + 0.000*"weight" + 0.000*"acting_force" + 0.000*"one" + 0.000*"si_unit" + 0.000*"tension" + 0.000*"long"'),
 (38,
  '0.000*"car" + 0.000*"velocity" + 0.000*"speed" + 0.000*"first" + 0.000*"system" + 0.000*"solution" + 0.000*"moving" + 0.000*"momentum_kinetic_energy" + 0.000*"perfectly_inelastic" + 0.000*"strategy"'),
 (2,
  '0.000*"right" + 0.000*"two" + 0.000*"given" + 0.000*"point" + 0.000*"direction" + 0.000*"straight" + 0.000*"angle" + 0.000*"vector_three" + 0.000*"component_along" + 0.000*"unit_vector_direction"'),
 (28,
  '0.000*"two" + 0.000*"force" + 0.000*"point" + 0.000*"equation" + 0.000*"distance" + 0.000*"given" + 0.000*"one" + 0.000*"called" + 0.000*"move" + 0.000*"mass"'),
 (12,
  '0.000*"two" + 0.000*"equal" + 0.000*"time" + 0.000*"equation" + 0.000*"speed" + 0.000*"velocity" + 0.000*"constant" + 0.000*"height" + 0.000*"positive_direction" + 0.000*"along"'),
 (37,
  '0.010*"force" + 0.009*"spring" +

In [228]:
#print(num_, lda_model.log_perplexity(corpus), coherence_lda)

In [33]:
#lda_model.get_topics()
#print(lda_model.get_topic_terms(13, topn=10))
#word_id = 2475
topic_rows = []
topic_cols = ['topic_num']
topic_cols.extend(['word_'+str(j) for j in range(1,11)])
for i in range (0, 50):
    row = [i]
    for item in lda_model.get_topic_terms(i, topn=10):
        if item[1] >= 0.001:
            row.append(dictionary[item[0]])
            
        else:
            row.append('')
        
    topic_rows.append(row)
#print(word_id, dictionary[word_id])

topic_df = pd.DataFrame.from_records(topic_rows, columns=topic_cols)

topic_df


Unnamed: 0,topic_num,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10
0,0,check_understanding,change,conservation_energy,strategy,calculate,gravitational_force,lowest_point,bar_graph,non_conservative,form
1,1,distance,angle,rest,weight,force_mass,body,acceleration,find,object,mass
2,2,,,,,,,,,,
3,3,final_velocity,displacement,initial_velocity,kinematic_equation,solution,constant_velocity,zero,find,object,acceleration
4,4,,,,,,,,,,
5,5,speed,velocity,motion,two,moving,system,car,horizontal,mass,different
6,6,mass,disk,radius,hinge,axis_rotation,torque,axis,pivot_point,lever_arm,net_torque
7,7,force,mass,body,example,force_due_pressure,come,order_magnitude_round,physic_also,theory,model_theory_law
8,8,kinetic_energy,speed,potential_energy,energy,velocity,change,time,much,work_done,height
9,9,time,object,example,two,one,energy,amplitude,motion,constant,point


In [300]:
for item in lda_model.get_topic_terms(25, topn=20):
    print(dictionary[item[0]], item[1])

moment_inertia 0.06209502
rotational_kinetic_energy 0.037946682
angular_velocity 0.036025878
angular_momentum 0.02350095
kinetic_energy 0.020178655
axis_rotation 0.018705424
semi_major_axis 0.01777207
mass_radius 0.016993541
work_energy_theorem 0.013129778
center_mass 0.011385562
merry_go_round 0.01119167
rigid_body 0.011134104
parallel_axis_theorem 0.010542738
fixed_axis 0.0094315
find_moment_inertia 0.008929005
rotation_rate 0.008905958
rotating_rigid_body 0.008894942
rotational_translational 0.008879641
total_energy 0.008141228
circular_orbit 0.008135411


### How does the existing docs look like given these topics?

In [301]:
for i in range (0, corp_df.shape[0]):
    print('doc_{}'.format(i), lda_model.get_document_topics(corpus[i]))

doc_0 [(22, 0.99512297)]
doc_1 [(3, 0.99586076)]
doc_2 [(37, 0.9958766)]
doc_3 [(25, 0.99519163)]
doc_4 [(30, 0.8846727), (41, 0.068897896), (42, 0.043882944)]
doc_5 [(13, 0.99518085)]
doc_6 [(13, 0.9976297)]
doc_7 [(36, 0.99101627)]
doc_8 [(39, 0.9973369)]
doc_9 [(48, 0.99603665)]
doc_10 [(12, 0.99753344)]
doc_11 [(25, 0.9972491)]
doc_12 [(44, 0.99721813)]
doc_13 [(22, 0.9940072)]
doc_14 [(17, 0.9872598)]
doc_15 [(15, 0.9957393)]
doc_16 [(29, 0.9910086)]
doc_17 [(7, 0.9978736)]
doc_18 [(11, 0.994216)]
doc_19 [(4, 0.99567676)]
doc_20 [(0, 0.01978143), (1, 0.020933812), (2, 0.019241286), (3, 0.02094917), (4, 0.021934604), (5, 0.01892365), (6, 0.020615412), (7, 0.020855892), (8, 0.018214209), (9, 0.019565634), (10, 0.019618971), (11, 0.018993156), (12, 0.019755173), (13, 0.021396695), (14, 0.02117309), (15, 0.019620413), (16, 0.017440025), (17, 0.019821964), (18, 0.02103859), (19, 0.019595118), (20, 0.019006072), (21, 0.018755626), (22, 0.022173626), (23, 0.020131502), (24, 0.019200874),

### Check with other similar texts:


In [308]:
text_check_old = 'the wave speed is higher in the string with the lower linear mass density.\
Superposition and Interference Most waves do not look very simple. Complex waves are more\
interesting, even beautiful, but they look formidable. Most interesting mechanical waves \
consist of a combination of two or more traveling waves propagating in the same medium. \
The principle of superposition can be used to analyze the combination of waves. Consider \
two simple pulses of the same amplitude moving toward one another in the same medium, as shown in  . \
Eventually, the waves overlap, producing a wave that has twice the amplitude, and then continue on \
unaffected by the encounter. The pulses are said to interfere, and this phenomenon is known as  interference \
Two pulses moving toward one another experience interference. The term interference refers to what \
happens when two waves overlap. To analyze the interference of two or more waves, we use the principle of \
superposition. For mechanical waves, the principle of  superposition  states that if two or more traveling \
waves combine at the same point, the resulting position of the mass element of the medium, at that point, \
is the algebraic sum of the position due to the individual waves. This property is exhibited by many waves \
observed, such as waves on a string, sound waves, and surface water waves. Electromagnetic waves also obey the \
superposition principle, but the electric and magnetic fields of the combined wave are added instead of \
the displacement of the medium. Waves that obey the superposition principle are linear waves; waves that \
do not obey the superposition principle are said to be nonlinear waves. In this chapter, we deal with \
linear waves, in particular, sinusoidal waves. The superposition principle can be understood by \
considering the linear wave equation. In  , we defined a linear wave as a wave whose mathematical \
representation obeys the linear wave equation. For a transverse wave on a string with an elastic \
restoring force, the linear wave equation is Any wave function   where the argument of the \
function is linear   is a solution to the linear wave equation and is a linear wave function. \
If wave functions   and   are solutions to the linear wave equation, the sum of the two functions   \
is also a solution to the linear wave equation. Mechanical waves that obey superposition are normally \
restricted to waves with amplitudes that are small with respect to their wavelengths. If the amplitude \
is too large, the medium is distorted past the region where the restoring force of the medium is linear. \
Waves can interfere constructively or destructively.   shows two identical sinusoidal waves that \
arrive at the same point exactly in phase.  (a) and (b) show the two individual waves,  (c) \
shows the resultant wave that results from the algebraic sum of the two linear waves. The crests \
of the two waves are precisely aligned, as are the troughs. This superposition produces  \
constructive interference . Because the disturbances add, constructive interference produces a wave \
that has twice the amplitude of the individual waves, but has the same wavelength.  shows two \
identical waves that arrive exactly   out of phase, producing  destructive interference'

text_check = 'Because we defined force in terms of change in motion, the Second Law appears to be\
a  restatement  of  this  definition,  and  devoid  of  predictive  power  since  force  is  only determined\
by  measuring  acceleration.  What  transforms  the  Second  Law  from  just  a definition   is   the\
additional   input   that   comes   from force   laws that   are   based on experimental observations on\
the interactions between bodies. Throughout this book, we shall  investigate  these  force  laws  and  learn\
to  use  them  in  order  to  determine  the forces and  accelerations  acting  on  a body (left-hand-side\
of  Newton’s  Second  Law). When  a physical body is constrained to move along a surface, or inside\
a container (for example gas  molecules  in  a  container),  there  are constraint  forces that  are\
not  determined beforehand by any force law but are only determined by their effect on the\
motion of the body. For any given constrained motion, these constraint forces are unknown and\
must be determined  by  the  particular  motion  of  the  body  that we  are  studying,  for\
example the contact force of the surface on the body, or the force of the wall on the gas particles'

text_check2 = 'After a bus or train starts, the acceleration is often so small we can barely perceive it.\
We  are  often  startled  because  it  seems as  if  the  station  is  moving  in  the  opposite direction\
while we seem to be at rest. Newton’s First Law states that there is no physical way  to  distinguish\
between  whether  we  are  moving  or  the  station  is moving,  because there is nearly zero total force\
acting on the body. Once we reach a constant velocity, our minds  dismiss  the  idea  that  the  ground  is\
moving  backwards  because  we  think  it  is impossible, but there is no actual way for us to distinguish\
whether the train is moving or the ground is moving'

#tokens_432, tokens_single = all_preproc(corp_df.iat[21,5])
tokens = all_preproc(text_check)
new_doc_corpus = dictionary.doc2bow(tokens)
new_doc_corpus

print('new_doc:', lda_model.get_document_topics(new_doc_corpus))

new_doc: [(41, 0.7992479)]


In [92]:
# Visualize the topics
pyLDAvis.enable_notebook()
dictionary,corpus,lda_model = train_lda(corp_df, num_topics=50)
#vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
#vis

In [52]:
corpus[:1]
dictionary[1]

'along_axis_rotation'

### Next to do:

1. Improve the algorithm(s)
2. Check if there are other useful algorithms

### Stash

* The cells below may be useful later...

In [310]:
vectorizer = CountVectorizer(stop_words= stop_words, lowercase=True, min_df=0.0, max_df=0.9, 
                             ngram_range=(2, 3), max_features=100000)
X = vectorizer.fit_transform(corp_df['text'].tolist())

In [311]:
#print(vectorizer.get_feature_names()[1000:1100])
len(words_freq)

100000

In [312]:

sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_sorted = sorted(words_freq, key = lambda x: x[1], reverse=True)

In [314]:
words_sorted[0:100]

[('kinetic energy', 407),
 ('potential energy', 387),
 ('angular velocity', 273),
 ('work done', 240),
 ('angular momentum', 230),
 ('free body', 204),
 ('moment inertia', 202),
 ('second law', 188),
 ('center mass', 188),
 ('angular acceleration', 173),
 ('newton second', 171),
 ('newton second law', 171),
 ('body diagram', 171),
 ('free body diagram', 171),
 ('check understanding', 156),
 ('speed sound', 153),
 ('net force', 137),
 ('air resistance', 122),
 ('coordinate system', 120),
 ('axis rotation', 109),
 ('forces acting', 105),
 ('external force', 105),
 ('sound wave', 104),
 ('centripetal acceleration', 97),
 ('displacement vector', 97),
 ('constant velocity', 94),
 ('conceptual questions', 93),
 ('total energy', 93),
 ('flow rate', 90),
 ('rigid body', 88),
 ('kinetic friction', 88),
 ('normal force', 87),
 ('initial velocity', 87),
 ('sound waves', 87),
 ('equilibrium position', 86),
 ('gravitational force', 85),
 ('gravitational potential', 85),
 ('gravitational potential e