# LSI_Physics_Problems

In [39]:
import os, gensim
from gensim.models import LsiModel
from gensim import models, corpora
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import remove_stopwords, stem_text

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn

import numpy as np
import pandas as pd
from pprint import pprint
from urllib.request import urlopen

In [40]:
# NLTK's English stop words extended with unit abbreviations, single letters
# and other filler tokens found in the physics problems. Each custom entry is
# listed once (the earlier list repeated 'seconds' and 'hour').
stop_words = stopwords.words('english') + [
    'km', 'm', 'seconds', 'h', 'hour', 'meter', 'meters', 'mi', 'mile',
    'miles', 'minute', 'minutes', 'ms', 's²', '________', 'second', 'sec',
    'e', 'r', 's', 'hours', 'cm', 'hr', 'b', 'min', 'n', 'nd',
    'degree', 'degrees', 'v',
]

In [41]:
# Load the corpus as one physics problem per line.
# The file is read line by line instead of via pd.read_csv(sep='\n'):
# recent pandas releases refuse a line terminator as the separator, and the
# CSV parser would also apply quote handling to what is plain text.
# Blank lines are skipped (read_csv skipped them too) and the Series keeps the
# name 0 that selecting column 0 of the parsed frame used to give it.
with open('phy_corpus.txt', encoding='utf-8') as corpus_file:
    cor = pd.Series(
        [line.rstrip('\n') for line in corpus_file if line.strip()],
        name=0,
    )


In [42]:
# Preview the first ten problems of the corpus.
cor.head(10)

0    An airplane accelerates down a runway at 3.20 ...
1    A car starts from rest and accelerates uniform...
2    Upton Chuck is riding the Giant Drop at Great ...
3    A race car accelerates uniformly from 18.5 m/s...
4    A feather is dropped on the moon from a height...
5    Rocket-powered sleds are used to test the huma...
6    A bike accelerates uniformly from rest to a sp...
7    An engineer is designing the runway for an air...
8    A car traveling at 22.4 m/s skids to a stop in...
9    A kangaroo is capable of jumping to a height o...
Name: 0, dtype: object

In [43]:
def get_lemma(word):
    """Return the WordNet base form of ``word``, falling back to ``word``.

    Args:
        word: The token to look up with ``wn.morphy``.

    Returns:
        The value returned by ``wn.morphy(word)``, or ``word`` unchanged when
        that lookup returns ``None``.
    """
    lemma = wn.morphy(word)
    # Identity check: ``None`` is a singleton, and ``is`` cannot be fooled by
    # an object that overrides ``__eq__``.
    return word if lemma is None else lemma

In [45]:
# Module-level NLTK Porter stemmer instance. It is not referenced by any of
# the cells visible in this part of the notebook.
porter = PorterStemmer()

In [46]:
def iter_documents():
    """Lazily tokenize the corpus, one document per iteration.

    Walks the module-level ``cor`` collection and, for every entry, yields the
    token iterator returned by ``gensim.utils.tokenize`` with ``lower=True``.
    Nothing is tokenized until the caller starts consuming the generator.
    """
    for raw_text in cor:
        token_stream = gensim.utils.tokenize(raw_text, lower=True)
        yield token_stream

In [47]:
def remove_stopwords(corpus):
    """Drop every known stop word from ``corpus.dictionary`` and return ``corpus``.

    Each entry of the module-level ``stop_words`` list that is present in the
    dictionary's ``token2id`` mapping is translated to its id, and the
    collected ids are handed to ``Dictionary.filter_tokens``. The dictionary is
    modified in place; the same ``corpus`` object is returned.

    Note: this definition rebinds the name ``remove_stopwords`` imported from
    ``gensim.parsing.preprocessing`` at the top of the notebook.
    """
    vocabulary = corpus.dictionary
    ids_to_drop = []
    for term in stop_words:
        # Stop words that never occur in the corpus have no id to remove.
        if term in vocabulary.token2id:
            ids_to_drop.append(vocabulary.token2id[term])
    vocabulary.filter_tokens(ids_to_drop)
    return corpus

In [51]:
class build_my_corpus:
    """Streamed bag-of-words view of the documents from ``iter_documents``.

    Attributes:
        dictionary: ``gensim.corpora.Dictionary`` built from the tokenized
            documents and pruned with ``filter_extremes``.
    """

    def __init__(self):
        """Build the dictionary from one pass over the documents, then prune it."""
        vocabulary = gensim.corpora.Dictionary(iter_documents())
        # check API docs for pruning params
        vocabulary.filter_extremes(no_below=1, keep_n=30000)
        self.dictionary = vocabulary

    def __iter__(self):
        """Yield the bag-of-words vector of each document, re-tokenizing on every pass."""
        for token_stream in iter_documents():
            bow_vector = self.dictionary.doc2bow(token_stream)
            yield bow_vector

def build_corpus():
    """Create and return a ``build_my_corpus`` instance (this also builds its dictionary)."""
    return build_my_corpus()


def build_LSI_Model(corpus, num_topics=100, decay=0.5,
                    model_path='phy_corpus.txt' + 'physics.model.gensim'):
    """Train an LSI model on ``corpus``, save it to disk and return it.

    Args:
        corpus: Iterable of bag-of-words vectors that also exposes a
            ``dictionary`` attribute, which is passed as ``id2word``.
        num_topics: Number of latent topics requested from ``LsiModel``.
        decay: ``decay`` argument forwarded to ``LsiModel``.
        model_path: File path the trained model is saved to.

    Returns:
        The trained ``LsiModel``. Without the return statement the caller's
        ``lsi_model = build_LSI_Model(corpus)`` would be bound to ``None``.
    """
    model = LsiModel(corpus, id2word=corpus.dictionary,
                     num_topics=num_topics, decay=decay)
    model.save(model_path)
    return model

# Build the streamed bag-of-words corpus (this also constructs its dictionary).
corpus = build_corpus()
# Train the LSI model on that corpus; build_LSI_Model also writes it to disk.
lsi_model = build_LSI_Model(corpus)
# Reload the model from the file name build_LSI_Model saves to by default.
phy_lsi_model = LsiModel.load('phy_corpus.txt' +'physics.model.gensim')
# Pretty-print the topics of the reloaded model.
pprint(phy_lsi_model.print_topics())


[(0,
  '-0.307*"it" + -0.296*"at" + -0.278*"velocity" + -0.243*"how" + -0.239*"in" '
  '+ -0.236*"what" + -0.229*"and" + -0.227*"from" + -0.179*"ground" + '
  '-0.172*"an"'),
 (1,
  '0.418*"it" + 0.300*"how" + -0.267*"car" + 0.254*"ground" + 0.228*"ball" + '
  '-0.204*"what" + -0.184*"at" + -0.170*"km" + -0.169*"velocity" + '
  '-0.162*"for"'),
 (2,
  '-0.415*"velocity" + -0.315*"ball" + 0.258*"how" + 0.248*"in" + 0.239*"car" '
  '+ -0.209*"an" + -0.201*"initial" + -0.198*"with" + -0.167*"ground" + '
  '0.165*"km"'),
 (3,
  '0.421*"car" + -0.400*"in" + -0.316*"and" + 0.281*"at" + 0.232*"ball" + '
  '-0.213*"you" + -0.174*"average" + 0.129*"from" + -0.126*"air" + '
  '-0.119*"an"'),
 (4,
  '0.495*"speed" + -0.306*"seconds" + -0.279*"velocity" + 0.253*"ball" + '
  '-0.243*"car" + -0.198*"in" + 0.187*"an" + 0.185*"with" + 0.136*"take" + '
  '0.121*"time"'),
 (5,
  '0.400*"ball" + 0.307*"what" + 0.286*"at" + 0.284*"you" + 0.263*"in" + '
  '-0.226*"velocity" + -0.174*"and" + 0.152*"are" + -