In [1]:
# setup
import pandas as pd
from common.clean import replace
from common.heuristics import has_op, test_for_suitable, split_high_level_eqs
from common.tokenize import tokenize
# Load the raw equation table: a tab-separated file without a header row.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv("../data/eqs_100k.tsv", sep="\t", header=None)
df.columns = ["eq_id", "eq"]

# stopwords (handed to tokenize() below)
sw = ['\\', '\\\\']

# Clean each equation, then split it with split_high_level_eqs.
df['clean'] = replace(df['eq'])
df['clean_split'] = df['clean'].apply(split_high_level_eqs)

# Keep only rows whose split is not None and has more than one part.
se = df['clean_split'].apply(lambda x: x is not None and len(x) > 1)
df = df[se].reset_index()

# test_for_suitable yields None for rows that should be dropped
# (inferred from the `is not None` filter below; verify against common.heuristics).
se = df['clean_split'].apply(test_for_suitable)
df['clean_split_filtered'] = se
filt = df['clean_split_filtered'].apply(lambda x: x is not None)

# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
df = df[filt].copy()

# Tokenize every equation part of every remaining row.
df['clean_split_filtered_tokenized'] = df['clean_split_filtered'].apply(
    lambda x: [tokenize(e, sw) for e in x]
)

In [5]:
# Persist the tokenized equations as (index, value) rows.
# header=False is explicit because the reload step reads this file with
# header=None and parses every row with ast.literal_eval; on pandas >= 0.24
# Series.to_csv writes a header row by default, which would break that parse.
df['clean_split_filtered_tokenized'].to_csv('aligned_ex.csv', header=False)

In [18]:
import pandas as pd
import ast

# Reload the exported file under its own name. Rebinding `df` here would
# replace the setup frame with a two-column (0, 1) frame and make the later
# `df['clean_split_filtered_tokenized']` lookup fail on a top-to-bottom run.
aligned = pd.read_csv('aligned_ex.csv', header=None)

# Column 1 holds the stringified nested token lists; parse them back into
# Python lists (literal_eval only evaluates literals, not arbitrary code).
aligned[1] = aligned[1].apply(ast.literal_eval)

aligned[1].tolist()

[[['START',
   '\\left',
   '\\langle',
   '\\frac',
   '{',
   'd',
   'C',
   '}',
   '{',
   'd',
   't',
   '}',
   '\\right',
   '\\rangle'],
  ['START',
   '\\frac',
   '{',
   'e',
   'C',
   '}',
   '{',
   '1',
   '-',
   'e',
   '{',
   '2',
   '}',
   '}',
   '\\left',
   '\\langle',
   '\\frac',
   '{',
   'd',
   'e',
   '}',
   '{',
   'd',
   't',
   '}',
   '\\right',
   '\\rangle']],
 [['START',
   '3',
   '\\widetilde',
   '{',
   '\\Gamma',
   '}',
   '{',
   '2',
   'j',
   '}',
   '-',
   '3',
   '\\Gamma',
   '{',
   '2',
   'j',
   '}'],
  ['START',
   '-',
   '3',
   'R',
   '4',
   '(',
   '0',
   ')',
   'a',
   '{',
   '2',
   'j',
   '}',
   '2',
   '-',
   '6',
   '\\int',
   '{',
   '\\gamma',
   'j',
   '}',
   '\\frac',
   '{',
   'S',
   '2',
   'R',
   '4',
   '}',
   '{',
   'r',
   '5',
   '}',
   '\\psi',
   '2',
   '\\varphi',
   '1',
   '4',
   'd',
   'w',
   '+',
   '3',
   '\\int',
   '{',
   '\\gamma',
   'j',
   '}',
   '\\frac',
   '{',
   '

In [2]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

# Each row holds a list of tokenized equations; flatten them so that every
# single equation becomes one document.
dataset = df['clean_split_filtered_tokenized']
docs = []
for equations in dataset:
    docs.extend(equations)

# Build the token <-> id mapping, then encode every document as a bag of words.
dct = Dictionary(docs)
corpus = list(map(dct.doc2bow, docs))


In [3]:
from gensim.models import LsiModel
# Fit a 100-topic LSI model on the bag-of-words corpus.
lsi = LsiModel(corpus=corpus, num_topics=100, id2word=dct)

# Transform the corpus to LSI space and index it for similarity queries.
lsi_corpus = lsi[corpus]
index = MatrixSimilarity(lsi_corpus)

In [4]:
# Use the first document of the corpus as a sample query.
query_doc = docs[0]
vec_bow = dct.doc2bow(query_doc)

# Convert the bag-of-words query to LSI space.
vec_lsi = lsi[vec_bow]

In [5]:
# Display the fitted LSI topics as (topic_id, weighted-token string) pairs.
# NOTE(review): in the recorded output the leading topics are dominated by
# structural tokens such as "{", "}", "(" and ")"; the stopword list `sw`
# only contains backslash tokens, so consider whether these should be
# filtered as well.
lsi.print_topics()

[(0,
  '0.616*"}" + 0.616*"{" + 0.154*"e" + 0.139*"t" + 0.136*"2" + 0.134*")" + 0.134*"(" + 0.132*"i" + 0.125*"1" + 0.105*"n"'),
 (1,
  '-0.438*"e" + -0.339*"t" + -0.311*"i" + 0.292*"}" + 0.292*"{" + -0.252*"n" + -0.220*"l" + -0.218*"r" + -0.215*"o" + -0.206*"a"'),
 (2,
  '0.392*"(" + 0.392*")" + 0.373*"2" + 0.307*"0" + 0.279*"1" + 0.263*"-" + -0.232*"e" + 0.163*"5" + -0.148*"}" + -0.147*"{"'),
 (3,
  '-0.445*")" + -0.445*"(" + 0.240*"0" + -0.213*"s" + 0.207*"4" + 0.204*"3" + 0.201*"2" + 0.198*"5" + 0.197*"7" + 0.197*"="'),
 (4,
  '0.646*"2" + -0.373*"0" + 0.299*"1" + 0.230*"+" + -0.188*"(" + -0.188*")" + 0.185*"\\frac" + -0.156*"=" + -0.130*"i" + -0.128*"l"'),
 (5,
  '0.541*"1" + 0.499*"-" + -0.409*"2" + 0.228*"+" + -0.189*"t" + -0.158*"(" + -0.157*")" + -0.157*"d" + 0.147*"a" + 0.142*"n"'),
 (6,
  '0.545*"x" + 0.388*"l" + -0.287*"0" + 0.214*"v" + 0.209*"p" + 0.208*"a" + 0.187*"b" + -0.162*"t" + 0.161*"i" + 0.152*"3"'),
 (7,
  '-0.406*"i" + 0.401*"a" + -0.340*"n" + 0.303*"z" + 0.242*"