In [1]:
# setup
import pandas as pd
from common.clean import replace
from common.heuristics import has_op, test_for_suitable, split_high_level_eqs
from common.tokenize import tokenize

In [2]:
# setup
df = pd.DataFrame(pd.read_csv("../data/eqs_100k.tsv", sep = "\t", header=None))
df.columns = ["eq_id", "eq"]
df['clean'] = replace(df['eq'])
df['clean_split'] = df['clean'].apply(split_high_level_eqs)
se = df['clean_split'].apply(lambda x: len(x) > 1 if x is not None else False)
df = df[se].reset_index()
se = df['clean_split'].apply(test_for_suitable)
df['clean_split_filtered'] = se
filt = df['clean_split_filtered'].apply(lambda x: x is not None)
df = df[filt]
df['clean_split_filtered_tokenized'] = df['clean_split_filtered'].apply(lambda x: [tokenize(e) for e in x])

TypeError: tokenize() missing 1 required positional argument: 'stop_words'

In [None]:
df

In [6]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

In [4]:
dataset = df['clean_split_filtered_tokenized']
docs = [doc for sublist in dataset.tolist() for doc in sublist]
dct = Dictionary(docs)
corpus = [dct.doc2bow(line) for line in docs]
model = TfidfModel(corpus)

In [7]:
index = MatrixSimilarity(model[corpus], num_features=len(dct))

In [16]:
line = docs[2]
sims = index[model[dct.doc2bow(line)]]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [18]:
docs[0:5]

[['START',
  '\\',
  '\\left',
  '\\',
  '\\langle',
  '\\',
  '\\frac',
  '{',
  'd',
  'C',
  '}',
  '{',
  'd',
  't',
  '}',
  '\\',
  '\\right',
  '\\',
  '\\rangle'],
 ['START',
  '\\',
  '\\frac',
  '{',
  'e',
  'C',
  '}',
  '{',
  '1',
  '-',
  'e',
  '{',
  '2',
  '}',
  '}',
  '\\',
  '\\left',
  '\\',
  '\\langle',
  '\\',
  '\\frac',
  '{',
  'd',
  'e',
  '}',
  '{',
  'd',
  't',
  '}',
  '\\',
  '\\right',
  '\\',
  '\\rangle'],
 ['START',
  '3',
  '\\',
  '\\widetilde',
  '{',
  '\\',
  '\\Gamma',
  '}',
  '{',
  '2',
  'j',
  '}',
  '-',
  '3',
  '\\',
  '\\Gamma',
  '{',
  '2',
  'j',
  '}'],
 ['START',
  '-',
  '3',
  'R',
  '4',
  '(',
  '0',
  ')',
  'a',
  '{',
  '2',
  'j',
  '}',
  '2',
  '-',
  '6',
  '\\',
  '\\int',
  '{',
  '\\',
  '\\gamma',
  'j',
  '}',
  '\\',
  '\\frac',
  '{',
  'S',
  '2',
  'R',
  '4',
  '}',
  '{',
  'r',
  '5',
  '}',
  '\\',
  '\\',
  '\\',
  '\\psi',
  '2',
  '\\',
  '\\varphi',
  '1',
  '4',
  '\\',
  '\\',
  'd',
  'w',
  '+'

In [17]:
print(list(enumerate(sims)))

[(0, (2, 1.0)), (1, (6181, 0.67327696)), (2, (2693, 0.65846336)), (3, (8620, 0.6564219)), (4, (3139, 0.6520169)), (5, (6505, 0.6395116)), (6, (3595, 0.63210994)), (7, (3852, 0.6096728)), (8, (4941, 0.6073731)), (9, (3939, 0.6039743)), (10, (3594, 0.5991701)), (11, (6862, 0.5831288)), (12, (8815, 0.58305347)), (13, (9613, 0.57734495)), (14, (6863, 0.57681876)), (15, (1356, 0.5660869)), (16, (8510, 0.56518775)), (17, (265, 0.56207573)), (18, (1449, 0.5605066)), (19, (5880, 0.5591963)), (20, (7398, 0.55232435)), (21, (11729, 0.54366755)), (22, (1357, 0.5399273)), (23, (1451, 0.5343574)), (24, (1854, 0.5314324)), (25, (5482, 0.52769166)), (26, (1676, 0.5271374)), (27, (10068, 0.52488273)), (28, (1271, 0.5229961)), (29, (3259, 0.51992965)), (30, (6604, 0.49513906)), (31, (6586, 0.4935655)), (32, (6864, 0.48940217)), (33, (8317, 0.48689368)), (34, (5748, 0.48457232)), (35, (6861, 0.4839781)), (36, (4550, 0.48112658)), (37, (10618, 0.48007697)), (38, (12367, 0.47597665)), (39, (3485, 0.474768