## Recommendation System

This model uses a recommendation system to predict the 'terms' which can be used for a given proof based on the list of 'types' used in its statement.

In [1]:
# importing required libraries

import json
import random
import pandas as pd
from statistics import mean
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Open and Read JSON file's data

file = open(r"shallow-frequencies/shallow-frequencies.json", "r", encoding='utf-8')
js = file.read()
data = json.loads(js)
file.close()

In [3]:
# Get all the proofs
proofs = data["triples"]

# Divide proofs into train & test Data 
no_test = 150    # No of elements in test data
train, test = train_test_split(proofs, test_size = (no_test/len(proofs)))

In [4]:
# Shuffle the list of all types and terms to avoid any biasedness peresent in data

statements = [row["name"] for row in data["types"]]
random.shuffle(statements)
proofs = [row["name"] for row in data["terms"]]
random.shuffle(proofs)

In [5]:
# Create a data frame with index beeing the list of terms & columns being the list of Types

df = pd.DataFrame(index = proofs)

for statement in statements:
    df[statement] = [0]*len(df)

In [6]:
# Training on the 'train' data
for val in train:
    df.loc[val["terms"],val["types"]] += 1

In [7]:
df_corr = df.corr(method='spearman')

In [8]:
# Function to return list of all 'terms' based on their decreasing order of priorities for a given list of 'types'

def get_recom(statement):
    corr = df_corr[statement]
    corr = corr.mean(axis=1)
    wght = df * corr

    wght["mean"] = wght.mean(axis=1)
    final = wght[["mean"]]
    final = final.sort_values(by=["mean"], ascending=False)

    return final.index.tolist()

In [9]:
# function to get score (average rank of terms in the recommended list/average rank of terms in the normal list)

def get_score(triple, single = True):
    if triple["terms"] == [] or triple["types"] == []:
        return "terms_or_types_is_empty"
    
    else:
        recom = get_recom(triple["types"])

        pos_recom = []
        pos_norm = []

        for term in triple["terms"]:
            pos_recom.append(recom.index(term))
            pos_norm.append(df.index.to_list().index(term))

        avg_recom = mean(pos_recom)
        avg_norm = mean(pos_norm)
        
        if single:
            return ((avg_recom+1)/(avg_norm+1))*100
        else:
            return (avg_recom, avg_norm)

In [10]:
# function to calculate the average score for more than one test data

def avg_score(triples, graph = False):
    print(f"Calculating average score for {len(triples)} elements.")
    print("element\t(avg_recom, avg_norm)")
    
    i = 0
    list_recom = []
    list_norm = []
    
    for triple in triples:
        score = get_score(triple, False)
        if score != "terms_or_types_is_empty":
            list_recom.append(score[0])
            list_norm.append(score[1])
        i+=1
        print(f" {i} : {score}")
    
    avg_per = (mean(list_recom)*100)/mean(list_norm)
    print(f" avg_score of {len(list_recom)} elements is {avg_per} %")
        
    return avg_per

In [11]:
avg_score(test)

Calculating average score for 150 elements.
element	(avg_recom, avg_norm)
 1 : terms_or_types_is_empty
 2 : terms_or_types_is_empty
 3 : (26.25, 1842.3333333333333)
 4 : terms_or_types_is_empty
 5 : (64.28571428571429, 1985.7142857142858)
 6 : terms_or_types_is_empty
 7 : terms_or_types_is_empty
 8 : (68, 1421)
 9 : (45.16260162601626, 1545.861788617886)
 10 : (33, 1654)
 11 : (53.27272727272727, 1261.7454545454545)
 12 : terms_or_types_is_empty
 13 : (24, 1756)
 14 : terms_or_types_is_empty
 15 : terms_or_types_is_empty
 16 : terms_or_types_is_empty
 17 : (3, 3298)
 18 : (22.75862068965517, 1641.1724137931035)
 19 : terms_or_types_is_empty
 20 : (70.07142857142857, 1538.4285714285713)
 21 : (103.77777777777777, 1716.875)
 22 : terms_or_types_is_empty
 23 : terms_or_types_is_empty
 24 : (24.2, 508.4)
 25 : terms_or_types_is_empty
 26 : (1071.3333333333333, 1166.3333333333333)
 27 : (859, 933)
 28 : (3134, 1643)
 29 : (15.6875, 2442.5)
 30 : (271.46666666666664, 2382.2)
 31 : (7.2857142

23.718311626898426

In [12]:
spea = [547, 441]
cosi = [707]
print(mean(spea))
print(mean(cosi))

494
707


In [13]:
spea = [612, 491, 424, 442, 348, 426, 504, 429, 443, 557, 475]
kend = [613, 471, 407, 440, 345, 425, 502, 437, 441, 572, 474]
print(mean(spea))
print(mean(kend))

468.27272727272725
466.09090909090907


In [14]:
spea - pers
355 - 381
456 - 479
517 - 534
pers = [639, 519, 451, 502]

NameError: name 'pers' is not defined

Here the average rank (x) represents that the average rank of terms present in a proof is x (in the recommended list) out of total 3337 terms present in the list.

In [None]:
# function to calculate the average score for more than one test data

def avg_score_graph(triples):
    list_recom = []
    
    for triple in triples:
        score = get_score(triple, False)
        if score != "terms_or_types_is_empty":
            list_recom.append(score)
            
    return list_recom

In [None]:
scores = avg_score_graph(test)
test_no = list(range(1,len(scores)+1))

plt.figure(figsize=[15,15])

plt.plot(test_no, scores, 'g*', label="recomm_index")
plt.axhline(y = mean(scores), color = 'g', ls = 'dotted', label="Avg_recomm_index")

#plt.axhline(y = len(data['terms'])/2, color = 'r', ls = 'dotted')

plt.axhline(y = 12, color = 'b', ls = 'dotted', label="target_index")

plt.legend(loc="upper left")

plt.xlabel('Proofs')
plt.ylabel('Index of Terms')

plt.title('Dot Plot : Red Dots')