## Recommendation System

This model uses a recommendation system to predict the 'terms' which can be used for a given proof based on the list of 'types' used in its statement.

In [1]:
# importing required libraries

import json
import random
import pandas as pd
from statistics import mean
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Open and Read JSON file's data

file = open(r"shallow-frequencies/shallow-frequencies.json", "r", encoding='utf-8')
js = file.read()
data = json.loads(js)
file.close()

In [3]:
# Get all the proofs
proofs = data["triples"]

# Divide proofs into train & test Data 
no_test = 150    # No of elements in test data
train, test = train_test_split(proofs, test_size = (no_test/len(proofs)))

In [4]:
# Shuffle the list of all types and terms to avoid any biasedness peresent in data

statements = [row["name"] for row in data["types"]]
random.shuffle(statements)
proofs = [row["name"] for row in data["terms"]]
random.shuffle(proofs)

In [5]:
# Create a data frame with index beeing the list of terms & columns being the list of Types

df = pd.DataFrame(index = proofs)

for statement in statements:
    df[statement] = [0]*len(df)

In [6]:
# Training on the 'train' data
for val in train:
    df.loc[val["terms"], val["types"]] += 1
    
# Get the Correlation dataframe (to be used while getting recommendations)
df_corr = df.corr(method='spearman')

In [7]:
# Function to return list of all 'terms' based on their decreasing order of priorities for a given list of 'types'

def get_recom(statement):
    corr = df_corr[statement]
    corr = corr.mean(axis=1)
    wght = df * corr

    wght["mean"] = wght.mean(axis=1)
    final = wght[["mean"]]
    final = final.sort_values(by=["mean"], ascending=False)

    return final.index.tolist()

In [8]:
# function to get score (average rank of terms in the recommended list)

def get_score(triple, single = True):
    if triple["terms"] == [] or triple["types"] == []:
        return "terms_or_types_is_empty"
    
    else:
        recom = get_recom(triple["types"])

        pos_recom = []
        pos_norm = []

        for term in triple["terms"]:
            pos_recom.append(recom.index(term))
            pos_norm.append(df.index.to_list().index(term))

        avg_recom = mean(pos_recom)
        avg_norm = mean(pos_norm)
        
        if single:
            return avg_recom
        else:
            return (avg_recom, avg_norm)

In [9]:
# function to calculate the average score for more than one test data

def avg_score(triples, graph = False):
    print(f"Calculating average score for {len(triples)} elements.")
    print("element\t(avg_recom, avg_norm)")
    
    i = 0
    list_recom = []
    list_index = []
    
    for triple in triples:
        score = get_score(triple, False)
        if score != "terms_or_types_is_empty":
            list_recom.append(score)
            list_index.append(score[0])
        else:
            list_recom.append((0, 0))
            list_index.append(0)
        i+=1
        #print(f" {i} : {score}")
    
    avg_per = mean(list_index)
    print(f" Average rank of terms present in the proof of {len(list_recom)} elements is {round(avg_per)} out of {len(data['terms'])} total terms.")
    
    return avg_per

In [10]:
avg_score(test)

Calculating average score for 150 elements.
element	(avg_recom, avg_norm)
 Average rank of terms present in the proof of 150 elements is 317 out of 3337 total terms.


316.83887671212926

In [11]:
466 394 487 504

SyntaxError: invalid syntax (<ipython-input-11-8f25085885fb>, line 1)

Here the average score (x) represents that it only requires x% of the time (ordering) when using the recommended list to find the terms used in the proof.
i.e. the average score is 100% if the 'terms' for a proof would have been found using normal 'terms' list.

In [None]:
# function to calculate the average score for more than one test data (used for plotting graph)

def avg_score_graph(triples):
    
    list_recom = []
    list_norm = []
    
    for triple in triples:
        score = get_score(triple, False)
        if score != "terms_or_types_is_empty":
            list_recom.append(score[0])
            list_norm.append(score[1])
    
    return list_recom, list_norm

In [None]:
scores_recom, scores_norm = avg_score_graph(test)
test_no = list(range(1,len(scores_norm)+1))

plt.figure(figsize=[15,15])
plt.plot(test_no, scores_recom, 'g*', label="Recommended_index")
plt.axhline(y = mean(scores_recom), color = 'g', ls = 'dotted', label="Avg_recommended_index")

plt.plot(test_no, scores_norm, 'r+', label="Normal_index")
plt.axhline(y = mean(scores_norm), color = 'r', ls = 'dotted', label="Avg_normal_index")

plt.axhline(y = 12, color = 'b', ls = 'dotted', label="Target_index")

plt.legend(loc="upper left")

plt.xlabel('Proofs')
plt.ylabel('Index of Terms')

plt.title('Index Plot')

In [None]:
def table_gen(triple):
    if triple["terms"] == [] or triple["types"] == []:
        return "terms_or_types_is_empty"
    
    else:
        recom = get_recom(triple["types"])

        pos_recom = []
        for term in triple["terms"]:
            pos_recom.append(recom.index(term))

        max_index = max(pos_recom)
        mean_index = mean(pos_recom)

        return (triple["name"], triple["types"], triple["terms"], recom[:max_index+1], pos_recom, round(mean_index,2), max_index)

In [None]:
def table_gen_top(triples, top):
    list_rows = []
    
    for triple in triples:
        values = table_gen(triple)
        if values != "terms_or_types_is_empty":
            list_rows.append(values)
    
    table_df = pd.DataFrame(list_rows, columns =['name', 'Types', 'Actual_terms', 'Top_pred_terms', 'Pos_of_actual_term_in_pred (out_of 3337 terms)', 'Avg_pred_index', 'Max_pred_index'])
    table_df = table_df.set_index('name')
    table_df = table_df.sort_values('Avg_pred_index')
    
    table_df = table_df.head(top)
    
    # Generate csv and json file for pretty view
    table_df.to_csv('top_pred.csv')
    table_df.to_json('top_pred.json', orient='index', indent=3)
    
    return table_df

In [None]:
# Generating table for best 20 (can be changed by changing top value in fun) prediction

display(table_gen_top(test, top = 20))