In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.spatial.distance import hamming

In [2]:
models = pd.read_csv('models.csv', index_col='model_id')
products = pd.read_csv('equities.csv',index_col='Symbol')
holdings = pd.read_csv('model_holdings.csv', index_col=0)

In [3]:
df = models.drop(['account_name','model_name'],axis=1)

In [4]:
df = df.drop(df.columns[0], axis=1)

In [5]:
# Check if any model has null values in any column
df1 = df[df.isnull().any(axis=1)]

In [6]:
# drop the models which have null values in any column
df = df.dropna()

In [7]:
#round the volatility column value
df.volatility = df.volatility.round(2)

In [8]:
model_json = '{"inv_horizon":2.0,"inv_obj_least_imp":0.0,"inv_obj_most_imp":2.0,"inv_obj_some_imp":0.0,"inv_obj_very_imp":4.0,"investment_amt":745791.0,"liquidity_need":2.0,"primary_fin_need":7.0,"risk_profile":2.0,"risk_tolerance":0.0,"volatility":13.94}'

In [9]:
def findSimilarModel(model_json, df):
    input_model = pd.read_json(model_json, typ='series')
    input_model_df = input_model.to_frame().transpose()
    prev_similarity = 0
    final_ind = 0
    model_id = 0
    for i in range(len(df)) : 
        similarity = cosine(input_model_df.values,df.iloc[i,:])
        if(similarity <= prev_similarity):
            prev_similarity = similarity
            model_id = i
    return model_id



In [10]:
products = products.drop(products.columns[0], axis=1)

In [11]:
def productMeta(symbol):
    symbol = symbol
    name = products.at[symbol,"Name"]
    sector = products.at[symbol,"Sector"]
    close_price = products.at[symbol,"close_price"]
    risk_score = products.at[symbol,"risk_score"]
    L2 = products.at[symbol,"L2"]
    return symbol, name, sector, close_price, risk_score, L2

In [12]:
productMeta('AAPL')

('AAPL', 'Apple Inc.', 'Technology', 308.95, 23.69, 'Large Cap')

In [13]:
def favProducts(model_id, N):
    productRatings = holdings[holdings['model_id']==model_id]
    sortedRatings = pd.DataFrame.sort_values(productRatings,['percent'], ascending=[0])[:N]
    print(sortedRatings)
    sortedRatings['title'] = sortedRatings["product_id"].apply(productMeta)
    return sortedRatings

In [14]:
modelProductRatingMatrix = pd.pivot_table(holdings, values='percent', index=['model_id'],columns=['product_id'])

In [15]:
def distance(model1, model2):
    try:
        model1Ratings = modelProductRatingMatrix.transpose()[model1]
        #print(model1Ratings)
        model2Ratings = modelProductRatingMatrix.transpose()[model2]
        #print(model1Ratings)
        distance = hamming(model1Ratings,model2Ratings)
    except: 
        distance = np.NaN
    return distance

In [21]:
def nearestNeighbors(model_id, K=10):
    allModels = pd.DataFrame(modelProductRatingMatrix.index)
    allModels = allModels[allModels.model_id != model_id]
    allModels['distance'] = allModels['model_id'].apply(lambda x: distance(model_id, x))
    KnearestModels = allModels.sort_values(['distance'], ascending=True).index[:K]
    print(KnearestModels)
    return KnearestModels


In [22]:
def topNProducts(model_json, N=10):
    model_id = findSimilarModel(model_json, df)
    KnearestModels = nearestNeighbors(model_id)
    NNRatings = modelProductRatingMatrix[modelProductRatingMatrix.index.isin(KnearestModels)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    topNProducts = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNProducts)

In [25]:
result = topNProducts(model_json,10)
result_json = result.to_json(orient='records')
print(result_json)

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
["UEIC","PRNB","RGCO","JAZZ","ZION","HBAN","MTC","GRIN","NXGN","SEEL"]


In [19]:
favProducts(21,10)

     model_id  percent product_id
252        21       17       FUSB
243        21       10       PRFZ
250        21       10       KVHI
246        21        9       AUDC
247        21        9       NXGN
249        21        8       EXPD
248        21        7       LIVN
251        21        7       MATW
242        21        6       MLHR
244        21        6       RMCF


Unnamed: 0,model_id,percent,product_id,title
252,21,17,FUSB,"(FUSB, First US Bancshares, Inc., Finance, 10...."
243,21,10,PRFZ,"(PRFZ, Invesco FTSE RAFI US 1500 Small-Mid ETF..."
250,21,10,KVHI,"(KVHI, KVH Industries, Inc., Technology, 10.88..."
246,21,9,AUDC,"(AUDC, AudioCodes Ltd., Public Utilities, 26.6..."
247,21,9,NXGN,"(NXGN, NextGen Healthcare, Inc., Technology, 1..."
249,21,8,EXPD,"(EXPD, Expeditors International of Washington,..."
248,21,7,LIVN,"(LIVN, LivaNova PLC, Health Care, 69.69, 43.08..."
251,21,7,MATW,"(MATW, Matthews International Corporation, Cap..."
242,21,6,MLHR,"(MLHR, Herman Miller, Inc., Consumer Durables,..."
244,21,6,RMCF,"(RMCF, Rocky Mountain Chocolate Factory, Inc.,..."


In [20]:
models

KeyError: 21