The Notebook is intended to show the results of Similar User Model based on Similar User Interest. This is basically a Bag-Of-Words model. This notebook includes similar interest tags as well for displaying the results.

In [18]:
import os
import sys
import pandas as pd

In [19]:
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [21]:
from generator.data.process_data import UserInterestDataProcessor
from generator.data.data_utils import split_with_comma
from generator.models.similar_interest_model import SimilarInterestUserModel

In [22]:
processed_user_interests_df=UserInterestDataProcessor.load_data()

In [27]:
class SimilarInterestUserModel:
    def __init__(self, **params):
        self.params=params
        self.data=None
        self.X=None
        self.user_index_dict={}
    
    def fit(self, data): 
        self.data=data
        self.user_index_dict=create_user_indices_from_user_handle(self.data)
        tf_vectorizer = TfidfVectorizer(self.params)
        self.X = tf_vectorizer.fit_transform(self.data.interest_tag)
        return self
    
    def predict_simiar_users(self, user_handle, num_similar_users=NUM_SIMILAR_USERS):
        user=self.user_index_dict[user_handle]
        cosine_similarities = linear_kernel(self.X[user], self.X).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-num_similar_users:-1]
        data = {'similar_users':self.data.loc[related_docs_indices].user_handle.values, 
                'interest_tag': self.data.loc[related_docs_indices].interest_tag.values,
                'interest_sim_score':cosine_similarities[related_docs_indices]} 
        return pd.DataFrame(data)
    
def create_user_indices_from_user_handle(data):
    user_idx_dict={}
    for idx, user in enumerate(data.user_handle.unique()):
        if user not in user_idx_dict:
            user_idx_dict[user]=idx
    return user_idx_dict
                   
params=dict(analyzer=split_with_comma, min_df=5)  
sm=SimilarInterestUserModel(params=params)
sm.fit(processed_user_interests_df)
sm.predict_simiar_users(user_handle=789)

Unnamed: 0,similar_users,interest_tag,interest_sim_score
0,789,"azure-mobile-services,tdd,windows-azure,sql-se...",1.0
1,7516,"async,azure-mobile-services,nodejs,react.js,pe...",0.727229
2,5490,"azure-mobile-services,performance-optimization...",0.713785
3,6797,"microservices,cloud-computing,microsoft-azure,...",0.699366
4,6672,"azure-mobile-services,react.js,performance-opt...",0.689831
5,2010,"azure-mobile-services,nodejs,website-security,...",0.686675
6,1814,"microsoft-azure,azure-mobile-services,distribu...",0.685142
7,8468,"microsoft-azure,database-administration,perfor...",0.681018
8,1141,"cloud-computing,microsoft-azure,azure-mobile-s...",0.667481


In [None]:
The choice of parameters is somewhat arbitrary and should be tweaked based on inputs of Business Domain Expert