In [1]:
import pandas as pd
from scipy.sparse import csr_matrix



In [2]:
data = pd.read_csv('Preply_tutor_views_datasaet.csv') 
print(data.shape)
data.head()

(9999, 3)


Unnamed: 0,id,user_id,tutor_id
0,1,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77
1,2,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a
2,3,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1
3,4,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea
4,5,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6


## Prepare data

In [3]:
data_grouped = data.groupby(['user_id', 'tutor_id']).size().reset_index(name='counts')
data_grouped.head()

Unnamed: 0,user_id,tutor_id,counts
0,00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
1,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
2,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
3,00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
4,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1


In [4]:
# pivot data_grouped into tutor features
df_tutor_features = data_grouped.pivot(index='tutor_id', columns='user_id',
                                       values='counts').fillna(0)

mat_tutor_features = csr_matrix(df_tutor_features.values)

In [5]:
# mapper 
df_tutors = data[['tutor_id']].drop_duplicates()
df_tutors['title'] = data['tutor_id']
tutors_to_idx = {
    tutor: i for i, tutor in 
    enumerate(list(df_tutors.set_index('tutor_id').loc[df_tutor_features.index].title))
}

## Model KNN

In [6]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

# fit model
model_knn.fit(mat_tutor_features)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=10)

In [7]:
def make_recommendation(model_knn, data, mapper, my_tutor, n_recommendations):
    """
    return top n similar tutor recommendations based on user's input movie
    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: tutor-user matrix
    mapper: dict, map tutor ID to index in data
    my_tutor: str, ID of input tutor
    n_recommendations: int, top n recommendations
    Return
    ------
    list of top n similar tutor recommendations
    """
    # fit
    model_knn.fit(data)
    # get input tutor ID index
    print('You have input tutor:', my_tutor)
    idx = mapper[ my_tutor ]
    
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
   
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format( my_tutor ))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [8]:
my_tutor = 'ff0d3fb21c00bc33f71187a2beec389e9eff5332'

make_recommendation(
    model_knn=model_knn,
    data=mat_tutor_features,
    mapper=tutors_to_idx,
    my_tutor=my_tutor,
    n_recommendations=10)

You have input tutor: ff0d3fb21c00bc33f71187a2beec389e9eff5332
Recommendation system start to make inference
......

Recommendations for ff0d3fb21c00bc33f71187a2beec389e9eff5332:
1: 0cbb51f1f43646c1718553da0c5864d4e1a6f037, with distance of 0.6348516283298892
2: 551ec41539d9fb71200d18ec7903b1039cde594f, with distance of 0.6186149643017631
3: dd3c8fd58366b577ce6b1d0f435602f11671c3dc, with distance of 0.6
4: 85ef93bda0f7fb6327bd1b5ad44da26246b4360d, with distance of 0.5917517095361371
5: 61bc35a6401829bd28a8da47a2f235944ba8d2df, with distance of 0.5774228726357417
6: c093b1743115b3f9d368b2f7bdf54f367afccc7c, with distance of 0.5774228726357417
7: 340f1eaf7ad0c07f1491338ab68cbcab30c315ec, with distance of 0.3675444679663241
8: bdf147e99ee57500eb2dabcbf3cfa24e1daef357, with distance of 0.36099034957730614
9: 6b0cd6a8094daf42e766ea257a2af3571831bb32, with distance of 0.30973151003736654
10: 7ee223009403f7450993fe5d79516f1fc841e75e, with distance of 0.21064778262367367
