In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [2]:
# Step 1: Load the Data
vehicles_df = pd.read_csv('travels.csv')
reviews_df = pd.read_csv('reviews.csv')

In [3]:
vehicles_df.head()

Unnamed: 0,vehicle_id,travel_id,departure_at,from,to
0,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f48e,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1,65dda7765e43b9d7d3e4f4bf,65dda7795e43b9d7d3e4f67d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
2,65dda7775e43b9d7d3e4f4e6,65dda77a5e43b9d7d3e4f6df,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
3,65dda7775e43b9d7d3e4f529,65dda77a5e43b9d7d3e4f74b,2024-03-03,"Pathri, Morong","Koteshowr, Kathmandu"
4,65dda7775e43b9d7d3e4f534,65dda77a5e43b9d7d3e4f753,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"


In [4]:
reviews_df.rename(columns={"id":'review_id'},inplace=True)

In [5]:
reviews_df.head()

Unnamed: 0,review_id,vehicle_id,user_id,rating
0,65dda7bea276ad551e8d07b2,65dda77d5e43b9d7d3e4f977,91,5
1,65dda7bea276ad551e8d07b3,65dda77f5e43b9d7d3e4fa4d,196,4
2,65dda7bea276ad551e8d07b4,65dda7855e43b9d7d3e4fe8c,180,3
3,65dda7bea276ad551e8d07b5,65dda7795e43b9d7d3e4f6a7,136,4
4,65dda7bea276ad551e8d07b6,65dda77c5e43b9d7d3e4f87f,141,3


In [6]:
print(reviews_df.shape)
print(vehicles_df.shape)

(120000, 4)
(103, 5)


In [7]:
# Step 2: Preprocess the Data
vehicles_with_reviews = pd.merge(reviews_df, vehicles_df, on='vehicle_id')

In [8]:
vehicles_with_reviews

Unnamed: 0,review_id,vehicle_id,user_id,rating,travel_id,departure_at,from,to
0,65dda7bea276ad551e8d07b2,65dda77d5e43b9d7d3e4f977,91,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1,65dda7bea276ad551e8d07ff,65dda77d5e43b9d7d3e4f977,54,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
2,65dda7bea276ad551e8d0806,65dda77d5e43b9d7d3e4f977,154,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
3,65dda7bea276ad551e8d090c,65dda77d5e43b9d7d3e4f977,55,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
4,65dda7bea276ad551e8d0926,65dda77d5e43b9d7d3e4f977,192,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
...,...,...,...,...,...,...,...,...
119995,65dda7c7a276ad551e8ed8b7,65dda7835e43b9d7d3e4fd25,45,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119996,65dda7c7a276ad551e8ed9a8,65dda7835e43b9d7d3e4fd25,71,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119997,65dda7c7a276ad551e8ed9bb,65dda7835e43b9d7d3e4fd25,147,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119998,65dda7c7a276ad551e8edbcd,65dda7835e43b9d7d3e4fd25,2,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


# Collaborative Filtering Recommendation System

In [9]:
# Step 4: Generate Pivot table matrix
user_ratings = vehicles_with_reviews.pivot_table(index='user_id', columns='vehicle_id', values='rating').fillna(0)

In [10]:
user_ratings

vehicle_id,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f4e6,65dda7775e43b9d7d3e4f4ea,65dda7775e43b9d7d3e4f4ef,65dda7775e43b9d7d3e4f4fb,65dda7775e43b9d7d3e4f513,65dda7775e43b9d7d3e4f529,65dda7775e43b9d7d3e4f534,...,65dda7845e43b9d7d3e4fe17,65dda7845e43b9d7d3e4fe34,65dda7855e43b9d7d3e4fe52,65dda7855e43b9d7d3e4fe6f,65dda7855e43b9d7d3e4fe8c,65dda7855e43b9d7d3e4fead,65dda7855e43b9d7d3e4fecb,65dda7855e43b9d7d3e4fee8,65dda7865e43b9d7d3e4ff06,65dda7865e43b9d7d3e4ff24
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.600000,3.333333,3.500000,3.000000,3.250000,3.000000,3.666667,3.666667,3.500000,3.714286,...,3.200000,3.777778,3.200000,3.500000,2.833333,4.000000,3.857143,3.400000,3.500000,2.333333
2,4.250000,3.000000,3.444444,2.500000,3.000000,3.250000,4.000000,5.000000,3.428571,3.800000,...,4.000000,4.142857,3.571429,3.142857,4.000000,3.000000,3.600000,2.000000,3.545455,3.555556
3,3.400000,3.285714,3.666667,3.600000,2.500000,3.250000,3.500000,3.000000,2.857143,5.000000,...,3.875000,3.666667,3.750000,2.666667,2.714286,2.857143,3.600000,3.250000,4.000000,2.714286
4,3.000000,3.384615,3.833333,3.400000,3.714286,3.125000,2.875000,3.000000,4.000000,3.500000,...,3.750000,3.285714,4.000000,3.500000,3.400000,4.000000,3.333333,2.833333,3.666667,4.000000
5,4.000000,3.333333,4.285714,3.250000,3.833333,3.200000,3.750000,3.000000,2.000000,3.666667,...,3.142857,3.454545,3.714286,4.500000,3.500000,3.333333,3.285714,3.333333,3.500000,2.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,4.000000,3.250000,4.200000,3.125000,4.250000,2.333333,3.666667,3.000000,3.000000,3.000000,...,3.166667,3.666667,4.333333,3.500000,4.000000,3.400000,3.400000,4.142857,3.833333,4.600000
197,3.333333,2.600000,3.500000,3.250000,3.500000,3.800000,4.666667,3.300000,2.875000,3.250000,...,3.444444,3.285714,2.000000,3.000000,2.666667,3.333333,4.333333,3.285714,3.250000,3.714286
198,3.200000,3.000000,3.714286,3.285714,3.800000,3.666667,3.500000,4.500000,3.666667,3.750000,...,3.777778,3.363636,4.333333,3.625000,3.250000,3.333333,3.333333,4.000000,3.000000,3.571429
199,3.500000,4.000000,3.500000,4.000000,2.428571,3.250000,5.000000,2.500000,2.666667,3.500000,...,3.500000,3.250000,3.500000,4.000000,3.625000,3.166667,3.571429,2.750000,3.615385,3.571429


In [11]:
# Explanation: https://youtu.be/3ecNC-So0r4?t=785
# Let's say that auth user have not given rating to some vehicle and it doesn't mean that user have gave 0 start rating to it
# So we will create method which will standarized the ratings given by all the users
def standardize(row):
    # takes single rows from dataframe as argument
    # now create a new row from (original rating minus(-) mean of all the ratings & divide it by the range of ratings that the user gives)
    new_row = (row - row.mean()) / (row.max() - row.min()) 
    return new_row

ratings_std = user_ratings.apply(standardize)
ratings_std

vehicle_id,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f4e6,65dda7775e43b9d7d3e4f4ea,65dda7775e43b9d7d3e4f4ef,65dda7775e43b9d7d3e4f4fb,65dda7775e43b9d7d3e4f513,65dda7775e43b9d7d3e4f529,65dda7775e43b9d7d3e4f534,...,65dda7845e43b9d7d3e4fe17,65dda7845e43b9d7d3e4fe34,65dda7855e43b9d7d3e4fe52,65dda7855e43b9d7d3e4fe6f,65dda7855e43b9d7d3e4fe8c,65dda7855e43b9d7d3e4fead,65dda7855e43b9d7d3e4fecb,65dda7855e43b9d7d3e4fee8,65dda7865e43b9d7d3e4ff06,65dda7865e43b9d7d3e4ff24
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.024379,-0.023697,0.002352,-0.169814,-0.081205,-0.082675,0.033856,0.020504,0.018699,0.081482,...,-0.100441,0.062335,-0.091604,0.016200,-0.212121,0.108028,0.068273,-0.003307,0.001961,-0.218761
2,0.154379,-0.093872,-0.016166,-0.336480,-0.174955,-0.032675,0.100523,0.287170,-0.005110,0.110053,...,0.166226,0.135351,0.032206,-0.102847,0.176768,-0.091972,0.016844,-0.283307,0.017112,0.025683
3,-0.015621,-0.033722,0.057908,0.030186,-0.362455,-0.032675,0.000523,-0.112830,-0.195586,0.510053,...,0.124559,0.040113,0.091729,-0.261577,-0.251804,-0.120543,0.016844,-0.033307,0.168627,-0.142571
4,-0.095621,-0.012900,0.113463,-0.036480,0.092902,-0.057675,-0.124477,-0.112830,0.185366,0.010053,...,0.082893,-0.036078,0.175063,0.016200,-0.023232,0.108028,-0.036489,-0.116640,0.057516,0.114572
5,0.104379,-0.023697,0.264257,-0.086480,0.137545,-0.042675,0.050523,-0.112830,-0.481301,0.065609,...,-0.119488,-0.002311,0.079825,0.349534,0.010101,-0.025305,-0.046013,-0.016640,0.001961,-0.185428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.104379,-0.041240,0.235685,-0.128147,0.293795,-0.216009,0.033856,-0.112830,-0.147967,-0.156613,...,-0.111552,0.040113,0.286174,0.016200,0.176768,-0.011972,-0.023156,0.145265,0.113072,0.234572
197,-0.028955,-0.178083,0.002352,-0.086480,0.012545,0.077325,0.233856,-0.052830,-0.189634,-0.073280,...,-0.018959,-0.036078,-0.491604,-0.150466,-0.267677,-0.025305,0.163511,-0.026164,-0.081373,0.057429
198,-0.055621,-0.093872,0.073781,-0.074576,0.125045,0.050658,0.000523,0.187170,0.074255,0.093387,...,0.092152,-0.020493,0.286174,0.057867,-0.073232,-0.025305,-0.036489,0.116693,-0.164706,0.028858
199,0.004379,0.116654,0.002352,0.163520,-0.389241,-0.032675,0.300523,-0.212830,-0.259078,0.010053,...,-0.000441,-0.043221,0.008396,0.182867,0.051768,-0.058638,0.011130,-0.133307,0.040422,0.028858


In [12]:
# Step 4: Compute User to User Similarities
# Cosine similaritis over here will calculate over row wise means User to User 
similarity_matrix = cosine_similarity(ratings_std) 

# or if you wanted you can do Vehicles to Vehicles similarites for that you have to transpose
vehicles_similarity_matrix = cosine_similarity(ratings_std.T)

In [13]:
similarity_matrix.shape

(200, 200)

In [14]:
similarity_matrix

array([[ 1.        ,  0.16398542, -0.06160727, ...,  0.01284502,
        -0.02617014,  0.0068169 ],
       [ 0.16398542,  1.        , -0.05858037, ...,  0.06718185,
        -0.02627877,  0.13205168],
       [-0.06160727, -0.05858037,  1.        , ...,  0.03877683,
        -0.11480534, -0.04122942],
       ...,
       [ 0.01284502,  0.06718185,  0.03877683, ...,  1.        ,
        -0.18177214,  0.04045485],
       [-0.02617014, -0.02627877, -0.11480534, ..., -0.18177214,
         1.        , -0.00164825],
       [ 0.0068169 ,  0.13205168, -0.04122942, ...,  0.04045485,
        -0.00164825,  1.        ]])

In [15]:
vehicles_similarity_matrix.shape

(103, 103)

In [16]:
vehicles_similarity_matrix

array([[ 1.        ,  0.19835287,  0.10797364, ..., -0.04406638,
         0.12597894,  0.02520853],
       [ 0.19835287,  1.        , -0.02398161, ..., -0.01755617,
         0.02922688, -0.01631652],
       [ 0.10797364, -0.02398161,  1.        , ...,  0.1303837 ,
         0.06423217, -0.00981477],
       ...,
       [-0.04406638, -0.01755617,  0.1303837 , ...,  1.        ,
         0.07001278,  0.0247252 ],
       [ 0.12597894,  0.02922688,  0.06423217, ...,  0.07001278,
         1.        ,  0.07561356],
       [ 0.02520853, -0.01631652, -0.00981477, ...,  0.0247252 ,
         0.07561356,  1.        ]])

In [17]:
# Creating dataframe out of numpy array from vehicles_similarity_matrix so that we can use mor easily
vehicles_similarity_matrix_df = pd.DataFrame(vehicles_similarity_matrix, index=ratings_std.columns, columns=ratings_std.columns)
vehicles_similarity_matrix_df
# Here '65dda7765e43b9d7d3e4f4bf' similar to '65dda7765e43b9d7d3e4f4bf' with 100%
# '65dda7765e43b9d7d3e4f4bf' similar to '65dda7775e43b9d7d3e4f4d3' with 97%
# Explanation: https://youtu.be/3ecNC-So0r4?t=1072

vehicle_id,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f4e6,65dda7775e43b9d7d3e4f4ea,65dda7775e43b9d7d3e4f4ef,65dda7775e43b9d7d3e4f4fb,65dda7775e43b9d7d3e4f513,65dda7775e43b9d7d3e4f529,65dda7775e43b9d7d3e4f534,...,65dda7845e43b9d7d3e4fe17,65dda7845e43b9d7d3e4fe34,65dda7855e43b9d7d3e4fe52,65dda7855e43b9d7d3e4fe6f,65dda7855e43b9d7d3e4fe8c,65dda7855e43b9d7d3e4fead,65dda7855e43b9d7d3e4fecb,65dda7855e43b9d7d3e4fee8,65dda7865e43b9d7d3e4ff06,65dda7865e43b9d7d3e4ff24
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65dda7755e43b9d7d3e4f3f9,1.000000,0.198353,0.107974,0.050121,-0.129653,0.026379,-0.005630,0.019951,-0.062673,-0.015780,...,-0.011232,0.011049,-0.067388,0.017294,0.136889,0.147143,-0.035366,-0.044066,0.125979,0.025209
65dda7765e43b9d7d3e4f4bf,0.198353,1.000000,-0.023982,-0.030713,-0.043227,0.009154,0.023939,0.010267,0.011186,0.099951,...,-0.004082,0.052049,0.077645,0.045717,0.006917,0.029482,0.029404,-0.017556,0.029227,-0.016317
65dda7775e43b9d7d3e4f4d3,0.107974,-0.023982,1.000000,0.014555,-0.000135,-0.076343,0.065037,-0.060054,-0.007134,0.013093,...,-0.017202,0.004370,0.029571,0.050792,0.062098,-0.149178,0.087125,0.130384,0.064232,-0.009815
65dda7775e43b9d7d3e4f4e6,0.050121,-0.030713,0.014555,1.000000,-0.113590,-0.067225,0.071979,0.020889,-0.108720,0.004932,...,-0.006995,0.031617,-0.059785,0.102149,0.110645,-0.014905,-0.014105,0.128684,0.180413,0.169692
65dda7775e43b9d7d3e4f4ea,-0.129653,-0.043227,-0.000135,-0.113590,1.000000,0.077221,0.031586,-0.004886,-0.094156,-0.071225,...,-0.104831,-0.123031,0.135604,-0.039622,-0.022971,0.028613,-0.033465,0.106229,-0.098971,-0.004707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65dda7855e43b9d7d3e4fead,0.147143,0.029482,-0.149178,-0.014905,0.028613,-0.002831,-0.086876,-0.010508,0.033319,0.042402,...,-0.084111,-0.002555,0.033203,-0.121380,0.068136,1.000000,-0.025963,-0.035056,0.004803,-0.139972
65dda7855e43b9d7d3e4fecb,-0.035366,0.029404,0.087125,-0.014105,-0.033465,-0.097658,-0.005426,0.024526,0.053337,-0.024528,...,0.004608,0.051395,-0.131844,0.010030,-0.074501,-0.025963,1.000000,-0.123420,0.041274,0.033676
65dda7855e43b9d7d3e4fee8,-0.044066,-0.017556,0.130384,0.128684,0.106229,0.012760,0.084639,-0.029640,-0.017982,-0.139382,...,-0.069710,0.018426,-0.037999,-0.081986,0.027393,-0.035056,-0.123420,1.000000,0.070013,0.024725
65dda7865e43b9d7d3e4ff06,0.125979,0.029227,0.064232,0.180413,-0.098971,-0.004735,-0.108877,-0.110867,0.051255,-0.043850,...,0.074542,0.015144,-0.049913,0.048253,0.100660,0.004803,0.041274,0.070013,1.000000,0.075614


In [18]:
# NOTE: bellow here are all the code explanation that is done on 'get_recommendations' function
# filters the DataFrame searched_vehicles to get only the rows where the user_id matches the given user_id 8. It extracts all reviews made by the user.
user_id = 8
user_reviews = vehicles_with_reviews[vehicles_with_reviews['user_id'] == user_id]
user_reviews

Unnamed: 0,review_id,vehicle_id,user_id,rating,travel_id,departure_at,from,to
198,65dda7bea276ad551e8d5675,65dda77d5e43b9d7d3e4f977,8,2,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
217,65dda7bea276ad551e8d5d09,65dda77d5e43b9d7d3e4f977,8,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
766,65dda7c3a276ad551e8e3963,65dda77d5e43b9d7d3e4f977,8,4,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1001,65dda7c5a276ad551e8e9ac3,65dda77d5e43b9d7d3e4f977,8,2,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1033,65dda7c5a276ad551e8eaf08,65dda77d5e43b9d7d3e4f977,8,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
...,...,...,...,...,...,...,...,...
119070,65dda7bea276ad551e8d71d4,65dda7835e43b9d7d3e4fd25,8,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119127,65dda7c0a276ad551e8d899d,65dda7835e43b9d7d3e4fd25,8,5,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119320,65dda7c0a276ad551e8dd654,65dda7835e43b9d7d3e4fd25,8,2,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119392,65dda7c3a276ad551e8dedfc,65dda7835e43b9d7d3e4fd25,8,4,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


In [19]:
# This line creates a Pandas Series called user_ratings. 
# It extracts the rating values from the user_reviews DataFrame and sets the vehicle_id values as the index of the Series. 
# This Series represents the ratings given by the user to different vehicles.
user_ratings = pd.Series(user_reviews.rating.values, index=user_reviews.vehicle_id.values)
# here user could have rated the same vehicle multiple times so, we will going to take an average of that
user_ratings = user_reviews.groupby('vehicle_id')['rating'].mean()
user_ratings.shape

(102,)

In [20]:
# geting round values to convert it into integer
# user_ratings = user_ratings.round().astype(int)
user_ratings

vehicle_id
65dda7755e43b9d7d3e4f3f9    3.625000
65dda7765e43b9d7d3e4f4bf    3.600000
65dda7775e43b9d7d3e4f4d3    3.166667
65dda7775e43b9d7d3e4f4e6    3.250000
65dda7775e43b9d7d3e4f4ea    3.400000
                              ...   
65dda7855e43b9d7d3e4fead    2.000000
65dda7855e43b9d7d3e4fecb    3.666667
65dda7855e43b9d7d3e4fee8    3.600000
65dda7865e43b9d7d3e4ff06    3.600000
65dda7865e43b9d7d3e4ff24    5.000000
Name: rating, Length: 102, dtype: float64

In [21]:
## Let's make recommendations
# This function taks vehicle_id & the rating that the auth user ex '8' have given on the vehicle_id on the past
def get_similar_vehicles(vehicle_id,user_rating):
    # let's get row of datafrom from 'vehicles_similarity_matrix_df' using the index 'vehicle_id' 
    # which means get row from the vehicle which user have already rated
    similar_score = vehicles_similarity_matrix_df[vehicle_id]
    # Now scale it by the rating that the user have given for that particular vehicle
#     similar_score = similar_score*user_rating
#     now arranging it in decending order
    similar_score = similar_score*(user_rating-2.5) # explanation is bellow next cell from this cell
    similar_score = similar_score.sort_values(ascending=False)
    # This method will return a similarity score for all the vehicles that is similar to the given vehicle 'vehicle_id'
    return similar_score



In [22]:
# Testing for the first user rated vehicle
first_vehicle_id = user_ratings.index[0]
first_vehicle_rating = user_ratings.iloc[0]


get_similar_vehicles(first_vehicle_id, first_vehicle_rating)
# Let's say that user give rating 1 to vehicle 'first_vehicle_id' 
# Explanation: https://youtu.be/3ecNC-So0r4?t=1225
# If the user have rated the vehicles bad then we want all other similar vehicles to go down in the list
# and other vehicle that is not similar to this vehicle to come at the top of the list
# we can achieve this in above function like this:
# similar_score = similar_score*(user_rating-2.5)
get_similar_vehicles(first_vehicle_id, 1)

vehicle_id
65dda7775e43b9d7d3e4f54d    0.286131
65dda7785e43b9d7d3e4f5cd    0.275026
65dda7775e43b9d7d3e4f53f    0.254408
65dda77a5e43b9d7d3e4f712    0.243421
65dda77c5e43b9d7d3e4f86f    0.202321
                              ...   
65dda7805e43b9d7d3e4fb98   -0.227551
65dda77e5e43b9d7d3e4f9c1   -0.251646
65dda7765e43b9d7d3e4f4bf   -0.297529
65dda77c5e43b9d7d3e4f87f   -0.379955
65dda7755e43b9d7d3e4f3f9   -1.500000
Name: 65dda7755e43b9d7d3e4f3f9, Length: 103, dtype: float64

In [23]:
# converting 'user_ratings' into list of touples
user_ratings_list = list(user_ratings.items())
user_ratings_list

[('65dda7755e43b9d7d3e4f3f9', 3.625),
 ('65dda7765e43b9d7d3e4f4bf', 3.6),
 ('65dda7775e43b9d7d3e4f4d3', 3.1666666666666665),
 ('65dda7775e43b9d7d3e4f4e6', 3.25),
 ('65dda7775e43b9d7d3e4f4ea', 3.4),
 ('65dda7775e43b9d7d3e4f4ef', 3.3333333333333335),
 ('65dda7775e43b9d7d3e4f4fb', 4.0),
 ('65dda7775e43b9d7d3e4f513', 3.090909090909091),
 ('65dda7775e43b9d7d3e4f529', 3.125),
 ('65dda7775e43b9d7d3e4f534', 3.8),
 ('65dda7775e43b9d7d3e4f53f', 3.8333333333333335),
 ('65dda7775e43b9d7d3e4f54d', 3.0833333333333335),
 ('65dda7775e43b9d7d3e4f557', 3.6666666666666665),
 ('65dda7775e43b9d7d3e4f569', 3.857142857142857),
 ('65dda7775e43b9d7d3e4f570', 3.0),
 ('65dda7785e43b9d7d3e4f583', 3.0),
 ('65dda7785e43b9d7d3e4f58b', 3.25),
 ('65dda7785e43b9d7d3e4f5a7', 3.375),
 ('65dda7785e43b9d7d3e4f5c2', 3.6),
 ('65dda7785e43b9d7d3e4f5cd', 3.4444444444444446),
 ('65dda7785e43b9d7d3e4f5df', 2.5714285714285716),
 ('65dda7785e43b9d7d3e4f5e1', 4.166666666666667),
 ('65dda7785e43b9d7d3e4f5ff', 2.0),
 ('65dda7785e43b9

In [24]:
# Now we now all the vehicle that auth user have rated is stored on 'user_ratings'

similar_vehicles_df = pd.DataFrame()

# Initialize a list to store the DataFrames for each vehicle
dfs = []

# Now we will going to get similarities from every vehicle that user have rated to every other vehicles
for vehicle_id, rating in user_ratings_list:
    # Get similarity DataFrame for the current vehicle and rating
    similarity_df = get_similar_vehicles(vehicle_id, rating)
    # Set the column name to the vehicle_id
    similarity_df.columns = [vehicle_id]
    # Append the DataFrame to the list
    dfs.append(similarity_df)
    
# Concatenate all DataFrames in the list along axis 1
similar_vehicles_df = pd.concat(dfs, axis=1)

# Set the index to be the default integer index
similar_vehicles_df.reset_index(drop=True, inplace=True)
# Each row in this dataframe for each 'vehicle_id' that the user has rated
# And the value in each column for vehicle id is the similarity score for each row index 'vehicle_id'
# It means that every value in this dataframe are the similarity value for each vehicle that user have rated on
similar_vehicles_df

Unnamed: 0,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f4e6,65dda7775e43b9d7d3e4f4ea,65dda7775e43b9d7d3e4f4ef,65dda7775e43b9d7d3e4f4fb,65dda7775e43b9d7d3e4f513,65dda7775e43b9d7d3e4f529,65dda7775e43b9d7d3e4f534,...,65dda7845e43b9d7d3e4fe17,65dda7845e43b9d7d3e4fe34,65dda7855e43b9d7d3e4fe52,65dda7855e43b9d7d3e4fe6f,65dda7855e43b9d7d3e4fe8c,65dda7855e43b9d7d3e4fead,65dda7855e43b9d7d3e4fecb,65dda7855e43b9d7d3e4fee8,65dda7865e43b9d7d3e4ff06,65dda7865e43b9d7d3e4ff24
0,1.125000,0.218188,0.071982,0.037591,-0.116687,0.021982,-0.008445,0.011789,-0.039170,-0.020514,...,-0.0,0.007734,-0.067388,0.025940,0.228149,-0.073572,-0.041261,-0.048473,0.138577,0.063021
1,0.284966,0.055004,0.006568,0.104794,-0.048654,-0.077748,0.120483,-0.054963,0.002620,0.019982,...,0.0,0.009011,-0.160526,0.069529,0.031838,0.031000,0.078036,-0.054247,0.071423,-0.036397
2,0.223147,1.100000,-0.015988,-0.023035,-0.038904,0.007628,0.035908,0.006067,0.006991,0.129936,...,-0.0,0.036434,0.077645,0.068576,0.011528,-0.014741,0.034305,-0.019312,0.032150,-0.040791
3,0.188735,0.021407,-0.005694,-0.058181,0.077493,-0.007479,-0.076087,0.052181,-0.025835,-0.097460,...,-0.0,0.004146,-0.060834,-0.162029,-0.017885,-0.102145,0.026933,-0.028583,-0.057523,0.129488
4,0.170663,-0.171014,-0.006160,-0.012769,0.072562,-0.113372,-0.013813,-0.035699,0.013906,-0.062731,...,-0.0,-0.049972,-0.081203,-0.159653,0.064822,-0.061492,-0.028548,0.003250,0.162655,0.326340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,-0.151741,-0.173085,-0.039935,0.064788,0.061804,0.029908,-0.008297,-0.016793,-0.065116,-0.135664,...,-0.0,0.076944,0.043314,0.057912,-0.074540,0.006973,-0.000505,0.054050,0.017887,0.205992
99,-0.182566,-0.048828,-0.052965,-0.002973,-0.059608,-0.050359,0.013814,-0.037907,-0.005116,0.055964,...,0.0,-0.040263,0.007188,0.093861,0.049957,-0.034674,0.065360,-0.061153,0.045839,-0.017223
100,-0.190806,-0.096116,-0.001108,-0.000771,-0.147479,0.053627,-0.033592,-0.044664,0.048811,-0.073863,...,-0.0,0.028454,-0.057930,0.040419,-0.077382,-0.009010,0.055534,0.056669,0.016202,0.012257
101,-0.206270,-0.153101,-0.076187,0.036238,-0.051041,-0.134165,-0.066617,0.058577,-0.028768,0.093194,...,0.0,0.004148,-0.024131,-0.099462,-0.021410,-0.034685,0.040166,-0.031915,0.096897,0.059055


In [25]:
# Now let's sum all the vehicle id values row wise and sort them in decending order
similar_vehicles = similar_vehicles_df.sum().sort_values(ascending=False)
similar_vehicles

65dda7835e43b9d7d3e4fd28    4.404907
65dda7865e43b9d7d3e4ff24    3.884208
65dda7855e43b9d7d3e4fe8c    3.490505
65dda77e5e43b9d7d3e4f9e4    3.086571
65dda7755e43b9d7d3e4f3f9    3.080462
                              ...   
65dda7805e43b9d7d3e4fb02   -0.220296
65dda7795e43b9d7d3e4f684   -0.347906
65dda7785e43b9d7d3e4f5ff   -0.723686
65dda7855e43b9d7d3e4fead   -0.863637
65dda7825e43b9d7d3e4fc48   -0.908247
Length: 102, dtype: float64

In [26]:
recommended_vehicles = similar_vehicles.index.tolist()

In [27]:
recommended_vehicles[:25]

['65dda7835e43b9d7d3e4fd28',
 '65dda7865e43b9d7d3e4ff24',
 '65dda7855e43b9d7d3e4fe8c',
 '65dda77e5e43b9d7d3e4f9e4',
 '65dda7755e43b9d7d3e4f3f9',
 '65dda77e5e43b9d7d3e4fa06',
 '65dda77d5e43b9d7d3e4f952',
 '65dda7835e43b9d7d3e4fd45',
 '65dda77a5e43b9d7d3e4f785',
 '65dda7845e43b9d7d3e4fdba',
 '65dda7815e43b9d7d3e4fbbd',
 '65dda7845e43b9d7d3e4fdfb',
 '65dda7855e43b9d7d3e4fe6f',
 '65dda77c5e43b9d7d3e4f84e',
 '65dda7785e43b9d7d3e4f5e1',
 '65dda77d5e43b9d7d3e4f92e',
 '65dda77d5e43b9d7d3e4f8e8',
 '65dda7825e43b9d7d3e4fcc7',
 '65dda7865e43b9d7d3e4ff06',
 '65dda7805e43b9d7d3e4fb73',
 '65dda7785e43b9d7d3e4f61f',
 '65dda77f5e43b9d7d3e4fa94',
 '65dda77a5e43b9d7d3e4f708',
 '65dda7825e43b9d7d3e4fc89',
 '65dda77b5e43b9d7d3e4f7cf']

In [28]:
# NOTE: Issues that we could face on Collaborative Filtering
# 1. Handling Unknown Users / Vehicles (Cold Start Problem) where we don't know any context about it for rignt now
# 2. Scalalability: Takes havy performance to perform this model
# 3. Dynamic Update: When ever we will add new vehicle or User or Rating Data Then we again have to re build this model
# For that we can create a Hybrid recommendation system to handle this challanges
# Also we can use Papularity based filtering

In [29]:
# Now we will export all the trained dataset into pkl files
import pickle
pickle.dump(vehicles_with_reviews, open("vehicles_with_reviews.pkl", 'wb'))

# Popularity Based Recommended System

In [None]:
# Check whether given user id exist or not or is it None value if not then use Popularity Based recommendation
anonymouse_user_id = None
anonymouse_user_id = 430
anonymous_user_reviews = searched_vehicles[searched_vehicles['user_id'] == anonymouse_user_id]
anonymous_user_reviews.count()["rating"]

In [41]:
# Find rating coung of every vehicles
vehicles_with_ratings_df = vehicles_with_reviews.groupby("vehicle_id").count()["rating"].reset_index()
vehicles_with_ratings_df.rename(columns={"rating":"no_of_rating"}, inplace=True)
vehicles_with_ratings_df

Unnamed: 0,vehicle_id,no_of_rating
0,65dda7755e43b9d7d3e4f3f9,1221
1,65dda7765e43b9d7d3e4f4bf,1176
2,65dda7775e43b9d7d3e4f4d3,1193
3,65dda7775e43b9d7d3e4f4e6,1146
4,65dda7775e43b9d7d3e4f4ea,1122
...,...,...
98,65dda7855e43b9d7d3e4fead,1194
99,65dda7855e43b9d7d3e4fecb,1157
100,65dda7855e43b9d7d3e4fee8,1183
101,65dda7865e43b9d7d3e4ff06,1238


In [42]:
# Find Average rating of every vehicles
average_rating_df = vehicles_with_reviews.groupby("vehicle_id")["rating"].mean().reset_index()
average_rating_df.rename(columns={"rating":"avg_rating"}, inplace=True)
average_rating_df

Unnamed: 0,vehicle_id,avg_rating
0,65dda7755e43b9d7d3e4f3f9,3.486486
1,65dda7765e43b9d7d3e4f4bf,3.460034
2,65dda7775e43b9d7d3e4f4d3,3.516345
3,65dda7775e43b9d7d3e4f4e6,3.511344
4,65dda7775e43b9d7d3e4f4ea,3.482175
...,...,...
98,65dda7855e43b9d7d3e4fead,3.489950
99,65dda7855e43b9d7d3e4fecb,3.532411
100,65dda7855e43b9d7d3e4fee8,3.441251
101,65dda7865e43b9d7d3e4ff06,3.508078


In [43]:
# Merge average rating & rating count
popular_df = vehicles_with_ratings_df.merge(average_rating_df,on='vehicle_id')
popular_df

Unnamed: 0,vehicle_id,no_of_rating,avg_rating
0,65dda7755e43b9d7d3e4f3f9,1221,3.486486
1,65dda7765e43b9d7d3e4f4bf,1176,3.460034
2,65dda7775e43b9d7d3e4f4d3,1193,3.516345
3,65dda7775e43b9d7d3e4f4e6,1146,3.511344
4,65dda7775e43b9d7d3e4f4ea,1122,3.482175
...,...,...,...
98,65dda7855e43b9d7d3e4fead,1194,3.489950
99,65dda7855e43b9d7d3e4fecb,1157,3.532411
100,65dda7855e43b9d7d3e4fee8,1183,3.441251
101,65dda7865e43b9d7d3e4ff06,1238,3.508078


In [49]:
# First sort with 'no_of_rating' and only get 30 or them
popular_df = popular_df.sort_values('no_of_rating',ascending=False).head(30)
popular_df.shape

(30, 3)

In [52]:
# Now sort with 'avg_rating'
popular_df = popular_df.sort_values('avg_rating',ascending=False)
popular_df

Unnamed: 0,vehicle_id,no_of_rating,avg_rating
28,65dda7795e43b9d7d3e4f661,1236,3.536408
62,65dda77e5e43b9d7d3e4fa29,1204,3.524917
76,65dda7815e43b9d7d3e4fc26,1188,3.521044
84,65dda7835e43b9d7d3e4fd25,1198,3.520868
44,65dda77b5e43b9d7d3e4f817,1237,3.520614
83,65dda7835e43b9d7d3e4fd04,1195,3.518828
2,65dda7775e43b9d7d3e4f4d3,1193,3.516345
25,65dda7795e43b9d7d3e4f63b,1191,3.512175
47,65dda77c5e43b9d7d3e4f85c,1229,3.511798
101,65dda7865e43b9d7d3e4ff06,1238,3.508078


In [57]:
popular_vehicles = popular_df["vehicle_id"]
popular_vehicles = popular_vehicles.values.tolist()

In [58]:
popular_vehicles

['65dda7795e43b9d7d3e4f661',
 '65dda77e5e43b9d7d3e4fa29',
 '65dda7815e43b9d7d3e4fc26',
 '65dda7835e43b9d7d3e4fd25',
 '65dda77b5e43b9d7d3e4f817',
 '65dda7835e43b9d7d3e4fd04',
 '65dda7775e43b9d7d3e4f4d3',
 '65dda7795e43b9d7d3e4f63b',
 '65dda77c5e43b9d7d3e4f85c',
 '65dda7865e43b9d7d3e4ff06',
 '65dda77c5e43b9d7d3e4f87f',
 '65dda7805e43b9d7d3e4fb98',
 '65dda7825e43b9d7d3e4fcc7',
 '65dda77a5e43b9d7d3e4f708',
 '65dda77f5e43b9d7d3e4fade',
 '65dda7785e43b9d7d3e4f583',
 '65dda7815e43b9d7d3e4fbbd',
 '65dda77d5e43b9d7d3e4f94f',
 '65dda7855e43b9d7d3e4fe8c',
 '65dda7855e43b9d7d3e4fead',
 '65dda7755e43b9d7d3e4f3f9',
 '65dda7775e43b9d7d3e4f534',
 '65dda7835e43b9d7d3e4fd7f',
 '65dda7785e43b9d7d3e4f5c2',
 '65dda7775e43b9d7d3e4f557',
 '65dda7795e43b9d7d3e4f6a7',
 '65dda77f5e43b9d7d3e4fab9',
 '65dda77a5e43b9d7d3e4f739',
 '65dda7785e43b9d7d3e4f58b',
 '65dda7825e43b9d7d3e4fca6']