In [211]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [212]:
# Step 1: Load the Data
vehicles_df = pd.read_csv('travels.csv')
reviews_df = pd.read_csv('reviews.csv')

In [213]:
vehicles_df.head()

Unnamed: 0,vehicle_id,travel_id,departure_at,from,to
0,65dda7755e43b9d7d3e4f3f9,65dda7765e43b9d7d3e4f48e,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1,65dda7765e43b9d7d3e4f4bf,65dda7795e43b9d7d3e4f67d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
2,65dda7775e43b9d7d3e4f4e6,65dda77a5e43b9d7d3e4f6df,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
3,65dda7775e43b9d7d3e4f529,65dda77a5e43b9d7d3e4f74b,2024-03-03,"Pathri, Morong","Koteshowr, Kathmandu"
4,65dda7775e43b9d7d3e4f534,65dda77a5e43b9d7d3e4f753,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"


In [214]:
reviews_df.rename(columns={"id":'review_id'},inplace=True)

In [215]:
reviews_df.head()

Unnamed: 0,review_id,vehicle_id,user_id,rating
0,65dda7bea276ad551e8d07b2,65dda77d5e43b9d7d3e4f977,91,5
1,65dda7bea276ad551e8d07b3,65dda77f5e43b9d7d3e4fa4d,196,4
2,65dda7bea276ad551e8d07b4,65dda7855e43b9d7d3e4fe8c,180,3
3,65dda7bea276ad551e8d07b5,65dda7795e43b9d7d3e4f6a7,136,4
4,65dda7bea276ad551e8d07b6,65dda77c5e43b9d7d3e4f87f,141,3


In [216]:
print(reviews_df.shape)
print(vehicles_df.shape)

(120000, 4)
(103, 5)


In [217]:
# Step 2: Preprocess the Data
vehicles_with_reviews = pd.merge(reviews_df, vehicles_df, on='vehicle_id')

In [218]:
vehicles_with_reviews

Unnamed: 0,review_id,vehicle_id,user_id,rating,travel_id,departure_at,from,to
0,65dda7bea276ad551e8d07b2,65dda77d5e43b9d7d3e4f977,91,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
1,65dda7bea276ad551e8d07ff,65dda77d5e43b9d7d3e4f977,54,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
2,65dda7bea276ad551e8d0806,65dda77d5e43b9d7d3e4f977,154,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
3,65dda7bea276ad551e8d090c,65dda77d5e43b9d7d3e4f977,55,3,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
4,65dda7bea276ad551e8d0926,65dda77d5e43b9d7d3e4f977,192,5,65dda7815e43b9d7d3e4fbe6,2024-03-03,"Biratnagar, Sunsari","Kerkha, Jhapa"
...,...,...,...,...,...,...,...,...
119995,65dda7c7a276ad551e8ed8b7,65dda7835e43b9d7d3e4fd25,45,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119996,65dda7c7a276ad551e8ed9a8,65dda7835e43b9d7d3e4fd25,71,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119997,65dda7c7a276ad551e8ed9bb,65dda7835e43b9d7d3e4fd25,147,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119998,65dda7c7a276ad551e8edbcd,65dda7835e43b9d7d3e4fd25,2,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


In [219]:
# Step 3: Filter Relevant Vehicles
def filter_vehicles(data, from_location, to_location, departure_at):
    filtered_data = data[(data['from'] == from_location) & (data['to'] == to_location) & (data['departure_at'] == departure_at)]
    return filtered_data

searched_vehicles_just_from_vehicles_dataset = filter_vehicles(vehicles_df, "Pokhara, Kaski", "Pathri, Morong", "2024-03-03")
searched_vehicles = filter_vehicles(vehicles_with_reviews, "Pokhara, Kaski", "Pathri, Morong", "2024-03-03")

In [220]:
searched_vehicles_just_from_vehicles_dataset

Unnamed: 0,vehicle_id,travel_id,departure_at,from,to
1,65dda7765e43b9d7d3e4f4bf,65dda7795e43b9d7d3e4f67d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
6,65dda7775e43b9d7d3e4f54d,65dda77a5e43b9d7d3e4f778,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
12,65dda7785e43b9d7d3e4f5cd,65dda77b5e43b9d7d3e4f82a,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
14,65dda7785e43b9d7d3e4f5e1,65dda77c5e43b9d7d3e4f843,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
15,65dda7785e43b9d7d3e4f610,65dda77c5e43b9d7d3e4f871,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
18,65dda7795e43b9d7d3e4f661,65dda77c5e43b9d7d3e4f8d5,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
21,65dda7775e43b9d7d3e4f4d3,65dda77d5e43b9d7d3e4f971,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
22,65dda77a5e43b9d7d3e4f712,65dda77d5e43b9d7d3e4f988,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
29,65dda7775e43b9d7d3e4f570,65dda77f5e43b9d7d3e4fa87,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
38,65dda77d5e43b9d7d3e4f952,65dda7815e43b9d7d3e4fbc7,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


In [221]:
searched_vehicles

Unnamed: 0,review_id,vehicle_id,user_id,rating,travel_id,departure_at,from,to
1138,65dda7bea276ad551e8d07b3,65dda77f5e43b9d7d3e4fa4d,196,4,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1139,65dda7bea276ad551e8d07f3,65dda77f5e43b9d7d3e4fa4d,101,4,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1140,65dda7bea276ad551e8d0873,65dda77f5e43b9d7d3e4fa4d,181,3,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1141,65dda7bea276ad551e8d08ae,65dda77f5e43b9d7d3e4fa4d,104,5,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1142,65dda7bea276ad551e8d08c8,65dda77f5e43b9d7d3e4fa4d,31,5,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
...,...,...,...,...,...,...,...,...
119995,65dda7c7a276ad551e8ed8b7,65dda7835e43b9d7d3e4fd25,45,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119996,65dda7c7a276ad551e8ed9a8,65dda7835e43b9d7d3e4fd25,71,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119997,65dda7c7a276ad551e8ed9bb,65dda7835e43b9d7d3e4fd25,147,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119998,65dda7c7a276ad551e8edbcd,65dda7835e43b9d7d3e4fd25,2,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


In [222]:
# Step 4: Generate Pivot table matrix
user_ratings = searched_vehicles.pivot_table(index='user_id', columns='vehicle_id', values='rating').fillna(0)

In [223]:
user_ratings

vehicle_id,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f54d,65dda7775e43b9d7d3e4f570,65dda7785e43b9d7d3e4f5cd,65dda7785e43b9d7d3e4f5e1,65dda7785e43b9d7d3e4f610,65dda7795e43b9d7d3e4f661,65dda77a5e43b9d7d3e4f712,65dda77b5e43b9d7d3e4f7aa,...,65dda77d5e43b9d7d3e4f952,65dda77e5e43b9d7d3e4f9e4,65dda77f5e43b9d7d3e4fa4d,65dda77f5e43b9d7d3e4fade,65dda7805e43b9d7d3e4fb2a,65dda7835e43b9d7d3e4fd25,65dda7845e43b9d7d3e4fd9d,65dda7845e43b9d7d3e4fdba,65dda7855e43b9d7d3e4fe52,65dda7865e43b9d7d3e4ff06
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.333333,3.500000,2.888889,4.000000,3.444444,3.250000,3.400000,3.200000,4.666667,3.000000,...,3.750000,3.250000,3.666667,3.750000,4.000000,3.200000,4.500000,3.666667,3.200000,3.500000
2,3.000000,3.444444,3.500000,4.500000,4.333333,3.000000,3.777778,3.000000,3.333333,2.600000,...,3.600000,3.375000,3.714286,4.000000,3.000000,3.888889,2.600000,4.000000,3.571429,3.545455
3,3.285714,3.666667,4.666667,3.400000,3.500000,4.250000,3.666667,3.428571,3.333333,4.100000,...,2.833333,3.750000,4.333333,3.666667,3.555556,3.250000,3.000000,3.000000,3.750000,4.000000
4,3.384615,3.833333,3.571429,3.166667,3.800000,3.500000,4.666667,3.250000,4.333333,3.428571,...,3.428571,4.000000,3.400000,3.500000,3.300000,3.000000,3.200000,3.750000,4.000000,3.666667
5,3.333333,4.285714,2.750000,2.500000,3.500000,3.333333,3.500000,4.571429,3.142857,4.000000,...,3.500000,3.000000,3.100000,4.000000,3.571429,3.888889,3.571429,2.666667,3.714286,3.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,3.250000,4.200000,2.500000,3.333333,3.666667,3.181818,4.000000,3.666667,3.000000,3.800000,...,3.000000,4.000000,3.285714,5.000000,3.538462,4.500000,3.400000,3.500000,4.333333,3.833333
197,2.600000,3.500000,3.777778,3.857143,3.600000,3.000000,2.888889,4.666667,3.333333,3.833333,...,3.000000,3.428571,2.000000,3.714286,3.000000,3.500000,3.571429,4.750000,2.000000,3.250000
198,3.000000,3.714286,3.285714,3.600000,3.200000,3.111111,3.000000,3.750000,3.444444,3.400000,...,3.400000,4.285714,3.444444,3.888889,2.400000,3.777778,4.000000,3.500000,4.333333,3.000000
199,4.000000,3.500000,2.666667,3.555556,3.600000,2.000000,3.800000,3.285714,4.500000,3.333333,...,4.250000,3.833333,3.125000,3.500000,3.750000,3.222222,4.000000,3.000000,3.500000,3.615385


In [224]:
# Explanation: https://youtu.be/3ecNC-So0r4?t=785
# Let's say that auth user have not given rating to some vehicle and it doesn't mean that user have gave 0 start rating to it
# So we will create method which will standarized the ratings given by all the users
def standardize(row):
    # takes single rows from dataframe as argument
    # now create a new row from (original rating minus(-) mean of all the ratings & divide it by the range of ratings that the user gives)
    new_row = (row - row.mean()) / (row.max() - row.min()) 
    return new_row

ratings_std = user_ratings.apply(standardize)
ratings_std

vehicle_id,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f54d,65dda7775e43b9d7d3e4f570,65dda7785e43b9d7d3e4f5cd,65dda7785e43b9d7d3e4f5e1,65dda7785e43b9d7d3e4f610,65dda7795e43b9d7d3e4f661,65dda77a5e43b9d7d3e4f712,65dda77b5e43b9d7d3e4f7aa,...,65dda77d5e43b9d7d3e4f952,65dda77e5e43b9d7d3e4f9e4,65dda77f5e43b9d7d3e4fa4d,65dda77f5e43b9d7d3e4fade,65dda7805e43b9d7d3e4fb2a,65dda7835e43b9d7d3e4fd25,65dda7845e43b9d7d3e4fd9d,65dda7845e43b9d7d3e4fdba,65dda7855e43b9d7d3e4fe52,65dda7865e43b9d7d3e4ff06
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.023697,0.002352,-0.201131,0.189809,-0.003636,-0.081120,-0.039153,-0.111128,0.373670,-0.093285,...,0.121297,-0.089624,0.049968,0.078306,0.167364,-0.109431,0.188882,0.039559,-0.091604,0.001961
2,-0.093872,-0.016166,0.002573,0.356475,0.292660,-0.164453,0.086772,-0.177795,-0.070774,-0.173285,...,0.071297,-0.041013,0.065841,0.161639,-0.165969,0.136601,-0.191118,0.150670,0.032206,0.017112
3,-0.033722,0.057908,0.391462,-0.010191,0.014882,0.252214,0.049735,-0.034937,-0.070774,0.126715,...,-0.184259,0.104820,0.272190,0.050528,0.019216,-0.091574,-0.111118,-0.182663,0.091729,0.168627
4,-0.012900,0.113463,0.026382,-0.087969,0.114882,0.002214,0.383069,-0.094461,0.262559,-0.007570,...,0.014154,0.202042,-0.038921,-0.005028,-0.065969,-0.180859,-0.071118,0.067337,0.175063,0.057516
5,-0.023697,0.264257,-0.247427,-0.310191,0.014882,-0.053342,-0.005820,0.346015,-0.134266,0.106715,...,0.037964,-0.186847,-0.138921,0.161639,0.024507,0.136601,0.003168,-0.293774,0.079825,0.001961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-0.041240,0.235685,-0.330761,-0.032413,0.070438,-0.103847,0.160847,0.044428,-0.181885,0.066715,...,-0.128703,0.202042,-0.077017,0.494972,0.013518,0.354855,-0.031118,-0.015997,0.286174,0.113072
197,-0.178083,0.002352,0.095165,0.142190,0.048216,-0.164453,-0.209524,0.377761,-0.070774,0.073382,...,-0.128703,-0.020180,-0.505588,0.066401,-0.165969,-0.002288,0.003168,0.400670,-0.491604,-0.081373
198,-0.093872,0.073781,-0.068856,0.056475,-0.085118,-0.127416,-0.172487,0.072205,-0.033737,-0.013285,...,0.004630,0.313153,-0.024106,0.124602,-0.365969,0.096918,0.088882,-0.015997,0.286174,-0.164706
199,0.116654,0.002352,-0.275205,0.041661,0.048216,-0.497786,0.094180,-0.082556,0.318115,-0.026618,...,0.287964,0.137227,-0.130588,-0.005028,0.084031,-0.101494,0.088882,-0.182663,0.008396,0.040422


In [225]:
# Step 4: Compute User to User Similarities
# Cosine similaritis over here will calculate over row wise means User to User 
similarity_matrix = cosine_similarity(ratings_std) 

# or if you wanted you can do Vehicles to Vehicles similarites for that you have to transpose
vehicles_similarity_matrix = cosine_similarity(ratings_std.T)

In [226]:
similarity_matrix.shape

(200, 200)

In [227]:
similarity_matrix

array([[ 1.        , -0.0038642 , -0.49353057, ..., -0.28880222,
         0.44279282,  0.23348864],
       [-0.0038642 ,  1.        , -0.02586267, ...,  0.21382942,
         0.11679966,  0.40527538],
       [-0.49353057, -0.02586267,  1.        , ..., -0.00606295,
        -0.42712549, -0.3895902 ],
       ...,
       [-0.28880222,  0.21382942, -0.00606295, ...,  1.        ,
         0.15455422,  0.15279719],
       [ 0.44279282,  0.11679966, -0.42712549, ...,  0.15455422,
         1.        ,  0.39213775],
       [ 0.23348864,  0.40527538, -0.3895902 , ...,  0.15279719,
         0.39213775,  1.        ]])

In [228]:
vehicles_similarity_matrix.shape

(21, 21)

In [229]:
vehicles_similarity_matrix

array([[ 1.00000000e+00, -2.39816086e-02, -9.17330838e-02,
        -1.48229999e-02, -1.39182974e-01,  6.02381005e-02,
         2.81824250e-02,  9.08312786e-03, -4.43892328e-02,
         1.62398957e-02, -3.49024730e-02,  1.60441268e-01,
        -6.02939107e-02, -3.99517681e-02,  2.96227533e-03,
         7.61768784e-02, -7.51750907e-02, -1.99813424e-02,
         4.47399802e-02,  7.76452475e-02,  2.92268831e-02],
       [-2.39816086e-02,  1.00000000e+00, -7.02950336e-02,
        -4.46041993e-02, -1.14280872e-01,  1.60294464e-01,
         1.35011021e-02, -1.53423700e-01, -7.94473738e-02,
        -2.11899023e-02, -1.94608713e-02, -2.63417558e-02,
        -2.96073126e-02, -1.88647406e-02,  1.30918337e-01,
        -2.84710278e-02,  8.90114433e-02,  6.59643415e-02,
        -2.66884130e-02,  2.95714438e-02,  6.42321665e-02],
       [-9.17330838e-02, -7.02950336e-02,  1.00000000e+00,
        -7.41713682e-02,  5.39760209e-02,  8.59092130e-02,
        -3.56126491e-02, -1.47357642e-01,  3.41010635e

In [230]:
# Creating dataframe out of numpy array from vehicles_similarity_matrix so that we can use mor easily
vehicles_similarity_matrix_df = pd.DataFrame(vehicles_similarity_matrix, index=ratings_std.columns, columns=ratings_std.columns)
vehicles_similarity_matrix_df
# Here '65dda7765e43b9d7d3e4f4bf' similar to '65dda7765e43b9d7d3e4f4bf' with 100%
# '65dda7765e43b9d7d3e4f4bf' similar to '65dda7775e43b9d7d3e4f4d3' with 97%
# Explanation: https://youtu.be/3ecNC-So0r4?t=1072

vehicle_id,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f54d,65dda7775e43b9d7d3e4f570,65dda7785e43b9d7d3e4f5cd,65dda7785e43b9d7d3e4f5e1,65dda7785e43b9d7d3e4f610,65dda7795e43b9d7d3e4f661,65dda77a5e43b9d7d3e4f712,65dda77b5e43b9d7d3e4f7aa,...,65dda77d5e43b9d7d3e4f952,65dda77e5e43b9d7d3e4f9e4,65dda77f5e43b9d7d3e4fa4d,65dda77f5e43b9d7d3e4fade,65dda7805e43b9d7d3e4fb2a,65dda7835e43b9d7d3e4fd25,65dda7845e43b9d7d3e4fd9d,65dda7845e43b9d7d3e4fdba,65dda7855e43b9d7d3e4fe52,65dda7865e43b9d7d3e4ff06
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65dda7765e43b9d7d3e4f4bf,1.0,-0.023982,-0.091733,-0.014823,-0.139183,0.060238,0.028182,0.009083,-0.044389,0.01624,...,0.160441,-0.060294,-0.039952,0.002962,0.076177,-0.075175,-0.019981,0.04474,0.077645,0.029227
65dda7775e43b9d7d3e4f4d3,-0.023982,1.0,-0.070295,-0.044604,-0.114281,0.160294,0.013501,-0.153424,-0.079447,-0.02119,...,-0.026342,-0.029607,-0.018865,0.130918,-0.028471,0.089011,0.065964,-0.026688,0.029571,0.064232
65dda7775e43b9d7d3e4f54d,-0.091733,-0.070295,1.0,-0.074171,0.053976,0.085909,-0.035613,-0.147358,0.034101,-0.1,...,-0.141526,0.008205,0.109556,-0.077764,0.106607,0.075688,-0.107364,0.021554,-0.048431,-0.020774
65dda7775e43b9d7d3e4f570,-0.014823,-0.044604,-0.074171,1.0,-0.033157,0.097811,0.013773,0.030333,-0.024839,-0.069453,...,0.110569,0.092956,-0.022965,0.049377,0.077311,0.004734,-0.01637,0.021003,-0.045623,-0.087763
65dda7785e43b9d7d3e4f5cd,-0.139183,-0.114281,0.053976,-0.033157,1.0,-0.110595,0.044291,-0.020462,0.137772,-0.118315,...,0.045936,-0.031929,0.012226,-0.048062,-0.096885,0.003104,0.06626,-0.037092,-0.024131,0.088088
65dda7785e43b9d7d3e4f5e1,0.060238,0.160294,0.085909,0.097811,-0.110595,1.0,0.059096,-0.133237,-0.073474,0.115219,...,-0.167956,-0.015345,0.029963,-0.00294,0.063099,0.001033,-0.058455,-0.070482,0.01927,0.112087
65dda7785e43b9d7d3e4f610,0.028182,0.013501,-0.035613,0.013773,0.044291,0.059096,1.0,-0.107264,0.067717,-0.043268,...,-0.059802,0.022128,-0.000197,-0.043164,-0.046304,-0.040061,0.078491,0.045636,0.138844,-0.024929
65dda7795e43b9d7d3e4f661,0.009083,-0.153424,-0.147358,0.030333,-0.020462,-0.133237,-0.107264,1.0,-0.058638,0.026004,...,0.040873,-0.003102,0.062249,0.04483,-0.070889,-0.000465,0.002368,0.03211,-0.02128,-0.047182
65dda77a5e43b9d7d3e4f712,-0.044389,-0.079447,0.034101,-0.024839,0.137772,-0.073474,0.067717,-0.058638,1.0,-0.050202,...,0.066843,0.072857,-0.130737,-0.02867,0.059806,-0.087734,0.066762,-0.085613,0.007188,0.041671
65dda77b5e43b9d7d3e4f7aa,0.01624,-0.02119,-0.1,-0.069453,-0.118315,0.115219,-0.043268,0.026004,-0.050202,1.0,...,-0.086986,-0.063361,-0.109689,0.0873,-0.031057,0.02649,0.141628,-0.085574,-0.122476,0.155481


In [231]:
# NOTE: bellow here are all the code explanation that is done on 'get_recommendations' function
# filters the DataFrame searched_vehicles to get only the rows where the user_id matches the given user_id 8. It extracts all reviews made by the user.
user_id = 8
user_reviews = searched_vehicles[searched_vehicles['user_id'] == user_id]
user_reviews

Unnamed: 0,review_id,vehicle_id,user_id,rating,travel_id,departure_at,from,to
1150,65dda7bea276ad551e8d0b99,65dda77f5e43b9d7d3e4fa4d,8,3,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1304,65dda7bea276ad551e8d49d7,65dda77f5e43b9d7d3e4fa4d,8,3,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1326,65dda7bea276ad551e8d531d,65dda77f5e43b9d7d3e4fa4d,8,2,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
1977,65dda7c5a276ad551e8e658a,65dda77f5e43b9d7d3e4fa4d,8,2,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
2129,65dda7c5a276ad551e8ea126,65dda77f5e43b9d7d3e4fa4d,8,3,65dda7865e43b9d7d3e4ff0d,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
...,...,...,...,...,...,...,...,...
119070,65dda7bea276ad551e8d71d4,65dda7835e43b9d7d3e4fd25,8,3,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119127,65dda7c0a276ad551e8d899d,65dda7835e43b9d7d3e4fd25,8,5,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119320,65dda7c0a276ad551e8dd654,65dda7835e43b9d7d3e4fd25,8,2,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"
119392,65dda7c3a276ad551e8dedfc,65dda7835e43b9d7d3e4fd25,8,4,65dda7865e43b9d7d3e4ff17,2024-03-03,"Pokhara, Kaski","Pathri, Morong"


In [232]:
# This line creates a Pandas Series called user_ratings. 
# It extracts the rating values from the user_reviews DataFrame and sets the vehicle_id values as the index of the Series. 
# This Series represents the ratings given by the user to different vehicles.
user_ratings = pd.Series(user_reviews.rating.values, index=user_reviews.vehicle_id.values)
# here user could have rated the same vehicle multiple times so, we will going to take an average of that
user_ratings = user_reviews.groupby('vehicle_id')['rating'].mean()
user_ratings.shape

(21,)

In [233]:
# geting round values to convert it into integer
# user_ratings = user_ratings.round().astype(int)
user_ratings

vehicle_id
65dda7765e43b9d7d3e4f4bf    3.600000
65dda7775e43b9d7d3e4f4d3    3.166667
65dda7775e43b9d7d3e4f54d    3.083333
65dda7775e43b9d7d3e4f570    3.000000
65dda7785e43b9d7d3e4f5cd    3.444444
65dda7785e43b9d7d3e4f5e1    4.166667
65dda7785e43b9d7d3e4f610    3.444444
65dda7795e43b9d7d3e4f661    3.285714
65dda77a5e43b9d7d3e4f712    3.500000
65dda77b5e43b9d7d3e4f7aa    3.333333
65dda77c5e43b9d7d3e4f839    3.888889
65dda77d5e43b9d7d3e4f952    4.200000
65dda77e5e43b9d7d3e4f9e4    4.250000
65dda77f5e43b9d7d3e4fa4d    2.600000
65dda77f5e43b9d7d3e4fade    2.500000
65dda7805e43b9d7d3e4fb2a    3.500000
65dda7835e43b9d7d3e4fd25    3.400000
65dda7845e43b9d7d3e4fd9d    3.600000
65dda7845e43b9d7d3e4fdba    3.666667
65dda7855e43b9d7d3e4fe52    3.500000
65dda7865e43b9d7d3e4ff06    3.600000
Name: rating, dtype: float64

In [234]:
## Let's make recommendations
# This function taks vehicle_id & the rating that the auth user ex '8' have given on the vehicle_id on the past
def get_similar_vehicles(vehicle_id,user_rating):
    # let's get row of datafrom from 'vehicles_similarity_matrix_df' using the index 'vehicle_id' 
    # which means get row from the vehicle which user have already rated
    similar_score = vehicles_similarity_matrix_df[vehicle_id]
    # Now scale it by the rating that the user have given for that particular vehicle
#     similar_score = similar_score*user_rating
#     now arranging it in decending order
    similar_score = similar_score*(user_rating-2.5) # explanation is bellow next cell from this cell
    similar_score = similar_score.sort_values(ascending=False)
    # This method will return a similarity score for all the vehicles that is similar to the given vehicle 'vehicle_id'
    return similar_score



In [235]:
# Testing for the first user rated vehicle
first_vehicle_id = user_ratings.index[0]
first_vehicle_rating = user_ratings.iloc[0]


get_similar_vehicles(first_vehicle_id, first_vehicle_rating)
# Let's say that user give rating 1 to vehicle 'first_vehicle_id' 
# Explanation: https://youtu.be/3ecNC-So0r4?t=1225
# If the user have rated the vehicles bad then we want all other similar vehicles to go down in the list
# and other vehicle that is not similar to this vehicle to come at the top of the list
# we can achieve this in above function like this:
# similar_score = similar_score*(user_rating-2.5)
get_similar_vehicles(first_vehicle_id, 1)

vehicle_id
65dda7785e43b9d7d3e4f5cd    0.208774
65dda7775e43b9d7d3e4f54d    0.137600
65dda7835e43b9d7d3e4fd25    0.112763
65dda77e5e43b9d7d3e4f9e4    0.090441
65dda77a5e43b9d7d3e4f712    0.066584
65dda77f5e43b9d7d3e4fa4d    0.059928
65dda77c5e43b9d7d3e4f839    0.052354
65dda7775e43b9d7d3e4f4d3    0.035972
65dda7845e43b9d7d3e4fd9d    0.029972
65dda7775e43b9d7d3e4f570    0.022234
65dda77f5e43b9d7d3e4fade   -0.004443
65dda7795e43b9d7d3e4f661   -0.013625
65dda77b5e43b9d7d3e4f7aa   -0.024360
65dda7785e43b9d7d3e4f610   -0.042274
65dda7865e43b9d7d3e4ff06   -0.043840
65dda7845e43b9d7d3e4fdba   -0.067110
65dda7785e43b9d7d3e4f5e1   -0.090357
65dda7805e43b9d7d3e4fb2a   -0.114265
65dda7855e43b9d7d3e4fe52   -0.116468
65dda77d5e43b9d7d3e4f952   -0.240662
65dda7765e43b9d7d3e4f4bf   -1.500000
Name: 65dda7765e43b9d7d3e4f4bf, dtype: float64

In [236]:
# converting 'user_ratings' into list of touples
user_ratings_list = list(user_ratings.items())
user_ratings_list

[('65dda7765e43b9d7d3e4f4bf', 3.6),
 ('65dda7775e43b9d7d3e4f4d3', 3.1666666666666665),
 ('65dda7775e43b9d7d3e4f54d', 3.0833333333333335),
 ('65dda7775e43b9d7d3e4f570', 3.0),
 ('65dda7785e43b9d7d3e4f5cd', 3.4444444444444446),
 ('65dda7785e43b9d7d3e4f5e1', 4.166666666666667),
 ('65dda7785e43b9d7d3e4f610', 3.4444444444444446),
 ('65dda7795e43b9d7d3e4f661', 3.2857142857142856),
 ('65dda77a5e43b9d7d3e4f712', 3.5),
 ('65dda77b5e43b9d7d3e4f7aa', 3.3333333333333335),
 ('65dda77c5e43b9d7d3e4f839', 3.888888888888889),
 ('65dda77d5e43b9d7d3e4f952', 4.2),
 ('65dda77e5e43b9d7d3e4f9e4', 4.25),
 ('65dda77f5e43b9d7d3e4fa4d', 2.6),
 ('65dda77f5e43b9d7d3e4fade', 2.5),
 ('65dda7805e43b9d7d3e4fb2a', 3.5),
 ('65dda7835e43b9d7d3e4fd25', 3.4),
 ('65dda7845e43b9d7d3e4fd9d', 3.6),
 ('65dda7845e43b9d7d3e4fdba', 3.6666666666666665),
 ('65dda7855e43b9d7d3e4fe52', 3.5),
 ('65dda7865e43b9d7d3e4ff06', 3.6)]

In [237]:
# Now we now all the vehicle that auth user have rated is stored on 'user_ratings'

similar_vehicles_df = pd.DataFrame()

# Now we will going to get similarities from every vehicle that user have rated to every other vehicles
for vehicle_id, rating in user_ratings_list:
    similar_vehicles_df[vehicle_id] = get_similar_vehicles(vehicle_id, rating)
    
# Set the index to be the default integer index
similar_vehicles_df.reset_index(drop=True, inplace=True)
# Each row in this dataframe for each 'vehicle_id' that the user has rated
# And the value in each column for vehicle id is the similarity score for each row index 'vehicle_id'
# It means that every value in this dataframe are the similarity value for each vehicle that user have rated on
similar_vehicles_df

Unnamed: 0,65dda7765e43b9d7d3e4f4bf,65dda7775e43b9d7d3e4f4d3,65dda7775e43b9d7d3e4f54d,65dda7775e43b9d7d3e4f570,65dda7785e43b9d7d3e4f5cd,65dda7785e43b9d7d3e4f5e1,65dda7785e43b9d7d3e4f610,65dda7795e43b9d7d3e4f661,65dda77a5e43b9d7d3e4f712,65dda77b5e43b9d7d3e4f7aa,...,65dda77d5e43b9d7d3e4f952,65dda77e5e43b9d7d3e4f9e4,65dda77f5e43b9d7d3e4fa4d,65dda77f5e43b9d7d3e4fade,65dda7805e43b9d7d3e4fb2a,65dda7835e43b9d7d3e4fd25,65dda7845e43b9d7d3e4fd9d,65dda7845e43b9d7d3e4fdba,65dda7855e43b9d7d3e4fe52,65dda7865e43b9d7d3e4ff06
0,1.1,-0.015988,-0.053511,-0.007411,-0.131451,0.100397,0.026617,0.007137,-0.044389,0.013533,...,0.27275,-0.105514,-0.003995,0.0,0.076177,-0.067658,-0.021979,0.052197,0.077645,0.03215
1,0.176485,-0.017561,-0.082557,0.055285,0.043384,-0.279927,-0.05648,0.032115,0.066843,-0.072488,...,1.7,0.212408,0.011623,-0.0,-0.026021,-0.034385,0.047114,-0.186062,-0.110104,-0.065414
2,0.08541,0.019714,-0.028251,-0.022811,-0.02279,0.032117,0.13113,-0.01672,0.007188,-0.102063,...,-0.187177,-0.169875,8e-05,-0.0,0.056146,0.03338,-0.036892,0.070941,1.0,-0.054904
3,0.083795,-0.018981,0.062187,0.038655,-0.091503,0.105164,-0.043732,-0.055698,0.059806,-0.025881,...,-0.044236,0.129209,-0.001418,-0.0,1.0,-0.005907,0.021008,0.058528,0.056146,-0.141067
4,0.066262,0.106863,0.050114,0.048905,-0.10445,1.666667,0.055813,-0.104686,-0.073474,0.096016,...,-0.285525,-0.026853,0.002996,-0.0,0.063099,0.000929,-0.0643,-0.082228,0.01927,0.123296
5,0.049214,-0.017792,0.012573,0.010501,-0.035031,-0.117469,0.043101,0.02523,-0.085613,-0.071312,...,-0.271118,0.227789,0.003733,-0.0,0.050167,-0.029101,0.056294,1.166667,0.060806,0.065984
6,0.03215,0.042821,-0.012118,-0.043882,0.083194,0.186811,-0.023544,-0.037072,0.041671,0.129567,...,-0.101094,0.114232,-0.012416,-0.0,-0.128243,0.079035,-0.121707,0.069983,-0.049913,1.1
7,0.031001,0.009001,-0.020774,0.006886,0.04183,0.098493,0.944444,-0.084279,0.067717,-0.036057,...,-0.101664,0.038725,-2e-05,-0.0,-0.046304,-0.036055,0.08634,0.053242,0.138844,-0.027422
8,0.017864,-0.014127,-0.058333,-0.034726,-0.111742,0.192031,-0.040864,0.020431,-0.050202,0.833333,...,-0.147876,-0.110882,-0.010969,0.0,-0.031057,0.023841,0.155791,-0.099836,-0.122476,0.171029
9,0.009991,-0.102282,-0.085959,0.015167,-0.019325,-0.222062,-0.101305,0.785714,-0.058638,0.02167,...,0.069485,-0.005428,0.006225,0.0,-0.070889,-0.000418,0.002605,0.037462,-0.02128,-0.051901


In [238]:
# Now let's sum all the vehicle id values row wise and sort them in decending order
similar_vehicles = similar_vehicles_df.sum().sort_values(ascending=False)
similar_vehicles

65dda77e5e43b9d7d3e4f9e4    2.005644
65dda7785e43b9d7d3e4f5e1    1.868311
65dda77d5e43b9d7d3e4f952    1.360929
65dda7845e43b9d7d3e4fdba    1.320620
65dda77c5e43b9d7d3e4f839    1.309666
65dda7865e43b9d7d3e4ff06    1.148248
65dda7845e43b9d7d3e4fd9d    1.109665
65dda7805e43b9d7d3e4fb2a    1.071969
65dda7765e43b9d7d3e4f4bf    1.056574
65dda7835e43b9d7d3e4fd25    1.014772
65dda77a5e43b9d7d3e4f712    1.003206
65dda7785e43b9d7d3e4f610    0.958578
65dda7855e43b9d7d3e4fe52    0.899528
65dda7785e43b9d7d3e4f5cd    0.671710
65dda77b5e43b9d7d3e4f7aa    0.631290
65dda7775e43b9d7d3e4f4d3    0.597891
65dda7775e43b9d7d3e4f570    0.527052
65dda7775e43b9d7d3e4f54d    0.368237
65dda7795e43b9d7d3e4f661    0.322840
65dda77f5e43b9d7d3e4fa4d    0.096232
65dda77f5e43b9d7d3e4fade    0.000000
dtype: float64

In [242]:
recommended_vehicles = similar_vehicles.index.tolist()

In [243]:
recommended_vehicles

['65dda77e5e43b9d7d3e4f9e4',
 '65dda7785e43b9d7d3e4f5e1',
 '65dda77d5e43b9d7d3e4f952',
 '65dda7845e43b9d7d3e4fdba',
 '65dda77c5e43b9d7d3e4f839',
 '65dda7865e43b9d7d3e4ff06',
 '65dda7845e43b9d7d3e4fd9d',
 '65dda7805e43b9d7d3e4fb2a',
 '65dda7765e43b9d7d3e4f4bf',
 '65dda7835e43b9d7d3e4fd25',
 '65dda77a5e43b9d7d3e4f712',
 '65dda7785e43b9d7d3e4f610',
 '65dda7855e43b9d7d3e4fe52',
 '65dda7785e43b9d7d3e4f5cd',
 '65dda77b5e43b9d7d3e4f7aa',
 '65dda7775e43b9d7d3e4f4d3',
 '65dda7775e43b9d7d3e4f570',
 '65dda7775e43b9d7d3e4f54d',
 '65dda7795e43b9d7d3e4f661',
 '65dda77f5e43b9d7d3e4fa4d',
 '65dda77f5e43b9d7d3e4fade']

In [None]:
# NOTE: Issues that we could face on Collaborative Filtering
# 1. Handling Unknown Users / Vehicles (Cold Start Problem) where we don't know any context about it for rignt now
# 2. Scalalability: Takes havy performance to perform this model
# 3. Dynamic Update: When ever we will add new vehicle or User or Rating Data Then we again have to re build this model
# For that we can create a Hybrid recommendation system to handle this challanges
# Also we can use Papularity based filtering