In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy ratings table: a 'userId' column plus one column per movie.
# np.nan marks a movie that the user has not rated.
data_dict = {
    'userId': ['user 1', 'user 2', 'user 3', 'user 4', 'user 5'],
    'action1': [4, 5, 1, np.nan, 1],
    'action2': [5, 3, np.nan, 2, np.nan],
    'action3': [3, 3, np.nan, 1, 2],
    'romantic1': [np.nan, 2, 4, 4, 3],
    'romantic2': [2, 2, 5, np.nan, 3],
    'romantic3': [1, np.nan, 4, 3, 4]
}

# Build the frame with the user ids as row labels (one row per user, one
# column per movie), then clear the index name left behind by set_index.
rating_data = pd.DataFrame(data_dict).set_index('userId')
rating_data.index.name = None

In [3]:
rating_data

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,
user 3,1.0,,,4.0,5.0,4.0
user 4,,2.0,1.0,4.0,,3.0
user 5,1.0,,2.0,3.0,3.0,4.0


#### Data Preprocessing

In [4]:
# Standardize the ratings (centre them around zero). As a first step, replace
# every missing rating (NaN) with 0 so that no NaN reaches the later cells.
# NOTE(review): this fill runs before standardize() computes its mean, so the
# unrated entries are averaged in as if they were ratings of 0 - confirm that
# this is intended rather than filling with 0 after the centring.
rating_data = rating_data.fillna(0) # original note: zero is expected to be the mean anyway

def standardize(row):
    """Centre a Series of ratings on zero and scale it by its maximum.

    Computes ``(row - row.mean()) / row.max()``. The notebook passes this
    function to ``DataFrame.apply`` with the default axis, so despite the
    parameter name each call receives one *column* of the ratings table.

    Parameters
    ----------
    row : pandas.Series
        Numeric ratings to standardize.

    Returns
    -------
    pandas.Series
        The centred and scaled values, with the same index as ``row``.
        When the maximum of ``row`` is 0 the centred values are returned
        unscaled instead of dividing by zero.
    """
    centered = row - row.mean()
    scale = row.max()

    # A maximum of 0 (e.g. a column that is all zeros after the missing
    # ratings were filled with 0) would turn the division into NaN/inf,
    # which the cosine-similarity step cannot handle.
    if scale == 0:
        return centered

    return centered / scale

In [5]:
# DataFrame.apply uses axis=0 by default, so standardize() is called once per
# column: each movie's ratings (across all users) are standardized together.
rating_data = rating_data.apply(standardize)

In [6]:
rating_data

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


In [7]:
# Flip the table so that movies become the rows and users the columns; the
# similarity between movies is then computed from their user ratings.
# Note: the result is assigned back to the same name, so running this cell a
# second time flips the table back to users-as-rows.
rating_data = rating_data.transpose()
rating_data

Unnamed: 0,user 1,user 2,user 3,user 4,user 5
action1,0.36,0.56,-0.24,-0.44,-0.24
action2,0.6,0.2,-0.4,0.0,-0.4
action3,0.4,0.4,-0.6,-0.266667,0.066667
romantic1,-0.65,-0.15,0.35,0.35,0.1
romantic2,-0.08,-0.08,0.52,-0.48,0.12
romantic3,-0.35,-0.6,0.4,0.15,0.4


In [8]:
# Pairwise cosine similarity between the rows of rating_data (one row per
# movie after the transpose), giving a square movie-by-movie matrix.
item_similarity = cosine_similarity(rating_data) # item-item similarity matrix based on the users' ratings
print(item_similarity)

[[ 1.          0.70668875  0.81368151 -0.79941088 -0.02539184 -0.91410609]
 [ 0.70668875  1.          0.72310153 -0.84515425 -0.5189993  -0.84337386]
 [ 0.81368151  0.72310153  1.         -0.84794611 -0.3799803  -0.80218063]
 [-0.79941088 -0.84515425 -0.84794611  1.          0.14803913  0.72374686]
 [-0.02539184 -0.5189993  -0.3799803   0.14803913  1.          0.39393939]
 [-0.91410609 -0.84337386 -0.80218063  0.72374686  0.39393939  1.        ]]


In [9]:
# Wrap the raw similarity array in a DataFrame labelled by movie on both
# axes, so that a movie's similarities can be looked up by name.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=rating_data.index,
    columns=rating_data.index,
)
item_similarity_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [10]:
def get_similar_movie(movie_name, user_rating, neutral_rating=2.5):
    """Score every movie by its similarity to ``movie_name``, weighted by a rating.

    The similarity column of ``movie_name`` is read from the module-level
    ``item_similarity_df`` and multiplied by ``user_rating - neutral_rating``.
    A rating above the neutral point therefore ranks similar movies first,
    while a rating below it flips the sign and ranks dissimilar movies first.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in ``item_similarity_df``.
    user_rating : float
        The rating the user gave to ``movie_name``.
    neutral_rating : float, optional
        Rating treated as "neither liked nor disliked". Defaults to 2.5,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.Series
        One score per movie (including ``movie_name`` itself), sorted from
        the highest to the lowest score.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``item_similarity_df``.
    """
    # Signed strength of the opinion: positive = liked, negative = disliked.
    preference = user_rating - neutral_rating
    scores = item_similarity_df[movie_name] * preference

    return scores.sort_values(ascending=False)

# Try get_similar_movie() with a user rating on a single movie.
# print(get_similar_movie('action1', 4)) # a positive rating
print(get_similar_movie('action1', 1)) # a negative rating (below the 2.5 subtracted inside the function)

romantic3    1.371159
romantic1    1.199116
romantic2    0.038088
action2     -1.060033
action3     -1.220522
action1     -1.500000
Name: action1, dtype: float64


In [11]:
# Try get_similar_movie() with a user who has rated several movies.
# Each entry is a (movie, rating) pair.
action_lover = [("action1", 5), ("romantic2", 1), ("romantic3", 1)]

# One Series of scores per rated movie, in the order the movies were rated.
similar_scores = [get_similar_movie(movie, rating) for movie, rating in action_lover]

# Row labels 0..n-1, one per rated movie.
index = list(range(len(action_lover)))

# Stack the Series into a frame: each row holds the scores of all movies
# derived from one of the rated movies.
similar_scores_df = pd.DataFrame(similar_scores, index=index)
similar_scores_df

Unnamed: 0,action1,action3,action2,romantic2,romantic1,romantic3
0,2.5,2.034204,1.766722,-0.06348,-1.998527,-2.285265
1,0.038088,0.56997,0.778499,-1.5,-0.222059,-0.590909
2,1.371159,1.203271,1.265061,-0.590909,-1.08562,-1.5


In [12]:
# Sum the scores column-wise (one total per movie, over all the movies the
# user rated) and sort from highest to lowest to get the overall recommendations.
similar_scores_df.sum().sort_values(ascending = False)

action1      3.909247
action2      3.810282
action3      3.807445
romantic2   -2.154389
romantic1   -3.306206
romantic3   -4.376174
dtype: float64