**NOTE:** Change the file paths and MongoDB connection strings to match your own setup wherever needed.

**Mount Drive**

In [None]:
# Mount Google Drive at /content/drive; the matrices and mappings produced
# later in this notebook are written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Necessary Imports**

In [None]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [None]:
import os

from pymongo import MongoClient  #For connecting to mongodb atlas
from pyspark.sql import SparkSession  #For creating the spark session
import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

**Connecting to MongoDB Atlas to access the database**

In [None]:
# MongoDB Atlas connection string.
# Preferred: export it as the MONGODB_URI environment variable so that real
# credentials are never typed into (and saved with) the notebook.
# Fallback: the literal below, which only shows the general format and must be
# edited before it can connect.
connection_string = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://<username>:<password>@<cluster-name>.mongodb.net/<database>?retryWrites=true&w=majority",
)

In [None]:
# Create the pymongo client for the cluster named in connection_string.
client = MongoClient(connection_string)

In [None]:
# Select the "movie_recommendation" database and list its collections as a
# quick check that the connection works.
db = client["movie_recommendation"]
print(db.list_collection_names())

['occupations', 'item', 'info', 'genre', 'ratings', 'users']


**Setting Up PySpark**

In [None]:
# Install PySpark
!pip install pyspark



In [None]:
!wget https://repo1.maven.org/maven2/org/mongodb/mongo-java-driver/3.12.10/mongo-java-driver-3.12.10.jar

--2024-11-14 15:49:38--  https://repo1.maven.org/maven2/org/mongodb/mongo-java-driver/3.12.10/mongo-java-driver-3.12.10.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2314569 (2.2M) [application/java-archive]
Saving to: ‘mongo-java-driver-3.12.10.jar’


2024-11-14 15:49:39 (28.8 MB/s) - ‘mongo-java-driver-3.12.10.jar’ saved [2314569/2314569]



In [None]:
# Connect pyspark with mongodb
!wget https://repo1.maven.org/maven2/org/mongodb/spark/mongo-spark-connector_2.12/3.0.1/mongo-spark-connector_2.12-3.0.1.jar

--2024-11-14 15:49:40--  https://repo1.maven.org/maven2/org/mongodb/spark/mongo-spark-connector_2.12/3.0.1/mongo-spark-connector_2.12-3.0.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 552150 (539K) [application/java-archive]
Saving to: ‘mongo-spark-connector_2.12-3.0.1.jar’


2024-11-14 15:49:40 (11.2 MB/s) - ‘mongo-spark-connector_2.12-3.0.1.jar’ saved [552150/552150]



In [None]:
# Confirm the connector jar exists at the path the SparkSession cell expects.
!ls /content/mongo-spark-connector_2.12-3.0.1.jar

/content/mongo-spark-connector_2.12-3.0.1.jar


**Creating a Spark Session**

In [None]:
# Local paths of the two jars downloaded above.
# CHANGE THE PATHS ACCORDINGLY
mongo_spark_connector = "/content/mongo-spark-connector_2.12-3.0.1.jar"
mongo_java_driver = "/content/mongo-java-driver-3.12.10.jar"

# Default MongoDB URI for the session (fill in your own credentials, cluster
# and database name).
mongo_uri = "mongodb+srv://<username>:<password>@<cluster-name>.mongodb.net/<database-name>"

# Initialize SparkSession with MongoDB.
# The 3.x connector loaded here reads its defaults from
# `spark.mongodb.input.uri` / `spark.mongodb.output.uri`. The
# `spark.mongodb.read.connection.uri` / `...write.connection.uri` keys belong
# to the 10.x connector and are ignored by this jar.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .config("spark.mongodb.input.uri", mongo_uri)
    .config("spark.mongodb.output.uri", mongo_uri)
    .config("spark.jars", f"{mongo_spark_connector},{mongo_java_driver}")
    .getOrCreate()
)

In [None]:
# Verify Spark setup: print the session object and the Spark version in use.
print(spark)
print(spark.version)

<pyspark.sql.session.SparkSession object at 0x7f564bfb9660>
3.5.3


**Creating Spark Dataframe**

In [None]:
# Load the `ratings` and `item` collections into Spark DataFrames. Each read
# names its own collection through the `uri` option (database.collection).
ratings = (
    spark.read.format("mongo")
    .option("uri", "mongodb+srv://<username>:<password>@<cluster-name>.mongodb.net/<database-name>.ratings")
    .load()
)

items = (
    spark.read.format("mongo")
    .option("uri", "mongodb+srv://<username>:<password>@<cluster-name>.mongodb.net/<database-name>.item")
    .load()
)

In [None]:
# Preview the first rows of both freshly loaded DataFrames.
ratings.show()
items.show()

+--------------------+-------+------+---------+-------+
|                 _id|item_id|rating|timestamp|user_id|
+--------------------+-------+------+---------+-------+
|{670d70421c928007...|    242|     3|881250949|    196|
|{670d70421c928007...|    302|     3|891717742|    186|
|{670d70421c928007...|    377|     1|878887116|     22|
|{670d70421c928007...|     51|     2|880606923|    244|
|{670d70421c928007...|    346|     1|886397596|    166|
|{670d70421c928007...|    474|     4|884182806|    298|
|{670d70421c928007...|    265|     2|881171488|    115|
|{670d70421c928007...|    465|     5|891628467|    253|
|{670d70421c928007...|    451|     3|886324817|    305|
|{670d70421c928007...|     86|     3|883603013|      6|
|{670d70421c928007...|    257|     2|879372434|     62|
|{670d70421c928007...|   1014|     5|879781125|    286|
|{670d70421c928007...|    222|     5|876042340|    200|
|{670d70421c928007...|     40|     3|891035994|    210|
|{670d70421c928007...|     29|     3|888104457| 

In [None]:
# Rename item_id to movie_id so the ratings share the key column name that the
# later join with `items` uses.
ratings = ratings.withColumnRenamed("item_id", "movie_id")
ratings.show(3)

+--------------------+--------+------+---------+-------+
|                 _id|movie_id|rating|timestamp|user_id|
+--------------------+--------+------+---------+-------+
|{670d70421c928007...|     242|     3|881250949|    196|
|{670d70421c928007...|     302|     3|891717742|    186|
|{670d70421c928007...|     377|     1|878887116|     22|
+--------------------+--------+------+---------+-------+
only showing top 3 rows



**Dropping Unnecessary Columns**

In [None]:
# Columns that are not needed downstream: only movie_id / rating / user_id are
# kept from the ratings, and only movie_id / title from the items.
rating_cols_to_drop = ["timestamp", "_id"]
item_cols_to_drop = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "IMDb_url",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    "_id", "release_date", "unknown",
]

ratings = ratings.drop(*rating_cols_to_drop)
items = items.drop(*item_cols_to_drop)

# Show the trimmed schemas.
for trimmed in (ratings, items):
    trimmed.show(3)

+--------+------+-------+
|movie_id|rating|user_id|
+--------+------+-------+
|     242|     3|    196|
|     302|     3|    186|
|     377|     1|     22|
+--------+------+-------+
only showing top 3 rows

+--------+-----------------+
|movie_id|            title|
+--------+-----------------+
|       1| Toy Story (1995)|
|       2| GoldenEye (1995)|
|       3|Four Rooms (1995)|
+--------+-----------------+
only showing top 3 rows



**Performing a JOIN Operation**

In [None]:
# Perform the join: attach each rating's movie title via the shared movie_id.
# Guarded on the presence of `title` so the cell is safe to re-run; joining a
# second time would add a duplicate, ambiguous `title` column.
if "title" not in ratings.columns:
    ratings = ratings.join(items, on="movie_id", how="inner")

In [None]:
# Preview the joined ratings, which now carry the movie title.
ratings.show(3)

+--------+------+-------+--------------------+
|movie_id|rating|user_id|               title|
+--------+------+-------+--------------------+
|     496|     4|    250|It's a Wonderful ...|
|     471|     3|    293|Courage Under Fir...|
|     471|     4|    305|Courage Under Fir...|
+--------+------+-------+--------------------+
only showing top 3 rows



In [None]:
# Report the size of the joined DataFrame: number of rows, then number of columns.
print("Ratings")
print(ratings.count())
print(len(ratings.columns))

Ratings
100000
4


In [None]:
# Drop rows that contain a null in any column; printing the count shows how
# many rows remain after the filter.
ratings_df = ratings.dropna()
print(ratings_df.count())

100000


**Get Unique Users & Store Them as a List**

In [None]:
# Get unique User_id values.
# distinct() returns rows in no guaranteed order, and the position of each
# user in this list becomes its row index in the ratings matrix. Sorting makes
# those indices identical on every run.
user_ids = sorted(
    ratings_df.select("user_id").distinct().collect(),
    key=lambda row: row.user_id,
)

# Convert to a list
user_ids_list = [row.user_id for row in user_ids]

unique_user_count = len(user_ids)
print(unique_user_count)

943


In [None]:
# Print the user_ids (Row objects) and user_ids_list (plain values).
# Note: this prints every entry, not just a preview.
print(user_ids)
print(user_ids_list)

[Row(user_id=148), Row(user_id=496), Row(user_id=463), Row(user_id=833), Row(user_id=471), Row(user_id=392), Row(user_id=540), Row(user_id=897), Row(user_id=243), Row(user_id=623), Row(user_id=737), Row(user_id=858), Row(user_id=31), Row(user_id=516), Row(user_id=251), Row(user_id=580), Row(user_id=85), Row(user_id=451), Row(user_id=137), Row(user_id=808), Row(user_id=458), Row(user_id=883), Row(user_id=65), Row(user_id=879), Row(user_id=255), Row(user_id=588), Row(user_id=804), Row(user_id=898), Row(user_id=481), Row(user_id=799), Row(user_id=53), Row(user_id=472), Row(user_id=133), Row(user_id=853), Row(user_id=296), Row(user_id=918), Row(user_id=78), Row(user_id=513), Row(user_id=322), Row(user_id=321), Row(user_id=613), Row(user_id=633), Row(user_id=362), Row(user_id=857), Row(user_id=673), Row(user_id=593), Row(user_id=597), Row(user_id=375), Row(user_id=876), Row(user_id=108), Row(user_id=683), Row(user_id=744), Row(user_id=155), Row(user_id=642), Row(user_id=796), Row(user_id=21

**Get Unique Titles & Store Them as a List**

In [None]:
# Get unique titles values.
# distinct() returns rows in no guaranteed order, and the position of each
# title in this list becomes its column index in the ratings matrix. Sorting
# makes those indices identical on every run.
title_ids = sorted(
    ratings_df.select("title").distinct().collect(),
    key=lambda row: row.title,
)

# Convert to a list
title_ids_list = [row.title for row in title_ids]

unique_title_count = len(title_ids)
print(unique_title_count)

1664


In [None]:
# Print the title Row objects and the plain list of titles.
# Note: this prints every entry, not just a preview.
print(title_ids)
print(title_ids_list)

[Row(title='Cosi (1996)'), Row(title='Psycho (1960)'), Row(title='Three Wishes (1995)'), Row(title='If Lucy Fell (1996)'), Row(title='When We Were Kings (1996)'), Row(title='Annie Hall (1977)'), Row(title='Fair Game (1995)'), Row(title='Heavenly Creatures (1994)'), Row(title='Paris, France (1993)'), Row(title='Snow White and the Seven Dwarfs (1937)'), Row(title='Night of the Living Dead (1968)'), Row(title="I'll Do Anything (1994)"), Row(title='Spanking the Monkey (1994)'), Row(title='Mondo (1996)'), Row(title='Threesome (1994)'), Row(title='Blue Chips (1994)'), Row(title='Colonel Chabert, Le (1994)'), Row(title='Reality Bites (1994)'), Row(title='A Chef in Love (1996)'), Row(title='Last Action Hero (1993)'), Row(title='Nico Icon (1995)'), Row(title='Evil Dead II (1987)'), Row(title='Crows and Sparrows (1949)'), Row(title='Picture Bride (1995)'), Row(title='Rebecca (1940)'), Row(title='English Patient, The (1996)'), Row(title='Inventing the Abbotts (1997)'), Row(title='Jaws 3-D (1983)'

**Creating a List of Ratings**

In [None]:
# Collect the `rating` column to the driver as a plain Python list.
ratings_list = [rating_row[0] for rating_row in ratings_df.select('rating').collect()]
print(len(ratings_list))

100000


**Create mappings with indexes**

In [None]:
# Map each user ID to its position in user_ids_list (used as the row index of
# the ratings matrix).
user_id_to_index = dict(zip(user_ids_list, range(len(user_ids_list))))

# Map each movie title to its position in title_ids_list (used as the column
# index of the ratings matrix).
title_id_to_index = dict(zip(title_ids_list, range(len(title_ids_list))))

**Creating the Sparse Matrix**

In [None]:
# Build the user x title ratings matrix in CSR format.
#
# csr_matrix((data, (rows, cols))) SUMS entries that share the same (row, col).
# Columns are keyed by title rather than movie_id, so if one user has several
# ratings that map to the same title, the plain construction would store their
# sum (a value outside the rating scale). To avoid that, collapse the ratings
# to one value per cell first, keeping the highest rating for the cell.
cell_ratings = {}
for row in ratings_df.collect():
    cell = (user_id_to_index[row.user_id], title_id_to_index[row.title])
    best_so_far = cell_ratings.get(cell)
    if best_so_far is None or row.rating > best_so_far:
        cell_ratings[cell] = row.rating

# Row indices, column indices, and data (ratings), one entry per matrix cell
rows = [user_row for user_row, _ in cell_ratings]
cols = [title_col for _, title_col in cell_ratings]
data = list(cell_ratings.values())

# Convert to a CSR matrix
sparse_matrix = csr_matrix((data, (rows, cols)), shape=(unique_user_count, unique_title_count))

# Now 'sparse_matrix' is in CSR format
print(sparse_matrix)

  (0, 1)	1
  (0, 54)	5
  (0, 68)	5
  (0, 70)	5
  (0, 77)	4
  (0, 96)	5
  (0, 97)	4
  (0, 126)	4
  (0, 145)	4
  (0, 216)	5
  (0, 236)	2
  (0, 251)	5
  (0, 254)	5
  (0, 288)	5
  (0, 306)	4
  (0, 325)	4
  (0, 360)	5
  (0, 438)	5
  (0, 454)	4
  (0, 488)	5
  (0, 503)	3
  (0, 535)	5
  (0, 567)	5
  (0, 568)	3
  (0, 575)	1
  :	:
  (942, 582)	4
  (942, 608)	5
  (942, 633)	1
  (942, 652)	3
  (942, 699)	4
  (942, 940)	1
  (942, 943)	3
  (942, 957)	2
  (942, 963)	5
  (942, 976)	3
  (942, 997)	3
  (942, 1000)	5
  (942, 1091)	3
  (942, 1193)	4
  (942, 1261)	3
  (942, 1286)	4
  (942, 1303)	2
  (942, 1331)	1
  (942, 1419)	2
  (942, 1426)	4
  (942, 1453)	5
  (942, 1558)	4
  (942, 1563)	4
  (942, 1652)	3
  (942, 1662)	4


In [None]:
# Matrix dimensions: (number of unique users, number of unique titles).
sparse_matrix.shape

(943, 1664)

In [None]:
# Confirm the ratings matrix is a SciPy CSR matrix.
print(type(sparse_matrix))

<class 'scipy.sparse._csr.csr_matrix'>


**Creating User & Item Similarity Matrix**

In [None]:
# User-user similarity: cosine similarity between the rows (users) of the
# ratings matrix, kept sparse.
user_similarity_matrix = cosine_similarity(sparse_matrix, dense_output=False)

# Item-item similarity: the same computation on the transposed matrix, whose
# rows are the items.
item_user_matrix = sparse_matrix.transpose()
item_similarity_matrix = cosine_similarity(item_user_matrix, dense_output=False)

In [None]:
# Now you have two sparse matrices: one for user similarity and one for item similarity
print("User Similarity Matrix:")
print(user_similarity_matrix)

print("Item Similarity Matrix:")
print(item_similarity_matrix)

# Check types to confirm they are sparse matrices
print(type(user_similarity_matrix))  # Should be a sparse matrix
print(type(item_similarity_matrix))  # Should be a sparse matrix as well

User Similarity Matrix:
  (0, 606)	0.013537167363818485
  (0, 399)	0.024468681046207
  (0, 250)	0.02561258300815488
  (0, 166)	0.03021465199054143
  (0, 812)	0.025994540711622965
  (0, 190)	0.016027662685859267
  (0, 422)	0.003961942124003717
  (0, 320)	0.005805276782420941
  (0, 11)	0.027090115072118157
  (0, 926)	0.031208274635283514
  (0, 847)	0.025352309822338645
  (0, 826)	0.023448213826151306
  (0, 776)	0.029681791641080278
  (0, 663)	0.06882402356526499
  (0, 570)	0.02227505061206141
  (0, 201)	0.04243890461109989
  (0, 161)	0.017080842512411352
  (0, 896)	0.013794415702436216
  (0, 785)	0.02004736774793867
  (0, 725)	0.009592353720958518
  (0, 513)	0.0373766889972957
  (0, 933)	0.05279741247395289
  (0, 843)	0.044124827116409186
  (0, 819)	0.05798295088033915
  (0, 793)	0.10918418805480452
  :	:
  (942, 253)	0.10883807195902806
  (942, 252)	0.19726947583019458
  (942, 243)	0.19102176308130314
  (942, 233)	0.1370696537410805
  (942, 231)	0.16403319842315492
  (942, 221)	0.146866

**Saving the Matrices in .npz Format to Google Drive**

In [None]:
# Directory in your Google Drive where the matrices are stored
# (change it to your own location).
matrix_dir = '/content/drive/MyDrive/sem7/BDA/MP-mtx_files'

# save_npz does not create missing folders, so make sure the directory exists.
os.makedirs(matrix_dir, exist_ok=True)

# Define file paths for saving the matrices
user_similarity_save_path = os.path.join(matrix_dir, 'user_similarity_matrix.npz')
item_similarity_save_path = os.path.join(matrix_dir, 'item_similarity_matrix.npz')
sparse_matrix_save_path = os.path.join(matrix_dir, 'sparse_matrix.npz')

# Save the matrices in .npz format
scipy.sparse.save_npz(user_similarity_save_path, user_similarity_matrix)
scipy.sparse.save_npz(item_similarity_save_path, item_similarity_matrix)
scipy.sparse.save_npz(sparse_matrix_save_path, sparse_matrix)

print("Matrices saved successfully in .npz format.")

Matrices saved successfully in .npz format.


**User Based Collaborative Filtering**

In [None]:
# Collaborative filtering
def user_based(user_sim_matrix, ratings_matrix, user_idx, num_similar_users, num_recommendations):
    """Recommend items to one user from the ratings of their most similar users.

    Every item gets the score ``sum(rating_of_neighbour * similarity_to_neighbour)``
    taken over the selected neighbours. Items the target user has already
    rated are never recommended.

    Parameters
    ----------
    user_sim_matrix : SciPy sparse matrix or ndarray, shape (n_users, n_users)
        Pairwise user similarities.
    ratings_matrix : SciPy sparse matrix or ndarray, shape (n_users, n_items)
        Ratings, where 0 means "not rated".
    user_idx : int
        Row index of the target user in both matrices (a matrix index, not a
        raw user ID).
    num_similar_users : int
        How many most-similar users to use as neighbours. The target user is
        never counted as their own neighbour.
    num_recommendations : int
        Maximum number of items to return.

    Returns
    -------
    list of int
        Column indices of the recommended items, best score first (ties are
        broken by the lower index). Only items with a positive score are
        returned, so the list can be shorter than ``num_recommendations``.

    Raises
    ------
    ValueError
        If ``user_sim_matrix`` is not (n_users, n_users) for the number of
        rows in ``ratings_matrix`` (e.g. an item-similarity matrix was passed).
    """
    n_users = ratings_matrix.shape[0]
    if tuple(user_sim_matrix.shape) != (n_users, n_users):
        raise ValueError(
            f"user_sim_matrix has shape {tuple(user_sim_matrix.shape)}, "
            f"expected ({n_users}, {n_users}) to match ratings_matrix"
        )

    def _dense_row(matrix, idx):
        # One row as a fresh 1-D float array, for sparse and dense inputs alike.
        # np.argsort cannot rank a SciPy sparse row directly, and the copy keeps
        # the caller's matrix untouched when the row is modified below.
        row = matrix[idx]
        if hasattr(row, "toarray"):
            row = row.toarray()
        return np.array(row, dtype=float).ravel()

    similarities = _dense_row(user_sim_matrix, user_idx)
    # A user is always maximally similar to themselves; push them to the end so
    # they are not picked as a neighbour.
    similarities[user_idx] = -np.inf
    ranked_users = np.argsort(-similarities, kind="stable")
    neighbours = [
        int(u)
        for u in ranked_users[:max(int(num_similar_users), 0)]
        if np.isfinite(similarities[u])
    ]
    if not neighbours:
        return []

    # scores[j] = sum over neighbours of rating[neighbour, j] * similarity[neighbour]
    weights = similarities[neighbours]
    neighbour_ratings = ratings_matrix[neighbours]
    scores = np.array(neighbour_ratings.T.dot(weights), dtype=float).ravel()

    # Never recommend what the target user has already rated.
    already_rated = _dense_row(ratings_matrix, user_idx) != 0
    scores[already_rated] = 0.0

    ranked_items = np.argsort(-scores, kind="stable")
    return [int(j) for j in ranked_items if scores[j] > 0][:num_recommendations]

In [None]:
# NOTE: user_idx is a row index of sparse_matrix (a position in user_ids_list),
# not a raw user_id; use user_id_to_index[<user_id>] to look up a specific user.
user_idx=10
num_similar_users=5
num_recommendations=10

# User-based filtering needs the USER similarity matrix (users x users);
# passing the item similarity matrix here produces meaningless scores.
user_based_recommendation = user_based(user_similarity_matrix, sparse_matrix, user_idx, num_similar_users, num_recommendations)

# Assuming title_id_to_index is structured as {title: index}
index_to_title = {index: title for title, index in title_id_to_index.items()}

# Print the recommended items
print(f"Recommendations for user {user_idx}:")
for item_index in user_based_recommendation:
    # Use the reversed dictionary to get the title
    item_title = index_to_title.get(item_index, "Unknown Item")  # Default if index not found
    print(f"Movie Title : {item_title}")

Recommendations for user 10:
Movie Title : Cosi (1996)
Movie Title : Psycho (1960)
Movie Title : Three Wishes (1995)
Movie Title : If Lucy Fell (1996)
Movie Title : When We Were Kings (1996)
Movie Title : Annie Hall (1977)
Movie Title : Fair Game (1995)
Movie Title : Heavenly Creatures (1994)
Movie Title : Paris, France (1993)
Movie Title : Snow White and the Seven Dwarfs (1937)


**Item Based Collaborative Filtering**

In [None]:
def item_based(item_sim_matrix, ratings_matrix, user_idx, num_recommendations):
    """Recommend items similar to the ones a user has already rated.

    Every item ``j`` gets the score ``sum_i rating[user, i] * similarity[i, j]``
    taken over the items ``i`` the user has rated. The whole score vector is
    computed with a single matrix-vector product instead of element-by-element
    lookups.

    Parameters
    ----------
    item_sim_matrix : SciPy sparse matrix or ndarray, shape (n_items, n_items)
        Pairwise item similarities.
    ratings_matrix : SciPy sparse matrix or ndarray, shape (n_users, n_items)
        Ratings, where 0 means "not rated".
    user_idx : int
        Row index of the target user in ``ratings_matrix`` (a matrix index,
        not a raw user ID).
    num_recommendations : int
        Maximum number of items to return.

    Returns
    -------
    list of int
        Column indices of the recommended items, best score first (ties are
        broken by the lower index). Items the user already rated are excluded,
        and only items with a positive score are returned, so the list can be
        shorter than ``num_recommendations``.
    """
    # The target user's ratings as a 1-D float vector (works for sparse and dense).
    user_row = ratings_matrix[user_idx]
    if hasattr(user_row, "toarray"):
        user_row = user_row.toarray()
    user_ratings = np.array(user_row, dtype=float).ravel()

    # scores[j] = sum_i user_ratings[i] * item_sim_matrix[i, j]
    scores = np.array(item_sim_matrix.T.dot(user_ratings), dtype=float).ravel()

    # Never recommend what the user has already rated.
    scores[user_ratings != 0] = 0.0

    ranked_items = np.argsort(-scores, kind="stable")
    return [int(j) for j in ranked_items if scores[j] > 0][:num_recommendations]


In [None]:
# Target user (row index of sparse_matrix) and how many items to recommend.
user_idx = 10
num_recommendations = 5

item_based_recommendation = item_based(item_similarity_matrix, sparse_matrix, user_idx, num_recommendations)

# Translate the recommended column indices back to movie titles, falling back
# to a placeholder for any index missing from the mapping.
recommended_titles = [
    index_to_title.get(recommended_idx, "Unknown Item")
    for recommended_idx in item_based_recommendation
]

# Print the recommended items
print(f"Recommendations for user {user_idx}:")
for item_title in recommended_titles:
    print(f"Movie Title : {item_title}")

Recommendations for user 10:
Movie Title : Pulp Fiction (1994)
Movie Title : Silence of the Lambs, The (1991)
Movie Title : Monty Python and the Holy Grail (1974)
Movie Title : Empire Strikes Back, The (1980)
Movie Title : Back to the Future (1985)


**Exporting the Index Mappings with joblib**

In [None]:
import joblib

# Folder in your Google Drive that receives the exported files
save_path = '/content/drive/MyDrive/sem7/BDA/MP-mtx_files/'

# Target files for the two lookup tables
user_index_file = f"{save_path}user_id_to_index.pkl"
title_index_file = f"{save_path}index_to_title.pkl"

# Persist the user-ID -> row-index and column-index -> title mappings
joblib.dump(user_id_to_index, user_index_file)
joblib.dump(index_to_title, title_index_file)

['/content/drive/MyDrive/sem7/BDA/MP-mtx_files/item_based_predict.pkl']