In [68]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that may flag
# real problems in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Loading Datasets

In [70]:
# Directory that contains ratings.csv and movies.csv. Set the MOVIE_DATA_DIR
# environment variable to run the notebook against another copy of the data;
# when it is unset, the original local path is used.
DATA_DIR = os.environ.get(
    'MOVIE_DATA_DIR',
    r'D:\IIIT-H_Temp\Learning\IIITH\Data Analytics\Assignment 3\ml-latest-small\ml-latest-small',
)
ratings_path = os.path.join(DATA_DIR, 'ratings.csv')
movies_path = os.path.join(DATA_DIR, 'movies.csv')

# Load the datasets into pandas DataFrames
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Preview the first few rows of each DataFrame
ratings.head(), movies.head()

(   userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  964983815
 4       1       50     5.0  964982931,
    movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  )

### Checking for Missing Values

In [76]:
# Per-column count of missing entries in the ratings dataset
print(
    "Missing values in ratings dataset:\n",
    ratings.isna().sum(),
)

# Per-column count of missing entries in the movies dataset
print(
    "Missing values in movies dataset:\n",
    movies.isna().sum(),
)

Missing values in ratings dataset:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Missing values in movies dataset:
 movieId    0
title      0
genres     0
dtype: int64


### Checking for Duplicate Values

In [75]:
# Count rows of the ratings dataset that exactly repeat an earlier row
duplicate_ratings = ratings.duplicated().sum()
print(f"Number of duplicate rows in ratings dataset: {duplicate_ratings}")

# Drop the repeated rows only when at least one was found
ratings = ratings.drop_duplicates() if duplicate_ratings > 0 else ratings

# Count rows of the movies dataset that exactly repeat an earlier row
duplicate_movies = movies.duplicated().sum()
print(f"Number of duplicate rows in movies dataset: {duplicate_movies}")

# Drop the repeated rows only when at least one was found
movies = movies.drop_duplicates() if duplicate_movies > 0 else movies

Number of duplicate rows in ratings dataset: 0
Number of duplicate rows in movies dataset: 0


### Filtering Users Who Rated More Than 10 Movies

In [77]:
# Number of rating rows recorded for each user
user_ratings_count = ratings.groupby('userId').size()

# Ids of the users with strictly more than 10 ratings
valid_users = user_ratings_count.loc[user_ratings_count.gt(10)].index

# Keep only the rating rows that belong to those users
filtered_ratings = ratings.loc[ratings['userId'].isin(valid_users)]

# Preview the filtered dataset
filtered_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Filtering Movies Rated Above 2

In [78]:
# Keep only the rating rows whose rating is strictly greater than 2
filtered_ratings = filtered_ratings.loc[filtered_ratings['rating'].gt(2)]

# Preview the remaining ratings
filtered_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Creating the Transactional Dataset

In [79]:
# One row per user: collect that user's movieIds into a list, then turn the
# userId index back into a regular column
transactional_data = (
    filtered_ratings
    .groupby('userId')['movieId']
    .apply(list)
    .reset_index()
)

# Preview the transactional data
transactional_data.head()

Unnamed: 0,userId,movieId
0,1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,..."
1,2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851..."
2,3,"[849, 1275, 1371, 1587, 2288, 2851, 3024, 3703..."
3,4,"[21, 45, 52, 58, 106, 125, 162, 171, 176, 215,..."
4,5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232..."


### Splitting into Training and Test Sets

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep 80% for training;
# the fixed random_state makes the split repeatable
train_data, test_data = train_test_split(
    transactional_data,
    test_size=0.2,
    random_state=42,
)

def create_test_set(movies, test_fraction=0.2):
    """Return the leading share of one user's movie list.

    Note that the items are taken from the front of the sequence, not sampled
    at random, and that the result is the selected share itself (not the
    remainder after removing it).

    Args:
        movies: Sequence of movie ids for a single user; anything that
            supports ``len`` and slicing. Inside this function the name
            shadows any notebook-level variable also called ``movies``.
        test_fraction: Share of the sequence to return, between 0 and 1.
            Defaults to 0.2, i.e. the first 20% of the items.

    Returns:
        The first ``int(len(movies) * test_fraction)`` items of ``movies``.
        The result is empty when the sequence is too short for the share to
        reach one whole item.

    Raises:
        ValueError: If ``test_fraction`` lies outside the range [0, 1].
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(f"test_fraction must be between 0 and 1, got {test_fraction}")
    # int() truncates toward zero, so a partial item is never counted.
    test_size = int(len(movies) * test_fraction)
    return movies[:test_size]

# Replace each test row's movie list with the subset returned by
# create_test_set. assign() builds a new DataFrame rather than writing a
# column into the frame returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning.
test_data = test_data.assign(movieId=test_data['movieId'].apply(create_test_set))

# Display the training and test data
train_data.head(), test_data.head()

(     userId                                            movieId
 23       24  [6, 32, 50, 165, 253, 296, 316, 318, 356, 457,...
 582     583  [39, 104, 216, 239, 296, 317, 356, 588, 593, 5...
 287     288  [1, 3, 10, 17, 21, 32, 34, 43, 47, 73, 87, 110...
 322     323  [1, 2, 17, 19, 22, 29, 32, 34, 36, 48, 50, 60,...
 132     133  [32, 47, 50, 110, 150, 223, 296, 300, 318, 356...,
      userId                                            movieId
 81       82  [1, 2, 6, 10, 47, 104, 110, 145, 153, 163, 165...
 218     219  [1, 2, 6, 10, 19, 21, 32, 47, 50, 104, 112, 15...
 55       56            [10, 11, 19, 39, 47, 69, 110, 153, 160]
 598     599  [1, 2, 6, 7, 10, 11, 16, 17, 18, 19, 21, 24, 2...
 264     265  [32, 36, 110, 111, 150, 230, 253, 260, 266, 28...)

### Committing to Git!

In [96]:
# Stage the notebook file for the next commit
!git add "Assignment 3_64_Mid-Submission.ipynb"

# Record the staged changes with a descriptive message (--verbose requests more detailed output)
!git commit -m "Completed data preprocessing for movie recommendation system" --verbose

# Push to the `main` branch of `origin`; -u sets it as the upstream tracking branch
!git push -u origin main --verbose



[main 672a1fc] Completed data preprocessing for movie recommendation system
 1 file changed, 529 insertions(+), 27 deletions(-)
branch 'main' set up to track 'origin/main'.


POST git-receive-pack (4906 bytes)
Pushing to https://github.com/shuklaganesh/Movie-Recommendation-System.git
To https://github.com/shuklaganesh/Movie-Recommendation-System.git
   669a3f0..672a1fc  main -> main
updating local tracking ref 'refs/remotes/origin/main'
