In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this also hides pandas/sklearn warnings that can point at real
# problems — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Loading Datasets

In [None]:
# Folder holding the MovieLens "ml-latest-small" CSV files.
# The default is the original local path; set the MOVIELENS_DIR environment
# variable to point somewhere else without editing this cell.
data_dir = os.environ.get(
    'MOVIELENS_DIR',
    r'D:\IIIT-H_Temp\Learning\IIITH\Data Analytics\Assignment 3\ml-latest-small\ml-latest-small',
)

# Both files live in the same folder, so derive their paths from data_dir
# instead of repeating the full directory twice.
ratings_path = os.path.join(data_dir, 'ratings.csv')
movies_path = os.path.join(data_dir, 'movies.csv')

# Load the datasets into pandas DataFrames
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Show the first few rows of each DataFrame as the cell output
ratings.head(), movies.head()

### Checking for Missing Values

In [None]:
# Count null entries per column for both frames up front ...
ratings_null_counts = ratings.isnull().sum()
movies_null_counts = movies.isnull().sum()

# ... then report them, ratings first and movies second.
print("Missing values in ratings dataset:\n", ratings_null_counts)
print("Missing values in movies dataset:\n", movies_null_counts)

### Checking for Duplicate Values

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal).
duplicate_ratings = ratings.duplicated().sum()
duplicate_movies = movies.duplicated().sum()

# Report both counts before touching the frames.
print(f"Number of duplicate rows in ratings dataset: {duplicate_ratings}")
print(f"Number of duplicate rows in movies dataset: {duplicate_movies}")

# Only rebuild a frame when it actually contains duplicates; otherwise keep
# the existing object untouched.
ratings = ratings.drop_duplicates() if duplicate_ratings > 0 else ratings
movies = movies.drop_duplicates() if duplicate_movies > 0 else movies

### Filtering Users Who Rated More Than 10 Movies

In [None]:
# Number of rating rows recorded for each userId
user_ratings_count = ratings.groupby('userId').size()

# Users qualify only when they have strictly more than 10 ratings
has_enough_ratings = user_ratings_count > 10
valid_users = user_ratings_count.loc[has_enough_ratings].index

# Keep just the rating rows that belong to a qualifying user
from_valid_user = ratings['userId'].isin(valid_users)
filtered_ratings = ratings.loc[from_valid_user]

# Preview the filtered ratings
filtered_ratings.head()

### Filtering Movies Rated Above 2

In [None]:
# Keep only the rating rows whose score is strictly greater than 2
rated_above_two = filtered_ratings['rating'].gt(2)
filtered_ratings = filtered_ratings.loc[rated_above_two]

# Preview the remaining ratings
filtered_ratings.head()

### Creating the Transactional Dataset

In [None]:
# One row per user: collect every movieId the user still has in
# filtered_ratings into a Python list, then turn the userId index back into a
# regular column so the result is a two-column DataFrame.
transactional_data = (
    filtered_ratings
    .groupby('userId')['movieId']
    .apply(list)
    .reset_index()
)

# Preview the per-user transactions
transactional_data.head()

### Splitting into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# transactional_data has one row per user, so this holds out 20% of the users
# (rows) for testing and keeps the other 80% for training. The fixed
# random_state makes the split repeatable across runs.
train_data, test_data = train_test_split(
    transactional_data,
    test_size=0.2,
    random_state=42,
)

# Per-user hold-out helper: selects the leading share of a user's movie list.
def create_test_set(movies, test_fraction=0.2):
    """Return the leading slice of one user's movie list.

    Args:
        movies: Sequence of movie ids for a single user; it must support
            ``len()`` and slicing (e.g. a list).
        test_fraction: Share of the sequence to return, in the closed range
            [0, 1]. Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        The first ``int(len(movies) * test_fraction)`` items of ``movies``, in
        their existing order. The result is empty when that count truncates
        to zero (e.g. fewer than 5 movies at the default fraction).

    Raises:
        ValueError: If ``test_fraction`` is outside [0, 1].
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(
            f"test_fraction must be between 0 and 1, got {test_fraction!r}"
        )

    # int() truncates, so the slice never exceeds the requested share.
    test_size = int(len(movies) * test_fraction)
    return movies[:test_size]

# Replace each held-out user's movie list with the slice returned by
# create_test_set. `assign` builds a new DataFrame rather than writing a column
# into the object returned by train_test_split, which pandas may flag as
# assignment on a copy of a slice (SettingWithCopyWarning) — a warning the
# notebook-wide warnings filter would otherwise hide.
test_data = test_data.assign(
    movieId=test_data['movieId'].apply(create_test_set)
)

# NOTE(review): train_data keeps each training user's full movie list, while
# test users keep only the leading slice of theirs — confirm this matches the
# intended evaluation protocol.
# Display the training and test data
train_data.head(), test_data.head()

### Committing to Git!

In [None]:
# Create (or reinitialise) a git repository in the notebook's working directory.
!git init

# Register the GitHub repository as the remote named `origin`.
# NOTE(review): git rejects this command when `origin` already exists, so this
# cell is presumably meant to be run only once — confirm before Run All.
!git remote add origin https://github.com/shuklaganesh/Movie-Recommendation-System.git

# Rename the current branch to `main` (-M forces the rename).
!git branch -M main

# Show the working-tree state before staging anything.
!git status

# Stage only this notebook file.
!git add "Assignment 3_64_Mid-Submission.ipynb"

# Commit the staged notebook with a descriptive message.
!git commit -m "Completed data preprocessing for movie recommendation system"

# Push `main` to `origin` and record it as the upstream branch (-u).
!git push -u origin main

In [None]:
# Sanity check: emits a fixed message to show the kernel still executes cells.
print("Notebook is still working.")