<a href="https://colab.research.google.com/github/sakshisinghh28/Recommendation-system./blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -uq "/content/drive/My Drive/archive.zip" -d "/content/drive/My Drive/Movie"

In [5]:
!ls "/content/drive/My Drive/Movie/ml-100k"


allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [16]:
from pathlib import Path
import pandas as pd

# Update the path to your dataset location
path = Path('/content/drive/MyDrive/Movie/ml-100k')

# List of data files
data_files = ['u.data', 'u.item', 'u.user', 'u.genre', 'u.occupation',
              'u1.base', 'u1.test', 'u2.base', 'u2.test',
              'u3.base', 'u3.test', 'u4.base', 'u4.test',
              'u5.base', 'u5.test', 'ua.base', 'ua.test',
              'ub.base', 'ub.test', 'u.info']

# Dictionary to store DataFrames
dfs = {}

# Load each dataset into respective DataFrames
for file in data_files:
    key = file.split('.')[0].replace('-', '_')

    try:
        if file == 'u.data':
            dfs['u_data'] = pd.read_csv(path / file, delimiter='\t', encoding='latin-1', header=None, names=['userId', 'itemId', 'rating'], usecols=(0,1,2))
        else:
            dfs[key] = pd.read_csv(path / file, delimiter='|' if file != 'u.data' else '\t', encoding='latin-1')

        print(f"Loaded {file} successfully.")

    except pd.errors.ParserError as e:
        print(f"ParserError: Failed to read {file} - {e}")

    except Exception as e:
        print(f"Error: {e} - {file} not loaded.")

# Display the first few rows of the 'u_data' DataFrame if loaded successfully
if 'u_data' in dfs and isinstance(dfs['u_data'], pd.DataFrame):
    print(dfs['u_data'].head())
else:
    print("Warning: 'u_data' not loaded successfully or not a DataFrame.")

# Optionally, print loaded keys for verification
print("Loaded keys:", list(dfs.keys()))


Loaded u.data successfully.
Loaded u.item successfully.
Loaded u.user successfully.
Loaded u.genre successfully.
Loaded u.occupation successfully.
Loaded u1.base successfully.
Loaded u1.test successfully.
Loaded u2.base successfully.
Loaded u2.test successfully.
Loaded u3.base successfully.
Loaded u3.test successfully.
Loaded u4.base successfully.
Loaded u4.test successfully.
Loaded u5.base successfully.
Loaded u5.test successfully.
Loaded ua.base successfully.
Loaded ua.test successfully.
Loaded ub.base successfully.
Loaded ub.test successfully.
Loaded u.info successfully.
   userId  itemId  rating
0     196     242       3
1     186     302       3
2      22     377       1
3     244      51       2
4     166     346       1
Loaded keys: ['u_data', 'u', 'u1', 'u2', 'u3', 'u4', 'u5', 'ua', 'ub']


In [18]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd


# Example of constructing a user-item matrix (pivot table)
u_data = dfs['u_data']

user_item_matrix = u_data.pivot(index='userId', columns='itemId', values='rating').fillna(0)

# Initialize NearestNeighbors model
k = 5  # Number of neighbors to consider
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k)

# Fit the model with the user-item matrix
model_knn.fit(user_item_matrix.values)

# Example of getting nearest neighbors for a user (e.g., userId 1)
user_id = 1
user_ratings = user_item_matrix.loc[user_id].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(user_ratings, n_neighbors=k+1)

# Recommend items for the user based on neighbors
similar_users = user_item_matrix.iloc[indices.flatten()[1:]]
predicted_ratings = similar_users.mean(axis=0)

# Print recommended items with the highest predicted ratings
recommended_items = predicted_ratings.sort_values(ascending=False).index.tolist()
print(f"Recommended items for user {user_id}: {recommended_items}")


Recommended items for user 1: [50, 174, 56, 98, 176, 172, 12, 11, 181, 203, 96, 195, 173, 168, 7, 100, 183, 1, 228, 433, 234, 153, 474, 31, 144, 169, 4, 273, 64, 219, 265, 134, 250, 268, 210, 79, 91, 55, 82, 246, 89, 239, 204, 566, 159, 186, 235, 9, 69, 222, 182, 72, 71, 217, 238, 73, 121, 226, 179, 223, 117, 257, 684, 58, 382, 403, 52, 655, 53, 732, 194, 208, 42, 22, 227, 191, 218, 288, 715, 196, 184, 546, 200, 164, 24, 2, 318, 568, 157, 156, 154, 190, 483, 109, 209, 202, 201, 28, 527, 735, 161, 258, 561, 559, 135, 781, 385, 25, 405, 423, 95, 132, 679, 123, 582, 188, 125, 286, 48, 230, 276, 294, 67, 68, 54, 367, 249, 81, 151, 216, 451, 229, 408, 640, 425, 215, 475, 357, 729, 1073, 562, 171, 636, 597, 628, 531, 214, 399, 180, 402, 198, 1016, 8, 124, 39, 88, 62, 49, 5, 111, 92, 421, 127, 77, 386, 930, 333, 175, 470, 211, 86, 97, 83, 189, 652, 713, 290, 710, 143, 160, 163, 569, 825, 237, 116, 87, 29, 550, 206, 685, 578, 436, 583, 721, 85, 763, 790, 13, 466, 541, 755, 432, 252, 70, 708, 1

In [20]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357237 sha256=26aa4fc77c40f1b49a4c7c5201e1f4b064ebaaf0a97e2444b03fe9f9ead60e11
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

In [22]:
# Check the columns in your DataFrame
print(df.columns)

# Verify the presence of required columns
if 'userId' in df.columns and 'movieId' in df.columns and 'rating' in df.columns:
    # Proceed with loading data into Surprise's Dataset
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
else:
    print("Required columns ('userId', 'movieId', 'rating') not found in DataFrame.")


Index(['userId', 'itemId', 'rating', 'timestamp'], dtype='object')
Required columns ('userId', 'movieId', 'rating') not found in DataFrame.


In [25]:
from surprise import KNNBasic, Dataset, Reader
import numpy as np

# Assuming 'df' is your DataFrame containing userId, itemId, and rating columns

# Load the data from the DataFrame into Surprise's Dataset format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)

# Use the KNNBasic algorithm
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': False  # Item-based collaborative filtering
}
knn_model = KNNBasic(sim_options=sim_options)

# Train the model on the dataset
trainset = data.build_full_trainset()
knn_model.fit(trainset)

# Example: Get top 10 item recommendations for a specific user
user_id = 1  # Replace with the user ID for whom you want recommendations
top_n = 3  # Number of recommendations to get

# Get the list of all item IDs
all_item_ids = df['itemId'].unique()

# Remove items that the user has already rated
items_rated_by_user = df[df['userId'] == user_id]['itemId'].unique()
items_to_predict = np.setdiff1d(all_item_ids, items_rated_by_user)

# Predict ratings for items that the user hasn't rated
testset = [[user_id, item_id, 4.] for item_id in items_to_predict]  # Use a placeholder rating (e.g., 4.0)
predictions = knn_model.test(testset)

# Sort predictions by estimated rating in descending order
predictions.sort(key=lambda x: x.est, reverse=True)

# Extract top N item IDs from the sorted predictions
top_item_ids = [int(pred.iid) for pred in predictions[:top_n]]

# Print the top N recommended item IDs
print(f"Top {top_n} recommended item IDs for user {user_id}: {top_item_ids}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
Top 3 recommended item IDs for user 1: [242, 302, 377]


In [26]:
from surprise.model_selection import train_test_split

# Assuming 'data' is your Surprise Dataset object
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [27]:
from surprise import KNNBasic

# Example configuration for KNNBasic
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': False  # Item-based collaborative filtering
}
knn_model = KNNBasic(sim_options=sim_options)

# Train the model on the training set
knn_model.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7998ac6b9150>

In [28]:
# Get predictions for the test set
predictions = knn_model.test(testset)


In [29]:
from surprise import accuracy

# Compute RMSE
rmse = accuracy.rmse(predictions)

# Compute MAE
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 2.0000
MAE:  2.0000
RMSE: 2.0
MAE: 2.0


In [31]:
from surprise.model_selection import cross_validate

# Perform cross-validation with a lower number of folds
results = cross_validate(knn_model, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# Extract and print average results
print("Average RMSE:", np.mean(results['test_rmse']))
print("Average MAE:", np.mean(results['test_mae']))


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0000  2.0000  1.0000  1.3333  0.4714  
MAE (testset)     1.0000  2.0000  1.0000  1.3333  0.4714  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    
Average RMSE: 1.3333333333333333
Average MAE: 1.3333333333333333
