In [2]:
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('ratings.csv')

# Preprocess the data
# Remove duplicates based on 'userId' and 'rating'
data = data.drop_duplicates(subset=['userId', 'rating'])

# Convert 'rating' to integer after filling NaNs with 0 if necessary
data['rating'] = data['rating'].fillna(0).astype(int)

# Convert 'timestamp' from UNIX format to datetime
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')

# Create the user-item matrix without filling NaNs after pivoting
user_item_matrix = data.pivot_table(index='userId', columns='movieId', values='rating')

# Fill all NaN values with random integers between 1 and 5 for demonstration
user_item_matrix = user_item_matrix.applymap(lambda x: np.random.randint(1, 6) if pd.isna(x) else x)

# Select a random subset (8x8) of the matrix
random_user_ids = np.random.choice(user_item_matrix.index, 8, replace=False)
random_movie_ids = np.random.choice(user_item_matrix.columns, 8, replace=False)
user_item_matrix_subset = user_item_matrix.loc[random_user_ids, random_movie_ids]

# Randomly introduce 3-4 NaN values into the subset matrix
num_nan_values = np.random.choice([3, 4])  # Randomly choose between 3-4 NaN values
nan_indices = [(np.random.choice(user_item_matrix_subset.index), np.random.choice(user_item_matrix_subset.columns)) for _ in range(num_nan_values)]
for row, col in nan_indices:
    user_item_matrix_subset.at[row, col] = np.nan

# Display the subset with the title
print("8x8 User-Item Matrix with 3-4 Random Null Values:")
print(user_item_matrix_subset)

  user_item_matrix = user_item_matrix.applymap(lambda x: np.random.randint(1, 6) if pd.isna(x) else x)


8x8 User-Item Matrix with 3-4 Random Null Values:
movieId  230   6062  71    733   99    252   1347  8533
userId                                                 
506       1.0   1.0   1.0   1.0   4.0   4.0   2.0   4.0
514       4.0   2.0   2.0   5.0   1.0   2.0   2.0   1.0
67        2.0   5.0   5.0   4.0   4.0   3.0   2.0   3.0
75        2.0   4.0   2.0   5.0   3.0   2.0   1.0   1.0
129       2.0   5.0   1.0   4.0   2.0   5.0   NaN   5.0
495       3.0   NaN   3.0   4.0   NaN   2.0   5.0   3.0
15        3.0   2.0   4.0   1.0   4.0   4.0   5.0   1.0
535       3.0   2.0   3.0   3.0   2.0   2.0   4.0   3.0


In [3]:
import pandas as pd
import numpy as np

# Example: Creating a user-item matrix with random ratings (1-5)
user_item_matrix = pd.DataFrame(
    np.random.randint(1, 6, size=(5, 5)), 
    columns=['Item1', 'Item2', 'Item3', 'Item4', 'Item5'], 
    index=['User1', 'User2', 'User3', 'User4', 'User5']
)

# Optional: Add some missing values to simulate real-world data
user_item_matrix.iloc[1, 2] = np.nan
user_item_matrix.iloc[3, 0] = np.nan
print(user_item_matrix)


       Item1  Item2  Item3  Item4  Item5
User1    3.0      2    5.0      2      5
User2    5.0      1    NaN      5      2
User3    4.0      2    4.0      5      3
User4    NaN      1    5.0      1      2
User5    2.0      1    2.0      2      3


In [4]:
# Calculate the average rating for each item
average_ratings = user_item_matrix.mean(axis=0)
print("Average Ratings per Item:")
print(average_ratings)

# Calculate the overall average rating
overall_average = user_item_matrix.stack().mean()  # Excludes NaN values automatically
print("Overall Average Rating:", overall_average)


Average Ratings per Item:
Item1    3.5
Item2    1.4
Item3    4.0
Item4    3.0
Item5    3.0
dtype: float64
Overall Average Rating: 2.9130434782608696


In [5]:
import pandas as pd
import numpy as np

# Sample data representing the movie ratings by each user
data = {
    '53000': [5.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, 4.0],
    '2539': [4.0, 1.0, 2.0, 5.0, 4.0, 5.0, 5.0, 4.0],
    '2455': [2.0, 3.0, 4.0, 2.0, 5.0, 2.0, 5.0, np.nan],
    '215': [np.nan, 1.0, 2.0, 1.0, 2.0, 4.0, 3.0, 1.0],
    '49013': [1.0, 2.0, 2.0, 2.0, 2.0, 4.0, np.nan, 5.0],
    '1033': [np.nan, 3.0, 3.0, 5.0, 2.0, 1.0, 5.0, 4.0],
    '786': [3.0, 3.0, 3.0, 1.0, 3.0, 1.0, 1.0, 1.0],
    '833': [3.0, 3.0, 3.0, 1.0, 5.0, 4.0, 2.0, 4.0]
}

# Create DataFrame from the sample data with user IDs as index
user_ids = [533, 244, 651, 653, 210, 410, 320, 292]
ratings_df = pd.DataFrame(data, index=user_ids)

# Calculate the average rating for each user, ignoring NaN values
ratings_df['average_rating'] = ratings_df.mean(axis=1)

# Display the result
print(ratings_df[['average_rating']])


     average_rating
533        3.000000
244        2.375000
651        2.750000
653        2.750000
210        3.375000
410        2.750000
320        3.714286
292        3.285714


In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

# Updated user-item rating matrix
data = {
    'movieId': [53000, 2539, 2455, 215, 49013, 1033, 786, 833],
    'user_533': [5.0, 4.0, 2.0, np.nan, 1.0, np.nan, 3.0, 3.0],
    'user_244': [3.0, 1.0, 3.0, 1.0, 2.0, 3.0, 3.0, 3.0],
    'user_651': [3.0, 2.0, 4.0, 2.0, 2.0, 3.0, 3.0, 3.0],
    'user_653': [5.0, 5.0, 2.0, 1.0, 2.0, 5.0, 1.0, 1.0],
    'user_210': [4.0, 4.0, 5.0, 2.0, 2.0, 2.0, 3.0, 5.0],
    'user_410': [1.0, 5.0, 2.0, 4.0, 4.0, 1.0, 4.0, 4.0],
    'user_320': [5.0, 5.0, 5.0, 3.0, np.nan, 5.0, 1.0, 2.0],
    'user_292': [4.0, 4.0, np.nan, 1.0, 5.0, 4.0, 1.0, 4.0]
}
df = pd.DataFrame(data).set_index('movieId')

# Calculate Cosine Similarity for users
cosine_sim = cosine_similarity(df.T.fillna(0))
cosine_sim_df = pd.DataFrame(cosine_sim, index=df.columns, columns=df.columns)

# Function to calculate Pearson correlation for each user pair
def pearson_corr_matrix(df):
    user_corr = pd.DataFrame(index=df.columns, columns=df.columns)
    for u in df.columns:
        for v in df.columns:
            user_corr.loc[u, v] = pearsonr(df[u].fillna(0), df[v].fillna(0))[0]
    return user_corr

pearson_sim_df = pearson_corr_matrix(df)

# Prediction function for User-Based Collaborative Filtering
def predict_user_based(user, item_id, similarity_matrix, ratings_df, method='cosine'):
    if method == 'cosine':
        sim_scores = similarity_matrix[user]
    elif method == 'pearson':
        sim_scores = similarity_matrix[user]
    
    rated_by_other_users = ratings_df.loc[item_id].dropna()
    weighted_sum = np.dot(sim_scores[rated_by_other_users.index], rated_by_other_users)
    sum_of_weights = np.abs(sim_scores[rated_by_other_users.index]).sum()
    
    return weighted_sum / sum_of_weights if sum_of_weights != 0 else np.nan

# Generate predictions for a specific user and item using both similarity measures
user = 'user_533'
item_id = 2455  # movie ID example

cosine_pred = predict_user_based(user, item_id, cosine_sim_df, df, method='cosine')
pearson_pred = predict_user_based(user, item_id, pearson_sim_df, df, method='pearson')

print(f"Predicted rating for {user} on item {item_id} using Cosine Similarity: {cosine_pred}")
print(f"Predicted rating for {user} on item {item_id} using Pearson Correlation: {pearson_pred}")

# Top-N Recommendations
def top_n_recommendations(user, n, similarity_matrix, ratings_df, method='cosine'):
    preds = []
    for item_id in ratings_df.index:
        if pd.isna(ratings_df.loc[item_id, user]):  # Only predict for items not yet rated
            pred_rating = predict_user_based(user, item_id, similarity_matrix, ratings_df, method)
            preds.append((item_id, pred_rating))
    preds = sorted(preds, key=lambda x: x[1], reverse=True)
    return preds[:n]

top_n_cosine = top_n_recommendations(user, 3, cosine_sim_df, df, method='cosine')
top_n_pearson = top_n_recommendations(user, 3, pearson_sim_df, df, method='pearson')

print(f"Top-N Recommendations for {user} using Cosine Similarity: {top_n_cosine}")
print(f"Top-N Recommendations for {user} using Pearson Correlation: {top_n_pearson}")


Predicted rating for user_533 on item 2455 using Cosine Similarity: 3.274424836891068
Predicted rating for user_533 on item 2455 using Pearson Correlation: 3.226854675967426
Top-N Recommendations for user_533 using Cosine Similarity: [(1033, 3.260398314864968), (215, 1.9856102416780501)]
Top-N Recommendations for user_533 using Pearson Correlation: [(1033, 3.260112461472429), (215, 1.778609830632975)]
