In [1]:
# necessary libraries.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df = pd.read_csv("Preprocessed_recommendation.csv")
df.sample(3)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,...,Description_198,Description_199,Reviews_Sentiment,Device_Desktop,Device_Mobile,Device_Tablet,Time_Afternoon,Time_Evening,Time_Morning,Time_Night
1506,U02903,Item00368,4,0.473684,0.836735,0.315228,0.995626,1,0.137255,0,...,0.0,0.0,-0.071429,0,1,0,0,0,0,1
2070,U03672,Item00366,3,0.947368,0.591837,0.948461,0.61006,0,0.411765,1,...,0.0,0.0,0.07,0,1,0,0,0,0,1
388,U01920,Item00879,2,0.684211,0.632653,0.357898,0.288586,0,0.215686,1,...,0.0,0.0,0.308333,0,1,0,0,0,0,1


In [3]:
x= df.isnull().sum()
x                # no null values are present 

UserID            0
ItemID            0
Rating            0
Clicks            0
Views             0
                 ..
Device_Tablet     0
Time_Afternoon    0
Time_Evening      0
Time_Morning      0
Time_Night        0
Length: 345, dtype: int64

# Now we can normalize continues feature

In [4]:
scaler = MinMaxScaler()
continous_features = df.select_dtypes(include=['float64','int64']).columns
df[continous_features]  = scaler.fit_transform(df[continous_features])

# Dimensionality reduction
- We can use PCA(Principal component analysis) for dimensionality redunction because there are 345 feature and
  
- **Reduce Complexity:** Lower the number of features, which reduces computation time and the risk of overfitting.
- **Maximize Variance:** Retain the most important information by capturing the main patterns in the data with fewer components.
- **Improve Model Performance:** Make models faster and often more accurate by focusing only on the essential components, leading to better generalization.

In [5]:
item_features = df.drop(columns=['UserID', 'ItemID', 'Rating'])

In [6]:
from sklearn.preprocessing import LabelEncoder

# Apply label encoding to each categorical column
for col in item_features.select_dtypes(include=['object']).columns:
    item_features[col] = LabelEncoder().fit_transform(item_features[col])


In [7]:
pca = PCA(n_components=5)  # Adjust to the number of components for 90-95% variance
item_features_pca = pca.fit_transform(item_features)
print(f"Explained variance ratio with 5 components: {sum(pca.explained_variance_ratio_):.2f}")


Explained variance ratio with 5 components: 0.96


In [8]:
# Convert PCA result back to DataFrame for easy access
item_features_pca_df = pd.DataFrame(item_features_pca, index=df.index)


# Now we convert PCA result back to DataFrame for easy access

In [9]:
item_features_pca_df = pd.DataFrame(item_features_pca,index=df.index)

In [10]:
item_features_pca_df

Unnamed: 0,0,1,2,3,4
0,2496.621891,10.498786,1945.577791,1301.166918,-21.856641
1,203.101736,-1177.197853,51.504451,1591.455389,-929.703596
2,1112.132521,-1234.369673,-2840.340935,1265.087837,1074.737810
3,652.768128,1714.853616,460.724570,-2218.782685,1106.616334
4,2229.785354,1948.417039,326.098225,-278.524300,286.384213
...,...,...,...,...,...
4995,3397.949900,-2127.358355,298.116971,-1088.492689,-313.120495
4996,2060.128061,-2680.005829,704.693090,-476.671509,-636.708697
4997,47.981419,-184.856901,-1975.865660,502.256177,87.907276
4998,-2847.493594,-328.805356,1505.495968,-393.788029,-705.593101


# Now we can Splitting the data

In [11]:
train_data,test_data = train_test_split(df,test_size=0.2,random_state=42)

In [12]:
train_data.head(3)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,...,Description_198,Description_199,Reviews_Sentiment,Device_Desktop,Device_Mobile,Device_Tablet,Time_Afternoon,Time_Evening,Time_Morning,Time_Night
4227,U01536,Item00912,0.0,0.315789,0.714286,0.628548,0.392632,1.0,0.686275,0.5,...,0.0,0.0,0.522876,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4676,U04958,Item00078,0.0,0.789474,0.0,0.686847,0.336698,0.0,0.647059,0.0,...,0.0,0.0,0.471989,0.0,1.0,0.0,0.0,0.0,1.0,0.0
800,U00925,Item00403,0.0,0.052632,0.77551,0.176573,0.834385,1.0,0.196078,0.5,...,0.0,0.0,0.438503,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [13]:
test_data.head(3)

Unnamed: 0,UserID,ItemID,Rating,Clicks,Views,TimeSpentOnItem,SessionDuration,DeviceType,Age,Gender,...,Description_198,Description_199,Reviews_Sentiment,Device_Desktop,Device_Mobile,Device_Tablet,Time_Afternoon,Time_Evening,Time_Morning,Time_Night
1501,U01129,Item00601,0.5,0.526316,0.571429,0.360274,0.318782,1.0,0.72549,0.5,...,0.0,0.0,0.509804,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2586,U04213,Item00999,1.0,0.684211,0.591837,0.125268,0.168391,0.0,0.431373,0.0,...,0.0,0.0,0.411765,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2653,U01608,Item00907,0.25,1.0,0.836735,0.264157,0.066532,1.0,0.705882,0.5,...,0.0,0.0,0.411765,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# Building the Hybrid Recommender System

### Now first we can apply collaborative Filtering(User-item Matrix)


In [14]:
# Aggregate ratings for each UserID-ItemID pair (e.g., by taking the mean of duplicate ratings)
train_data_agg = train_data.groupby(['UserID', 'ItemID'])['Rating'].mean().reset_index()

# Now pivot the table to create the user-item matrix
user_item_matrix = train_data_agg.pivot(index='UserID', columns='ItemID', values='Rating').fillna(0)


In [15]:
user_item_matrix   # The user_item_matrix you see is a pivoted table where rows represent users and columns represent items. 
                    # The values in the matrix represent the ratings that each user has given to each item,

ItemID,Item00001,Item00002,Item00003,Item00004,Item00005,Item00006,Item00007,Item00008,Item00009,Item00010,...,Item00990,Item00991,Item00992,Item00993,Item00994,Item00995,Item00996,Item00998,Item00999,Item01000
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U04993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U04997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Convert user_item_matrix to a sparse matrix (CSR format)
user_item_csr = csr_matrix(user_item_matrix.values)

In [17]:
# Fit NearestNeighbors model for user-based collaborative filtering
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(user_item_csr)


# Content-Based Filtering (Item Similarity Based on PCA Features)



In [18]:
# we can calculate item similarity using PCA-transformed features
item_similarity_matrix = cosine_similarity(item_features_pca_df)
item_similarity_df = pd.DataFrame(item_similarity_matrix,index=item_features_pca_df.index,columns=item_features_pca_df.index)

In [19]:
item_similarity_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1.0,0.357239,-0.090373,-0.034394,0.569024,-0.373776,0.425822,0.474886,-0.423964,-0.8659,...,0.941084,-0.569805,0.079467,0.573533,0.567545,0.533929,0.484882,-0.438373,-0.409839,0.091635
1,0.357239,1.0,0.314531,-0.937306,-0.383663,-0.485199,-0.333674,-0.0616,-0.161287,-0.663013,...,0.279539,-0.412438,0.185232,0.458305,0.832386,0.192749,0.442264,0.187259,-0.011516,0.868516
2,-0.090373,0.314531,1.0,-0.375576,-0.080887,-0.46887,-0.275544,-0.896554,-0.336139,0.034082,...,-0.099384,-0.201248,-0.557128,0.286956,0.537787,0.249763,0.176909,0.876866,-0.674331,0.239791


# now we Making Recommendations

### First we create a Collaborative Recommendation Function

In [20]:
# Collaborative Recommendations Function with index check
def collaborative_recommendations(user_id, num_recommendations=5):
    # Check if user_id exists in the user_item_matrix
    if user_id not in user_item_matrix.index:
        print(f"Skipping user_id: {user_id} as they are not in the matrix.")
        return []
    
    user_index = user_item_matrix.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=min(num_recommendations + 1, user_item_matrix.shape[0]))
    rec_indices = indices.flatten()[1:]  # Skip the user itself
    
    # Ensure indices are within bounds
    rec_indices = [i for i in rec_indices if i < len(user_item_matrix.columns)]
    
    return user_item_matrix.columns[rec_indices].tolist()

### Second we create a content-based Recommendation function

In [21]:
def content_based_recommendations(item_id, num_recommendations=5):
    # Check if the item exists in the item similarity matrix
    if item_id not in item_similarity_df.index:
        return []  # Return empty list if item doesn't exist
    
    # Get the most similar items based on the item similarity matrix
    rec_items = item_similarity_df.loc[item_id].sort_values(ascending=False).iloc[1:num_recommendations + 1].index
    
    return rec_items.tolist()

### Third we can create a Hybrid recommendation Function

In [22]:
def hybrid_recommendations(user_id, item_id, num_recommendations=5):
    # Get collaborative recommendations (half of the total recommendations)
    collab_recs = collaborative_recommendations(user_id, num_recommendations // 2)
    
    # Get content-based recommendations (half of the total recommendations)
    content_recs = content_based_recommendations(item_id, num_recommendations // 2)
    
    # Combine and return unique recommendations from both methods
    return list(set(collab_recs + content_recs))

# last Step we can Evaluating the model

In [23]:
popular_items = train_data.groupby('ItemID')['Rating'].mean().sort_values(ascending=False).index[:5].tolist()

In [24]:
# Evaluate Model Function with Presence Check and Default Rating
def evaluate_model(test_data, num_recommendations=5):
    errors = []
    for _, row in test_data.iterrows():
        user_id = row['UserID']
        item_id = row['ItemID']
        true_rating = row['Rating']
        
        # Check if user_id exists in user_item_matrix
        if user_id not in user_item_matrix.index:
            # Cold-start: Recommend popular items
            recommended_items = popular_items
            print(f"Cold-start for user_id: {user_id}. Recommending popular items.")
        else:
            # Check if item_id exists in the user_item_matrix before accessing
            if item_id in user_item_matrix.columns:
                # Get hybrid recommendations for existing users and items
                recommended_items = hybrid_recommendations(user_id, item_id, num_recommendations)
            else:
                print(f"Item cold-start for item_id: {item_id} with user_id: {user_id}.")
                continue

        # Predict rating if the item is in the recommendations
        if item_id in recommended_items and user_id in user_item_matrix.index and item_id in user_item_matrix.columns:
            predicted_rating = user_item_matrix.loc[user_id, item_id]
        else:
            # Default to global average rating if prediction is unavailable
            predicted_rating = user_item_matrix[user_item_matrix > 0].mean().mean()

        # Compute error for the prediction
        errors.append((true_rating - predicted_rating) ** 2)
    
    # Calculate MSE
    mse = np.mean(errors) if errors else np.nan
    return mse



In [25]:
# Assuming mse_score is already calculated
mse_score = evaluate_model(test_data)  # Replace with your MSE calculation function

# Calculate RMSE
rmse_score = np.sqrt(mse_score)
print(f"Mean Squared Error of the model: {mse_score}")
print(f"Root Mean Squared Error of the model: {rmse_score}")

Cold-start for user_id: U04213. Recommending popular items.
Cold-start for user_id: U01951. Recommending popular items.
Cold-start for user_id: U02997. Recommending popular items.
Cold-start for user_id: U04770. Recommending popular items.
Cold-start for user_id: U04635. Recommending popular items.
Cold-start for user_id: U04474. Recommending popular items.
Cold-start for user_id: U03436. Recommending popular items.
Cold-start for user_id: U02842. Recommending popular items.
Cold-start for user_id: U03714. Recommending popular items.
Cold-start for user_id: U04626. Recommending popular items.
Cold-start for user_id: U02365. Recommending popular items.
Cold-start for user_id: U03363. Recommending popular items.
Cold-start for user_id: U00225. Recommending popular items.
Cold-start for user_id: U00078. Recommending popular items.
Cold-start for user_id: U00960. Recommending popular items.
Cold-start for user_id: U01322. Recommending popular items.
Cold-start for user_id: U00560. Recommen

- In conclusion, my hybrid recommendation system’s performance was evaluated using **Mean Squared Error (MSE)** and **Root Mean Squared Error (RMSE)** metrics. The model achieved an MSE of **0.1431** and an RMSE of **0.3784**, indicating that, on average, the recommendations are close to the actual user ratings. These low error values demonstrate a high level of accuracy in the model’s predictions, effectively aligning with user preferences and supporting its reliability for practical deployment.