In [11]:
import pandas as pd

def generate_user_profile(ratings_file, genres_file, output_file=None):
    """Build a normalized genre-preference profile for every rated user.

    Each rating row is joined to its course's genre indicator columns, the
    indicators are weighted by the rating, averaged per user, and the
    resulting vector is scaled so that its entries sum to 1.

    Parameters
    ----------
    ratings_file : str or path-like
        CSV read with ``pd.read_csv``; must contain the columns ``user``,
        ``item`` and ``rating``.
    genres_file : str or path-like
        CSV read with ``pd.read_csv``; must contain ``COURSE_ID``. Every
        column from the third one onward is treated as a genre column.
    output_file : str or path-like, optional
        When truthy, the profile table is also written to this path as CSV
        (without the index).

    Returns
    -------
    pandas.DataFrame
        One row per user (sorted by ``user``) with a ``user`` column followed
        by one column per genre. A user whose weighted genre values sum to
        zero (e.g. all of their rated courses carry no genre flag) gets an
        all-zero profile instead of NaN.

    Notes
    -----
    The join is an inner merge, so ratings whose ``item`` has no matching
    ``COURSE_ID`` are dropped before aggregation.
    """
    ratings_df = pd.read_csv(ratings_file)
    course_genres_df = pd.read_csv(genres_file)

    # Genre columns are taken positionally: everything after the first two
    # columns of the genres file.
    genre_columns = list(course_genres_df.columns[2:])

    # Attach the genre indicators of each rated course to its rating row.
    merged_df = pd.merge(
        ratings_df, course_genres_df,
        left_on='item', right_on='COURSE_ID'
    )

    # Weight every genre indicator by the row's rating in a single vectorized
    # step; the merged frame itself is left untouched.
    weighted_genres = merged_df[genre_columns].mul(merged_df['rating'], axis=0)

    # Average the weighted indicators per user.
    user_profiles = (
        weighted_genres.groupby(merged_df['user']).mean().reset_index()
    )

    # Scale each user's vector to sum to 1. A zero total would turn the whole
    # row into NaN (0 / 0), so those totals are masked out before dividing and
    # the affected rows are reported as all zeros.
    genre_totals = user_profiles[genre_columns].sum(axis=1)
    safe_totals = genre_totals.where(genre_totals != 0)
    user_profiles[genre_columns] = (
        user_profiles[genre_columns].div(safe_totals, axis=0).fillna(0.0)
    )

    # Persist the table only when a destination was supplied.
    if output_file:
        user_profiles.to_csv(output_file, index=False)

    return user_profiles

# Input and output locations for the profile build
ratings_file = 'ratings.csv'
genres_file = 'course_genre.csv'
output_file = 'user_profile.csv'  # a falsy value skips writing the CSV

# Build the per-user genre profiles; the table is also saved to output_file
profile_df = generate_user_profile(
    ratings_file=ratings_file,
    genres_file=genres_file,
    output_file=output_file,
)

# Leave the table as the last expression so the notebook renders it
profile_df

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,2,0.179310,0.048276,0.020690,0.148276,0.010345,0.113793,0.0,0.100000,0.141379,0.006897,0.062069,0.117241,0.031034,0.020690
1,4,0.273973,0.013699,0.027397,0.191781,0.000000,0.095890,0.0,0.136986,0.164384,0.000000,0.041096,0.041096,0.000000,0.013699
2,5,0.125000,0.041667,0.093750,0.125000,0.000000,0.156250,0.0,0.114583,0.072917,0.010417,0.072917,0.135417,0.020833,0.031250
3,7,0.333333,0.000000,0.000000,0.333333,0.000000,0.000000,0.0,0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000
4,8,0.333333,0.000000,0.000000,0.222222,0.000000,0.000000,0.0,0.000000,0.333333,0.000000,0.111111,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33923,2103062,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
33924,2103063,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
33925,2103064,0.000000,0.142857,0.000000,0.000000,0.000000,0.142857,0.0,0.142857,0.000000,0.000000,0.000000,0.285714,0.285714,0.000000
33926,2103065,0.250000,0.000000,0.000000,0.000000,0.000000,0.250000,0.0,0.000000,0.250000,0.000000,0.000000,0.250000,0.000000,0.000000


In [17]:
# Sanity check: the profile table should cover as many distinct users as the
# ratings file does.
ratings_df = pd.read_csv(ratings_file)
print("Ratings DataFrame unique users:", ratings_df['user'].nunique(), sep="\n")
print("Profile DataFrame unique users:", profile_df['user'].nunique(), sep="\n")

Ratings DataFrame unique users:
33928
Profile DataFrame unique users:
33928


In [40]:
# Load the course/genre table into a notebook-level frame so the following
# cells can filter and score courses.
genres_file = 'course_genre.csv'
course_genres_df = pd.read_csv(filepath_or_buffer=genres_file)

In [42]:
# Ratings belonging to the selected user (user_id must already be defined)
user_ratings = ratings_df.loc[ratings_df['user'] == user_id]
enrolled_course_ids = user_ratings['item'].tolist()

# Every catalogued course minus the ones the user has already rated
all_courses = set(course_genres_df['COURSE_ID'])
unknown_courses = all_courses - set(enrolled_course_ids)

# Genre rows for the courses that are still unseen by this user
unknown_course_df = course_genres_df.loc[
    course_genres_df['COURSE_ID'].isin(unknown_courses)
]

In [43]:
import numpy as np

# user_vector is expected to exist already (it is not defined in this cell)
print("User vector:", user_vector)

# Genre matrix of the unseen courses: drop the first two columns
# (COURSE_ID and TITLE) and keep only the genre indicators.
course_genres = unknown_course_df.iloc[:, 2:].to_numpy()

# Quick inspection of the matrix before scoring
print("Course genres matrix shape:", course_genres.shape)
print("First course genre vector:", course_genres[0])

# One score per unseen course: dot product of its genre row with the user vector
recommendation_scores = np.dot(course_genres, user_vector)

# Report every course whose score reaches the cut-off (tune as needed)
threshold = 0.2
for course_id, score in zip(unknown_course_df.iloc[:, 0], recommendation_scores):
    if score >= threshold:
        print(f"Recommended course ID: {course_id} with score: {score}")


User vector: [0.         0.14285714 0.         0.         0.         0.14285714
 0.         0.14285714 0.         0.         0.         0.28571429
 0.28571429 0.        ]
Course genres matrix shape: (304, 14)
First course genre vector: [1 0 0 1 0 0 0 0 1 0 1 0 0 0]
Recommended course ID: GPXX0Z2PEN with score: 0.2857142857142857
Recommended course ID: GPXX0SDXEN with score: 0.2857142857142857
Recommended course ID: DX0108EN with score: 0.2857142857142857
Recommended course ID: GPXX0W7KEN with score: 0.2857142857142857
Recommended course ID: GPXX0QR3EN with score: 0.2857142857142857
Recommended course ID: DE0205EN with score: 0.2857142857142857
Recommended course ID: OS0101EN with score: 0.5714285714285714
Recommended course ID: CC0210EN with score: 0.2857142857142857
Recommended course ID: PA0103EN with score: 0.2857142857142857
Recommended course ID: HCC104EN with score: 0.2857142857142857
Recommended course ID: GPXX0A1YEN with score: 0.2857142857142857
Recommended course ID: PA0107EN