## Content-based Course Recommender System using Course Similarities

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%matplotlib inline

# NOTE(review): presumably a random seed ("random state"); no cell shown here reads it — confirm it is still needed.
rs = 123

In [2]:
# Load the course catalogue; the preview below shows COURSE_ID, TITLE and DESCRIPTION columns.
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
course_df = pd.read_csv(course_url)
course_df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [3]:
# Load the per-course token table; each previewed row pairs a doc_index/doc_id with a token and a `bow` value
# (presumably the token's occurrence count in that course text — confirm against the dataset description).
bow_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/courses_bows.csv"
bow_df = pd.read_csv(bow_url)
bow_df.head()

Unnamed: 0,doc_index,doc_id,token,bow
0,0,ML0201EN,ai,2
1,0,ML0201EN,apps,2
2,0,ML0201EN,build,2
3,0,ML0201EN,cloud,1
4,0,ML0201EN,coming,1


In [4]:
# Load the precomputed course-to-course similarity table. In the preview below the columns are
# labelled 0..306 and the diagonal entries are 1.0.
sim_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/sim.csv"

similarity_df = pd.read_csv(sim_url)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,297,298,299,300,301,302,303,304,305,306
0,1.000000,0.088889,0.088475,0.065556,0.048810,0.104685,0.065202,0.143346,0.000000,0.024405,...,0.012695,0.070225,0.058224,0.046610,0.025850,0.033944,0.076825,0.072898,0.039276,0.121113
1,0.088889,1.000000,0.055202,0.057264,0.012182,0.078379,0.032545,0.119251,0.044162,0.000000,...,0.180593,0.124631,0.087187,0.093060,0.019354,0.028239,0.063911,0.138270,0.031367,0.076940
2,0.088475,0.055202,1.000000,0.026463,0.039406,0.000000,0.000000,0.154303,0.000000,0.000000,...,0.040996,0.037796,0.013430,0.037630,0.000000,0.018270,0.082698,0.133400,0.012684,0.000000
3,0.065556,0.057264,0.026463,1.000000,0.000000,0.250490,0.390038,0.000000,0.000000,0.000000,...,0.151882,0.420084,0.427908,0.055764,0.000000,0.094759,0.030638,0.017443,0.018796,0.158073
4,0.048810,0.012182,0.039406,0.000000,1.000000,0.000000,0.000000,0.085126,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.096877,0.000000,0.060474,0.030415,0.129871,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,0.033944,0.028239,0.018270,0.094759,0.060474,0.064851,0.053856,0.039467,0.036539,0.060474,...,0.047186,0.116008,0.137406,0.211743,0.256220,1.000000,0.211520,0.168595,0.129766,0.263734
303,0.076825,0.063911,0.082698,0.030638,0.030415,0.000000,0.000000,0.119098,0.055132,0.045622,...,0.177989,0.092381,0.145126,0.527636,0.249675,0.211520,1.000000,0.242269,0.416067,0.178384
304,0.072898,0.138270,0.133400,0.017443,0.129871,0.009285,0.000000,0.254274,0.094165,0.025974,...,0.153128,0.105191,0.138692,0.281108,0.132977,0.168595,0.242269,1.000000,0.220159,0.128902
305,0.039276,0.031367,0.012684,0.018796,0.000000,0.015008,0.024926,0.082199,0.076102,0.055978,...,0.149234,0.093962,0.162170,0.476644,0.222350,0.129766,0.416067,0.220159,1.000000,0.126274


In [5]:
# Convert the similarity table to a plain NumPy array so it can be indexed by integer row/column number.
sim_matrix = similarity_df.to_numpy()

In [6]:
# The course catalogue was already downloaded into `course_df` by the first loading cell of this
# notebook, so it is reused here instead of fetching the same CSV a second time.
course_df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [7]:
# Load the test-set CSV for the content-based recommender (rs_content_test.csv).
# NOTE(review): the schema is not displayed here; later code relies on a `user` column.
test_users_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/rs_content_test.csv"
test_users_df = pd.read_csv(test_users_url)

In [8]:
# Collapse the test set to one row per user (column-wise max within each user),
# then collect the distinct user ids from the resulting `user` column.
per_user_max = test_users_df.groupby('user').max()
test_users = per_user_max.reset_index()
test_user_ids = test_users['user'].tolist()
print(f"Total numbers of test users {len(test_user_ids)}")

Total numbers of test users 1000


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_course_recommendations(user_enrolled_courses, sim_matrix, course_df, score_threshold=11, top_n=5):
    """Recommend unseen courses ranked by mean similarity to a user's enrolled courses.

    For every course, its similarity to each enrolled course is averaged. Courses
    the user is not enrolled in are then returned best-first, keeping only those
    whose average is strictly greater than ``score_threshold`` and at most
    ``top_n`` of them.

    Parameters
    ----------
    user_enrolled_courses : iterable
        COURSE_ID values the user is already enrolled in. An id listed more
        than once contributes once per occurrence to the average.
    sim_matrix : numpy.ndarray
        2-D similarity array. Row/column numbers are the index labels of
        ``course_df`` (identical to row positions for a default RangeIndex).
    course_df : pandas.DataFrame
        Course catalogue with ``COURSE_ID``, ``TITLE`` and ``DESCRIPTION``
        columns. When a COURSE_ID appears several times, the first row wins
        for the similarity lookup.
    score_threshold : float, default 11
        Exclusive lower bound on the averaged similarity.
        NOTE(review): the similarity values shown in this notebook never
        exceed 1.0, so the default of 11 filters out every course; it looks
        like a typo, but is kept unchanged for backward compatibility.
        Callers should pass an explicit threshold.
    top_n : int or None, default 5
        Maximum number of recommendations. ``None`` means no cap; a value
        of 0 or less yields an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``COURSE_ID``, ``TITLE``, ``DESCRIPTION``, ``SCORE``, ordered
        by descending score. Empty when nothing qualifies or when
        ``user_enrolled_courses`` is empty.

    Raises
    ------
    IndexError
        If any enrolled course id is not present in ``course_df['COURSE_ID']``.
    """
    result_columns = ['COURSE_ID', 'TITLE', 'DESCRIPTION', 'SCORE']
    user_enrolled_courses = list(user_enrolled_courses)

    # Build the id -> index-label map once instead of rescanning the whole
    # COURSE_ID column for every enrolled course. setdefault keeps the first
    # occurrence, matching a "first match" lookup.
    label_by_course_id = {}
    for label, known_id in zip(course_df.index.tolist(), course_df['COURSE_ID'].tolist()):
        label_by_course_id.setdefault(known_id, label)

    unknown_ids = [cid for cid in user_enrolled_courses if cid not in label_by_course_id]
    if unknown_ids:
        # Same exception type a failed lookup produced before, but naming the culprit(s).
        raise IndexError(f"COURSE_ID(s) not found in course_df: {unknown_ids}")

    uncapped = top_n is None
    if not user_enrolled_courses or (not uncapped and top_n <= 0):
        # Nothing to average over, or nothing requested: skip the mean of an
        # empty slice (NaN + RuntimeWarning) and return an empty result.
        return pd.DataFrame([], columns=result_columns)

    enrolled_indices = [label_by_course_id[cid] for cid in user_enrolled_courses]
    avg_similarity = sim_matrix[enrolled_indices].mean(axis=0)
    enrolled_course_ids = set(user_enrolled_courses)

    recommended_courses = []
    # argsort is ascending (NaNs last); reversing yields best-first with any NaNs up front.
    for idx in avg_similarity.argsort()[::-1]:
        if not uncapped and len(recommended_courses) >= top_n:
            break
        similarity_score = avg_similarity[idx]
        if similarity_score <= score_threshold:
            # Scores only decrease from here on, so no later course can qualify.
            break
        if not similarity_score > score_threshold:
            # NaN score: neither above nor below the threshold; skip it.
            continue
        course_id = course_df.loc[idx, 'COURSE_ID']
        if course_id in enrolled_course_ids:
            continue
        recommended_courses.append(
            (course_id, course_df.loc[idx, 'TITLE'], course_df.loc[idx, 'DESCRIPTION'], similarity_score)
        )

    return pd.DataFrame(recommended_courses, columns=result_columns)


In [10]:
# Example 1: up to 20 not-yet-enrolled courses whose mean similarity to these four courses exceeds 0.5.
u1_recommended_courses_df = generate_course_recommendations(
    user_enrolled_courses=["excourse81", "BD0145EN", "excourse83", "excourse85"],
    sim_matrix=sim_matrix,
    course_df=course_df,
    score_threshold=0.5,
    top_n=20,
)
u1_recommended_courses_df

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION,SCORE
0,excourse37,data analysis with r programming,this course is the seventh course in the googl...,0.536843
1,excourse82,getting started with data visualization in r,data visualization is a critical skill for any...,0.529153


In [11]:
# Example 2: up to 20 not-yet-enrolled courses whose mean similarity to these three courses exceeds 0.5.
# Requires `sim_matrix` and `course_df` from the loading cells above.
u2_recommended_courses_df = generate_course_recommendations(
    user_enrolled_courses=["excourse74", "excourse76", "excourse75"],
    sim_matrix=sim_matrix,
    course_df=course_df,
    score_threshold=0.5,
    top_n=20,
)
u2_recommended_courses_df

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION,SCORE
0,excourse67,introduction to big data,interested in increasing your knowledge of the...,0.629219
1,excourse68,big data modeling and management systems,once you ve identified a big data issue to ana...,0.608453
2,excourse72,foundations for big data analysis with sql,in this course you ll get a big picture view ...,0.578177
3,BD0101EN,big data 101,how big is big and why does big matter and wha...,0.576971
4,excourse32,introduction to data analytics,this course presents a gentle introduction int...,0.542317
5,excourse70,big data capstone project,welcome to the capstone project for big data ...,0.502349


In [12]:
# Example 3: up to 20 not-yet-enrolled courses whose mean similarity to these four courses exceeds 0.5.
u3_recommended_courses_df = generate_course_recommendations(
    user_enrolled_courses=["DX0108EN", "DS0103EN", "RP0105EN", "DX0106EN"],
    sim_matrix=sim_matrix,
    course_df=course_df,
    score_threshold=0.5,
    top_n=20,
)
u3_recommended_courses_df

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION,SCORE
0,TMP0106,data science bootcamp,a multi day intensive in person data science ...,0.58627
1,DX0107EN,data science bootcamp with python for universi...,data science bootcamp with python for universi...,0.547223
2,TMP107,data science bootcamp with python,data science bootcamp with python,0.523785
3,DS0110EN,data science with open data,data science with open data,0.50023


In [13]:
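# Optional: save each user's recommendations to a CSV file (uncomment to run).
# Each DataFrame gets its own file; writing all three to 'recommended_courses.csv'
# would leave only the last one on disk.
# u1_recommended_courses_df.to_csv('u1_recommended_courses.csv', index=False)
# u2_recommended_courses_df.to_csv('u2_recommended_courses.csv', index=False)
# u3_recommended_courses_df.to_csv('u3_recommended_courses.csv', index=False)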