In [1]:
import os

# Move the working directory two levels up; later cells read and write files
# through "./data/..." paths that are relative to the working directory.
# NOTE(review): this is not idempotent - every re-run of the cell climbs two
# more levels, so restart the kernel before running it again.
os.chdir("../../")
# Last expression of the cell: shows the resulting working directory.
os.getcwd()

'/home/sigmoid/developer/ml_projects/personalcoursebuilder'

In [14]:
import re
from datetime import timedelta

def _duration_component(unit, time_str):
    """Return the integer amount found for ``unit`` in ``time_str`` (0 if absent).

    Only the first occurrence of the unit is used. The amount may be a single
    number ("6 hours") or a range ("4-6 hours"); a range is reduced to the
    floor of the average of its two bounds.
    """
    match = re.search(rf"(\d+)-?(\d+)?\s*{unit}s?", time_str)
    if not match:
        return 0
    low = int(match.group(1))
    if match.group(2):
        return (low + int(match.group(2))) // 2
    return low


def parse_time_string(time_str):
    """Convert a free-text course duration into a number of hours.

    Parameters
    ----------
    time_str : str
        Text such as "6 hours 5 minutes", "12 weeks" or "4-6 hours".
        Non-string values (``None``, or the float NaN that pandas yields for
        a missing cell) are accepted and treated as "no duration".

    Returns
    -------
    int or float
        ``0`` when the value is not a string or contains "On-Demand";
        otherwise weeks * 7 * 24 + days * 24 + hours + minutes / 60.

    Notes
    -----
    Weeks and days are converted as full calendar time (24 hours per day).
    Only integer amounts are recognised: in "1.5 hours" the pattern matches
    "5 hours".
    """
    # Missing durations arrive as NaN (a float); the substring test below
    # would raise TypeError on anything that is not a string.
    if not isinstance(time_str, str):
        return 0

    if "On-Demand" in time_str:
        return 0  # or some default value like a specific number of hours if you prefer

    weeks = _duration_component("week", time_str)
    days = _duration_component("day", time_str)
    hours = _duration_component("hour", time_str)
    minutes = _duration_component("minute", time_str)

    # Convert everything to total hours
    total_hours = weeks * 7 * 24 + days * 24 + hours + minutes / 60
    return total_hours


In [15]:
# Import the necessary library for cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from tqdm import tqdm


# Source column name -> name used throughout the rest of the notebook.
# Only the columns listed here are kept.
map_cols = {
    "course_id": "_id",
    "course_name": "name",
    "description": "description",
    "duration": "duration",
    "cost": "pricing",
    "course_level": "level",
    "course_provider": "provider",
    "course_num_rating": "num_rating",
    "course_avg_rating": "avg_rating",
    "rating": "rating",
    "language": "language",
    "course_certificate": "certificate",
    "course_subject": "subject",
    "course_type": "type",
}

# Load the dataset, rename/select the mapped columns, keep English courses
# and add the parsed duration in hours.
# The new column is added with .assign(), which builds a new frame, instead of
# item assignment on a filtered slice (that raises SettingWithCopyWarning).
df = (
    pd.read_csv("./data/raw/course/courses_103190.csv")
    .rename(columns=map_cols)
    .loc[:, list(map_cols.values())]
    .loc[lambda frame: frame["language"] == "English"]
    .assign(duration_hrs=lambda frame: frame["duration"].apply(parse_time_string))
)
# Skills that courses are matched against. The order is significant: it is
# the column order of the similarity matrix built from this list.
skills = [
    # Foundations
    "Probability", "Statistics", "Linear Algebra", "Programming", "Machine Learning",
    # Natural language processing
    "NLP tasks", "Topic modelling", "Entity Extraction", "Summarization", "Sentiment analysis",
    # Computer vision
    "Object detection", "Image segmentation", "Image classification",
    # Cloud platforms and services
    "AWS", "Azure", "Google Cloud", "Cloud AI services", "Cloud AI tools",
    # Languages, libraries and operations
    "Python", "R", "TensorFlow", "PyTorch", "scikit-learn", "MLOps",
]

# One text per course: the name and the description joined by a tab.
# Descriptions are passed through str() first, so non-string cells become text.
courses = (df["name"] + "\t" + df["description"].map(str)).tolist()

# Sentence-embedding model shared by the skill and course texts
model = SentenceTransformer("paraphrase-MiniLM-L12-v2")

# Embed the skill labels, then the course texts (with a progress bar)
embedding_skills = model.encode(skills)
embedding_courses = model.encode(courses, show_progress_bar=True)

# Pairwise cosine similarity: one row per course, one column per skill
cosine_similarities = cosine_similarity(X=embedding_courses, Y=embedding_skills)

# Label the matrix with the course ids (rows) and the skill names (columns)
similarity_df = pd.DataFrame(
    data=cosine_similarities,
    index=df["_id"],
    columns=skills,
)



In [3]:
# Inspect the course-by-skill similarity matrix: rows are indexed by course
# `_id`, with one column of cosine similarities per skill.
similarity_df

Unnamed: 0_level_0,Probability,Statistics,Linear Algebra,Programming,Machine Learning,NLP tasks,Topic modelling,Entity Extraction,Summarization,Sentiment analysis,...,Azure,Google Cloud,Cloud AI services,Cloud AI tools,Python,R,TensorFlow,PyTorch,scikit-learn,MLOps
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16964,0.046243,0.052803,0.092629,0.238052,0.193410,0.142622,0.164363,0.168618,0.143743,0.123244,...,0.215597,0.253223,0.278057,0.325057,0.073801,0.191646,0.263904,0.030842,0.285992,0.285668
16965,0.078303,0.037344,-0.025802,0.231309,0.159278,0.196364,0.176942,0.218532,0.102102,0.103776,...,0.286106,0.282791,0.343709,0.329572,0.068170,0.204485,0.129056,-0.041775,0.251996,0.217260
96035,0.110474,0.118538,0.043969,0.184454,0.176149,0.135331,0.164845,0.134510,0.086169,0.027110,...,0.135067,0.179508,0.221086,0.181136,0.100894,0.060065,0.058832,-0.072400,0.134186,0.145416
14116,0.006468,0.065463,0.077223,0.104829,0.173692,0.125584,0.182990,0.124732,0.091239,0.082625,...,0.182179,0.232043,0.239296,0.273403,0.001521,0.124969,0.107553,0.010325,0.281036,0.202789
80344,0.076165,0.053955,0.100896,0.206857,0.234960,0.084314,0.090983,0.136181,0.038919,0.004611,...,0.073299,0.073588,0.106372,0.142606,0.139232,0.069151,0.154795,0.030704,0.277086,0.176920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64374,0.069609,0.244681,0.222178,0.138195,0.340931,0.197013,0.248119,0.149336,0.131680,0.029253,...,0.144465,0.009683,0.196372,0.221038,0.059853,0.051752,0.196292,0.112593,0.246672,0.175733
22552,0.071972,0.186388,0.134653,0.191668,0.296778,0.148465,0.321477,0.062988,0.122438,0.101862,...,-0.015546,-0.024055,0.141865,0.138414,0.031106,0.034493,0.171200,0.033214,0.261656,0.145449
22370,0.083758,0.137928,0.050027,0.246051,0.278765,0.206113,0.253445,0.136527,0.164330,-0.006870,...,0.034893,0.005900,0.112713,0.169742,-0.028461,0.084573,0.105854,0.110709,0.343326,0.185010
41095,-0.052356,0.144755,0.109839,0.200179,0.233660,0.216466,0.221133,0.054992,0.178452,0.152518,...,0.044125,0.092736,0.150042,0.221909,0.031404,0.083923,0.195880,0.064901,0.247196,0.110102


In [25]:
# For each skill, get the most similar courses
def get_top_courses_for_skill(similarity_df, skill, top_n=5):
    """Return the ``top_n`` best-scoring entries of one skill column.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Frame with one column of similarity scores per skill.
    skill : str
        Name of the column to rank.
    top_n : int, default 5
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        The ``skill`` column sorted from highest to lowest score and cut to
        its first ``top_n`` entries, keeping the original index labels.
    """
    ranked_scores = similarity_df[skill].sort_values(ascending=False)
    return ranked_scores.iloc[:top_n]

def get_courses_with_threshold_for_skill(similarity_df, skill, threshold):
    """Return the scores of one skill column that are strictly above a threshold.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Frame with one column of similarity scores per skill.
    skill : str
        Name of the column to filter.
    threshold : float
        Exclusive lower bound; entries equal to it are dropped.

    Returns
    -------
    pandas.Series
        The qualifying scores sorted from highest to lowest, keeping the
        original index labels. Empty when nothing exceeds the threshold.
    """
    skill_scores = similarity_df[skill]
    above_threshold = skill_scores[skill_scores > threshold]
    return above_threshold.sort_values(ascending=False)

# For every skill, collect the courses whose similarity to that skill is
# above 0.5, best match first (skill name -> Series of scores by course id).
top_courses_per_skill = {
    skill_name: get_courses_with_threshold_for_skill(similarity_df, skill_name, 0.5)
    for skill_name in skills
}


In [26]:
# Display the selected courses for each skill.
# Course metadata is indexed by id once, so every lookup in the loop is a
# direct index hit instead of a boolean scan over the whole frame. Keeping the
# first row per id matches taking the first match of such a scan.
course_info = df.drop_duplicates(subset="_id").set_index("_id")[["name", "duration_hrs"]]

for skill, top_courses in top_courses_per_skill.items():
    print(f"Top courses for skill '{skill}':")
    for course_id, similarity_score in top_courses.items():
        course_name = course_info.at[course_id, "name"]
        hrs = course_info.at[course_id, "duration_hrs"]
        # Only list courses longer than 1 hour and shorter than 60 * 240 hours
        if 1 < hrs < 60 * 240:
            print(f"{hrs}  - {course_name} (Similarity Score: {similarity_score:.4f})")
    print("\n")

# Optionally, save the similarity DataFrame to a CSV file for further analysis.
# The target directory is created first: to_csv does not create missing
# directories, and a failure here would come only after the embeddings were computed.
os.makedirs("./data/processed", exist_ok=True)
similarity_df.to_csv("./data/processed/course_skill_similarity.csv")

Top courses for skill 'Probability':


Top courses for skill 'Statistics':
12.0  - Statistics & Probability for Data Science (Similarity Score: 0.5094)


Top courses for skill 'Linear Algebra':
6.083333333333333  - Linear Algebra for Machine Learning and Generative AI (Similarity Score: 0.6416)
4.0  - Linear Algebra for Data Science in R (Similarity Score: 0.6118)
2016.0  - Applied Linear Algebra in AI and ML (Similarity Score: 0.5916)
2016.0  - Applied Linear Algebra for Signal Processing, Data Analytics and Machine Learning (Similarity Score: 0.5600)
672.0  - First Steps in Linear Algebra for Machine Learning (Similarity Score: 0.5565)
2.0  - Introduction to Linear Algebra with MATLAB (Similarity Score: 0.5433)
1.35  - Machine Learning Foundations: Linear Algebra (Similarity Score: 0.5071)


Top courses for skill 'Programming':
5.95  - The Self-Taught Programmer (Similarity Score: 0.5659)
20.0  - Break Away: Programming And Coding Interviews (Similarity Score: 0.5599)
1013.0  - Parad