In [1]:
import re
from datetime import timedelta

def _amount_for_unit(time_str, unit):
    """Return the quantity stated for ``unit`` in ``time_str``, or 0.0 if absent.

    ``unit`` is the singular unit name (e.g. ``"hour"``); an optional trailing
    "s" is accepted. A range such as "4-6 hours" yields its midpoint (5.0), and
    the midpoint is kept fractional ("1-2 hours" -> 1.5).
    """
    match = re.search(rf"(\d+)-?(\d+)?\s*{unit}s?", time_str)
    if match is None:
        return 0.0

    low, high = match.group(1), match.group(2)
    if high:
        # True division: floor division would silently drop the half unit of
        # odd-width ranges before the value is converted to hours.
        return (int(low) + int(high)) / 2
    return float(low)


def parse_time_string(time_str):
    """Convert a free-text course duration into a total number of hours.

    The first "<n> weeks", "<n> days", "<n> hours" and "<n> minutes" mentions
    found in the text are each read (a range "<a>-<b>" counts as its midpoint)
    and summed, with a week counted as 7 * 24 hours and a day as 24 hours.

    Parameters
    ----------
    time_str : str
        Duration text, e.g. "1 hour 30 minutes" or "4-6 hours".

    Returns
    -------
    float
        Total hours. 0.0 is returned when the text contains "On-Demand", when
        no recognised unit is present, or when ``time_str`` is not a string
        (for example the NaN pandas uses for a missing cell).

    Notes
    -----
    NOTE(review): components are added, so "6 weeks, 4-6 hours a week" counts
    six full calendar weeks plus five hours rather than 6 * 5 hours of effort.
    Confirm this is the intended meaning for the dataset.
    """
    # Missing values and self-paced courses have no parseable duration.
    if not isinstance(time_str, str) or "On-Demand" in time_str:
        return 0.0

    weeks = _amount_for_unit(time_str, "week")
    days = _amount_for_unit(time_str, "day")
    hours = _amount_for_unit(time_str, "hour")
    minutes = _amount_for_unit(time_str, "minute")

    # Convert everything to total hours
    return weeks * 7 * 24 + days * 24 + hours + minutes / 60


In [3]:
# Import pandas for loading and cleaning the course dataset
import pandas as pd


# Source column name -> name used in the rest of the notebook.
# Only the columns listed here are kept, in this order.
map_cols = {
    "course_id": "_id",
    "course_name": "name",
    "description": "description",
    "duration": "duration",
    "cost": "pricing",
    "course_level": "level",
    "course_provider": "provider",
    "course_num_rating": "num_rating",
    "course_avg_rating": "avg_rating",
    "rating": "rating",
    "language": "language",
    "course_certificate": "certificate",
    "course_subject": "subject",
    "course_type": "type",
}

# Load, rename, select, filter and enrich in one pipeline so that `df` is
# bound exactly once to the finished frame.
df = (
    pd.read_csv("/home/sigmoid/developer/ml_projects/personalcoursebuilder/data/raw/course/courses_103190.csv")
    .rename(columns=map_cols)
    .loc[:, list(map_cols.values())]
    # Keep English-language courses only (original row index is preserved).
    .loc[lambda frame: frame["language"] == "English"]
    # Numeric duration in hours derived from the free-text "duration" column.
    .assign(duration_hrs=lambda frame: frame["duration"].apply(parse_time_string))
)

In [5]:
# Summarize each column's dtype and non-null count for the filtered course frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67399 entries, 0 to 103030
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           67399 non-null  int64  
 1   name          67399 non-null  object 
 2   description   67399 non-null  object 
 3   duration      67399 non-null  object 
 4   pricing       66031 non-null  object 
 5   level         15368 non-null  object 
 6   provider      67399 non-null  object 
 7   num_rating    67399 non-null  int64  
 8   avg_rating    67399 non-null  float64
 9   rating        67399 non-null  object 
 10  language      67399 non-null  object 
 11  certificate   67399 non-null  int64  
 12  subject       67399 non-null  object 
 13  type          67399 non-null  object 
 14  duration_hrs  67399 non-null  float64
dtypes: float64(2), int64(3), object(10)
memory usage: 8.2+ MB


In [11]:
# List the distinct values of the "provider" column.
df["provider"].unique()

array(['edX', 'OpenLearn', 'Swayam', 'Cybrary', 'Udacity', 'freeCodeCamp',
       'Udemy', 'Coursera', 'YouTube', 'Pluralsight', 'LinkedIn Learning',
       'FutureLearn', 'Independent', 'MIT OpenCourseWare',
       'Saylor Academy', 'Skillshare', 'Cisco Networking Academy',
       'Trailhead', 'egghead.io', 'AWS Skill Builder', 'Microsoft Learn',
       'Frontend Masters', 'Study.com', 'MasterClass',
       'California Community Colleges System', 'Codecademy', 'Kadenze',
       'openSAP', 'Cognitive Class', 'Edureka', 'Treehouse', 'Scrimba',
       'Laracasts', 'The Odin Project', 'openHPI', 'Open2Study',
       'iversity', 'Polimi OPEN KNOWLEDGE', 'OpenLearning', 'Kaggle',
       'Exercism', 'HubSpot Academy', 'Domestika',
       'Google Cloud Skills Boost', 'Canvas Network', 'Zero To Mastery',
       'CreativeLive', 'DataCamp', 'Test Automation University',
       'SymfonyCasts', 'The Great Courses Plus', 'TryHackMe',
       'PentesterAcademy', 'ThaiMOOC', 'Wolfram U', 'A Cloud Guru