# Dataset Merging & Cleaning Pipeline

This notebook merges two datasets:
1. **Udemy Courses** (`udemy_online_education_courses_dataset.csv`)
2. **Coursera Courses** (`coursera_course_dataset_v2_no_null.csv`)

The goal is to create a unified `merged_courses.csv` for the LearnMate AI++ Recommendation Engine.

In [None]:
import re
from urllib.parse import quote

import pandas as pd

def clean_text(text):
    """Normalize a value into lowercase text containing only ASCII letters, digits and whitespace.

    Args:
        text: The value to normalize. Anything that is not a ``str``
            (for example ``None`` or a float NaN) is treated as missing.

    Returns:
        The lowercased string with every character other than ``a-z``,
        ``0-9`` and whitespace removed (removed characters are deleted, not
        replaced by a space), or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # Lowercase first, then strip everything outside the allowed set.
        return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    return ""

## 1. Load Datasets

In [None]:
# Read both raw course catalogues from the current working directory.
try:
    udemy_df = pd.read_csv('udemy_online_education_courses_dataset.csv')
    coursera_df = pd.read_csv('coursera_course_dataset_v2_no_null.csv')
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    # Re-raise so execution stops at the real cause. Swallowing the error
    # would let "Run All" continue into later cells, which would then fail
    # with a NameError or silently reuse frames left over from an earlier run.
    raise
else:
    print("Datasets loaded successfully.")

## 2. Standardize Udemy Data

In [None]:
# Rename before selecting: `rename` ignores labels that are not present, so
# this cell can be re-run after `udemy_df` has already been standardized
# without raising a KeyError on 'course_title'.
udemy_df = udemy_df.rename(columns={'course_title': 'title'})
udemy_df = udemy_df[['title', 'url', 'is_paid', 'price', 'level', 'subject']].copy()
udemy_df['platform'] = 'Udemy'

# Combined text field built from title, level and subject. clean_text returns
# "" for non-string values, so missing entries contribute an empty token.
udemy_df['content'] = (
    udemy_df['title'].apply(clean_text)
    + " " + udemy_df['level'].apply(clean_text)
    + " " + udemy_df['subject'].apply(clean_text)
)

print(f"Udemy processed: {len(udemy_df)} rows")

## 3. Standardize Coursera Data

In [None]:
# Column mapping: Title -> title. Organization and Skills are only used to
# build the `content` text and are dropped by the final column selection.
coursera_df = (
    coursera_df[['Title', 'Organization', 'Skills', 'Ratings']]
    .rename(columns={'Title': 'title'})
)

# Construct fields that the Udemy frame has but this dataset does not.
# The search URL percent-encodes the whole title (safe='') so characters such
# as '&', '#', '+' or '?' cannot break or truncate the query string; spaces
# still become %20. str() guards against titles pandas parsed as non-strings.
coursera_df['url'] = coursera_df['title'].apply(
    lambda x: f"https://www.coursera.org/search?query={quote(str(x), safe='')}"
)
coursera_df['is_paid'] = 'Free/Paid'  # Coursera is mixed
coursera_df['price'] = 0.0  # Placeholder
coursera_df['level'] = 'All Levels'  # Default
coursera_df['platform'] = 'Coursera'

# Create content for TF-IDF from title, organization and skills.
coursera_df['content'] = (
    coursera_df['title'].apply(clean_text)
    + " " + coursera_df['Organization'].apply(clean_text)
    + " " + coursera_df['Skills'].apply(clean_text)
)

# Select final columns
coursera_df = coursera_df[['title', 'url', 'is_paid', 'price', 'level', 'platform', 'content']]

print(f"Coursera processed: {len(coursera_df)} rows")

## 4. Merge and Save

In [None]:
# Stack the two standardized frames row-wise; ignore_index renumbers the rows
# from 0 so the source frames' indices do not collide.
frames = [udemy_df, coursera_df]
merged_df = pd.concat(frames, ignore_index=True)

# Replace missing price / is_paid values with their defaults.
merged_df = merged_df.assign(
    price=merged_df['price'].fillna(0.0),
    is_paid=merged_df['is_paid'].fillna('Free'),
)

print(f"Total merged rows: {len(merged_df)}")

# Write the merged table to disk without the index column.
output_path = 'merged_courses.csv'
merged_df.to_csv(output_path, index=False)
print("Saved to merged_courses.csv")