### IMPORT LIBRARY

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
import string
import time
import re
import nltk
import torch
from tqdm import tqdm
import warnings 
warnings.filterwarnings("ignore", category=UserWarning)

### READ DATASET

In [2]:
# Test-case table; later cells read its 'Case' (query text) and 'No' (label) columns.
test_cases_file = '/kaggle/input/course-recommendation/test_case_course.csv'
test_cases_df = pd.read_csv(test_cases_file)
# Only the path is kept here: each pipeline's load_and_preprocess_course_data()
# reads and cleans the course CSV itself.
course_data_file = '/kaggle/input/course-recommendation/Online_Courses.csv'

# TF-IDF

In [18]:
def preprocess_text_simple(text):
    """Normalise free text before it is vectorised.

    Anything ``pd.isna`` reports as missing becomes an empty string.
    Otherwise the text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, runs of digits are removed, and
    leading/trailing whitespace is trimmed.
    """
    if pd.isna(text):
        return ""
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    return without_digits.strip()

def load_and_preprocess_course_data(file_path):
    """Load the course catalogue CSV and prepare it for TF-IDF matching.

    Steps, in order:
      1. Read ``file_path`` and drop the columns the recommender does not use
         (a missing column raises ``KeyError``).
      2. Strip surrounding whitespace from every string cell.
      3. Drop rows that duplicate an earlier ('Title', 'Short Intro') pair.
      4. Map the known non-English 'Category' labels to English.
      5. Build 'combined' (title + intro + skills + category + sub-category)
         and normalise it with ``preprocess_text_simple``.
      6. Convert 'Rating' and 'Number of viewers' to numbers; unparsable or
         missing values become 0.

    Returns:
        The cleaned DataFrame. Its index is NOT reset after de-duplication,
        so callers should address rows by position.
    """
    df = pd.read_csv(file_path)
    df = df.drop(columns=['Unnamed: 0', 'Program Type', 'Courses', 'Level', 'Number of Reviews',
           'Unique Projects', 'Prequisites', 'What you learn', 'Related Programs',
           'Monthly access', '6-Month access', '4-Month access', '3-Month access',
           '5-Month access', '2-Month access', 'School', 'Topics related to CRM',
           'ExpertTracks', 'FAQs', 'Course Title', 'Course URL',
           'Course Short Intro', 'Weekly study', 'Premium course',
           "What's include", 'Rank', 'Created by', 'Program', 'Number of ratings',
           'Price', 'COURSE CATEGORIES'])

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # Check the class, not the instance, so a column that happens to be called
    # 'map' cannot be mistaken for the method.
    strip_cell = lambda x: x.strip() if isinstance(x, str) else x
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(strip_cell)
    else:
        df = df.applymap(strip_cell)
    df = df.drop_duplicates(subset=['Title', 'Short Intro'])

    translations = {
        '计算机科学': 'Computer Science',
        'Ciencia de Datos': 'Data Science',
        'Negocios': 'Business',
        'Ciencias de la Computación': 'Computer Science',
        'Negócios': 'Business',
        'データサイエンス': 'Data Science',
        'Tecnologia da informação': 'Information Technology'
    }
    df['Category'] = df['Category'].replace(translations)

    # Remove the textual decoration so only the numeric part is left to parse.
    df['Rating'] = df['Rating'].str.replace('stars', '', regex=False)
    df['Number of viewers'] = df['Number of viewers'].str.replace(r'\D+', '', regex=True)

    # 'Title' is filled as well: NaN + str is NaN, which would otherwise blank
    # out the whole combined text of a row that merely lacks a title.
    df['combined'] = df['Title'].fillna('') + ' ' + df['Short Intro'].fillna('') + ' ' + df['Skills'].fillna('') + ' ' + df['Category'].fillna('') + ' ' + df['Sub-Category'].fillna('')
    df['combined'] = df['combined'].apply(preprocess_text_simple)

    df['Number of viewers'] = pd.to_numeric(df['Number of viewers'], errors='coerce').fillna(0)
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0)

    return df

def vectorize_text(df):
    """Fit a TF-IDF model (English stop words removed) on ``df['combined']``.

    Returns:
        ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the
        document-term matrix it produced for the same column.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf, tfidf.fit_transform(df['combined'])

def imdb_score(df, q=0.95):
    """Add an IMDB-style weighted rating column named 'score'.

    For each row with rating ``R`` and viewer count ``v``::

        score = (R * v + C * m) / (v + m)

    where ``m`` is the ``q``-quantile of 'Number of viewers' and ``C`` is the
    viewer-weighted mean rating, both computed over ``df`` itself.

    Degenerate inputs no longer produce NaN from 0/0:
      * if nobody viewed anything, ``C`` falls back to the plain mean rating
        (0.0 for an empty frame);
      * a row whose ``v + m`` is 0 receives ``C``.

    Args:
        df: frame with numeric 'Rating' and 'Number of viewers' columns.
        q: quantile of the viewer counts used as the prior weight ``m``.

    Returns:
        A copy of ``df`` with the extra 'score' column; ``df`` is not modified.
    """
    scored = df.copy()
    ratings = scored['Rating']
    viewers = scored['Number of viewers']

    m = viewers.quantile(q)
    total_viewers = viewers.sum()
    if total_viewers > 0:
        c = (ratings * viewers).sum() / total_viewers
    elif len(scored) > 0:
        c = ratings.mean()
    else:
        c = 0.0

    # Vectorised form of the original row-wise apply; zero denominators are
    # swapped for 1 during the division and then overwritten with the prior C.
    denominator = viewers + m
    has_weight = denominator != 0
    raw = (ratings * viewers + c * m) / denominator.where(has_weight, 1)
    scored["score"] = raw.where(has_weight, c)
    return scored

def recommend(user_input, df, vectorizer, tfidf_matrix,
              percentile_threshold=95, similarity_weight=0.5):
    """Rank courses for a free-text query using TF-IDF similarity plus popularity.

    Pipeline:
      1. Normalise the query and project it with the fitted ``vectorizer``.
      2. Keep the courses whose cosine similarity is at or above the
         ``percentile_threshold`` percentile of all similarities.
      3. Score the survivors with ``imdb_score``.
      4. Min-max scale similarity and score within the survivors and blend
         them: ``Final = w * similarity + (1 - w) * score``.

    Args:
        user_input: raw query text.
        df: course frame whose row order matches ``tfidf_matrix``.
        vectorizer: fitted TF-IDF vectorizer.
        tfidf_matrix: document-term matrix for ``df``.
        percentile_threshold: similarity percentile used as the cut-off
            (default 95, the previously hard-coded value).
        similarity_weight: ``w`` above (default 0.5, the previous value).

    Returns:
        DataFrame sorted by 'Final' descending, with 'cosine_similarity',
        'score' and 'Final' columns added. It is empty when the query shares
        no term with the catalogue; previously that case returned every
        course with NaN scores.
    """
    def _min_max(values):
        # A constant column has zero range; map it to 0.0 instead of 0/0 = NaN.
        low = values.min()
        span = values.max() - low
        if not span > 0:
            return pd.Series(0.0, index=values.index)
        return (values - low) / span

    user_input_processed = preprocess_text_simple(user_input)
    user_tfidf = vectorizer.transform([user_input_processed])

    cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # TF-IDF similarities are never negative, so max <= 0 means "no overlap".
    if cosine_similarities.size == 0 or cosine_similarities.max() <= 0:
        no_match = df.iloc[0:0].copy()
        for column in ('cosine_similarity', 'score', 'Final'):
            no_match[column] = np.array([], dtype=float)
        return no_match

    top_course_indices = cosine_similarities.argsort()[::-1]

    # df is addressed by position because its index is not guaranteed to be 0..n-1.
    recommendations = df.iloc[top_course_indices].copy()
    recommendations['cosine_similarity'] = cosine_similarities[top_course_indices]

    threshold_value = np.percentile(recommendations['cosine_similarity'], percentile_threshold)
    stage1 = recommendations[recommendations['cosine_similarity'] >= threshold_value]

    stage2 = imdb_score(stage1)

    stage2['score'] = _min_max(stage2['score'])
    stage2['cosine_similarity'] = _min_max(stage2['cosine_similarity'])

    stage2['Final'] = (similarity_weight * stage2['cosine_similarity']
                       + (1 - similarity_weight) * stage2['score'])
    stage2 = stage2.sort_values(by='Final', ascending=False)

    return stage2

def run_recommendation_for_test_cases(course_data_file, test_cases_df, num_cases):
    """Run and time the TF-IDF recommender on the first ``num_cases`` test cases.

    The catalogue is loaded and vectorised once (outside the timed region);
    only the ``recommend`` call for each case is timed. For every case the
    top 10 titles with their 'Final' score are printed, followed by the
    per-case and total durations.

    Args:
        course_data_file: path of the course CSV.
        test_cases_df: frame with 'Case' (query text) and 'No' (label) columns.
        num_cases: how many leading rows to run. Values larger than the table
            are clamped to its length instead of raising ``IndexError``.
    """
    df = load_and_preprocess_course_data(course_data_file)
    vectorizer, tfidf_matrix = vectorize_text(df)

    # Never index past the end of the test-case table.
    num_cases = max(0, min(num_cases, len(test_cases_df)))
    total_duration = 0.0

    for i in range(num_cases):
        user_input = test_cases_df['Case'].iloc[i]

        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        recommendations = recommend(user_input, df, vectorizer, tfidf_matrix)
        duration = time.perf_counter() - start_time
        total_duration += duration

        print(f"\nRecommendations for Test Case {test_cases_df['No'].iloc[i]}:")
        if recommendations is not None and not recommendations.empty:
            for idx, row in recommendations.head(10).iterrows():
                print(f"Title: {row['Title']} | Final Score: {row['Final']:.4f}")
        else:
            print("No relevant courses found.")

        print(f"Execution Time: {duration:.4f} seconds")

    print(f"\nTotal Duration for {num_cases} User Preferences: {total_duration:.4f} seconds")

In [19]:
# Time the TF-IDF pipeline on the first test case only. This 3-argument call
# matches the TF-IDF definition of run_recommendation_for_test_cases; the
# Word2Vec and BERT sections redefine that name with extra parameters.
print("Running recommendation for 1 user preference:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 1)

Running recommendation for 1 user preference:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Machine Learning Engineering for Production (MLOps) Specialization | Final Score: 0.7546
Title: Machine Learning Introduction for Everyone | Final Score: 0.7322
Title: Introduction to Machine Learning in Production | Final Score: 0.7240
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7037
Title: Data Engineering for Data Scientists | Final Score: 0.6709
Title: AI for Medicine Specialization | Final Score: 0.6696
Title: How to Become a Machine Learning Engineer | Final Score: 0.6428
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6369
Title: Deep Learning Specialization | Final Score: 0.6259
Title: Microsoft Azure Machine Learning | Final Score: 0.6237
Execution Time: 0.0203 seconds

Total Duration for 1 User Preferences: 0.0203 seconds


In [5]:
# Time the TF-IDF pipeline (3-argument variant) on the first 5 test cases.
print("\nRunning recommendation for 5 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 5)


Running recommendation for 5 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Machine Learning Engineering for Production (MLOps) Specialization | Final Score: 0.7546
Title: Machine Learning Introduction for Everyone | Final Score: 0.7322
Title: Introduction to Machine Learning in Production | Final Score: 0.7240
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7037
Title: Data Engineering for Data Scientists | Final Score: 0.6709
Title: AI for Medicine Specialization | Final Score: 0.6696
Title: How to Become a Machine Learning Engineer | Final Score: 0.6428
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6369
Title: Deep Learning Specialization | Final Score: 0.6259
Title: Microsoft Azure Machine Learning | Final Score: 0.6237
Execution Time: 0.0201 seconds

Recommendations for Test Case 2:
Title: Customer Experience Design for Customer Success: Ensuring Customer Success | Final Score: 0.7995
Title: Business and Marketing Strategies 

In [6]:
# Time the TF-IDF pipeline (3-argument variant) on the first 10 test cases.
print("\nRunning recommendation for 10 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 10)


Running recommendation for 10 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Machine Learning Engineering for Production (MLOps) Specialization | Final Score: 0.7546
Title: Machine Learning Introduction for Everyone | Final Score: 0.7322
Title: Introduction to Machine Learning in Production | Final Score: 0.7240
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7037
Title: Data Engineering for Data Scientists | Final Score: 0.6709
Title: AI for Medicine Specialization | Final Score: 0.6696
Title: How to Become a Machine Learning Engineer | Final Score: 0.6428
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6369
Title: Deep Learning Specialization | Final Score: 0.6259
Title: Microsoft Azure Machine Learning | Final Score: 0.6237
Execution Time: 0.0207 seconds

Recommendations for Test Case 2:
Title: Customer Experience Design for Customer Success: Ensuring Customer Success | Final Score: 0.7995
Title: Business and Marketing Strategies 

In [7]:
# Time the TF-IDF pipeline (3-argument variant) on the first 20 test cases.
print("\nRunning recommendation for 20 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 20)


Running recommendation for 20 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Machine Learning Engineering for Production (MLOps) Specialization | Final Score: 0.7546
Title: Machine Learning Introduction for Everyone | Final Score: 0.7322
Title: Introduction to Machine Learning in Production | Final Score: 0.7240
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7037
Title: Data Engineering for Data Scientists | Final Score: 0.6709
Title: AI for Medicine Specialization | Final Score: 0.6696
Title: How to Become a Machine Learning Engineer | Final Score: 0.6428
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6369
Title: Deep Learning Specialization | Final Score: 0.6259
Title: Microsoft Azure Machine Learning | Final Score: 0.6237
Execution Time: 0.0234 seconds

Recommendations for Test Case 2:
Title: Customer Experience Design for Customer Success: Ensuring Customer Success | Final Score: 0.7995
Title: Business and Marketing Strategies 

In [8]:
# Time the TF-IDF pipeline (3-argument variant) on the first 50 test cases.
print("\nRunning recommendation for 50 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 50)


Running recommendation for 50 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Machine Learning Engineering for Production (MLOps) Specialization | Final Score: 0.7546
Title: Machine Learning Introduction for Everyone | Final Score: 0.7322
Title: Introduction to Machine Learning in Production | Final Score: 0.7240
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7037
Title: Data Engineering for Data Scientists | Final Score: 0.6709
Title: AI for Medicine Specialization | Final Score: 0.6696
Title: How to Become a Machine Learning Engineer | Final Score: 0.6428
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6369
Title: Deep Learning Specialization | Final Score: 0.6259
Title: Microsoft Azure Machine Learning | Final Score: 0.6237
Execution Time: 0.0204 seconds

Recommendations for Test Case 2:
Title: Customer Experience Design for Customer Success: Ensuring Customer Success | Final Score: 0.7995
Title: Business and Marketing Strategies 

# WORD2VEC

In [33]:
def preprocess_text_simple(text):
    """Return a cleaned, lower-case version of ``text``.

    Missing input (per ``pd.isna``) yields ``""``. Punctuation from
    ``string.punctuation`` and digit runs are deleted, then the result is
    stripped of surrounding whitespace.
    """
    if pd.isna(text):
        return ""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return re.sub(r'\d+', '', text.lower().translate(strip_punctuation)).strip()

def load_and_preprocess_course_data(file_path):
    """Load the course catalogue CSV and prepare it for Word2Vec matching.

    Steps, in order:
      1. Read ``file_path`` and drop the columns the recommender does not use
         (a missing column raises ``KeyError``).
      2. Strip surrounding whitespace from every string cell.
      3. Drop rows that duplicate an earlier ('Title', 'Short Intro') pair.
      4. Map the known non-English 'Category' labels to English.
      5. Build 'combined' (title + intro + skills + category + sub-category)
         and normalise it with ``preprocess_text_simple``.
      6. Convert 'Rating' (float) and 'Number of viewers' (int) to numbers;
         unparsable or missing values become 0.
      7. Add 'Tokenized': ``word_tokenize`` applied to 'combined'.

    Returns:
        The cleaned DataFrame. Its index is NOT reset after de-duplication,
        so callers should address rows by position.
    """
    df = pd.read_csv(file_path)
    df = df.drop(columns=['Unnamed: 0','Program Type', 'Courses', 'Level', 'Number of Reviews',
           'Unique Projects', 'Prequisites', 'What you learn', 'Related Programs',
           'Monthly access', '6-Month access', '4-Month access', '3-Month access',
           '5-Month access', '2-Month access', 'School', 'Topics related to CRM',
           'ExpertTracks', 'FAQs', 'Course Title', 'Course URL',
           'Course Short Intro', 'Weekly study', 'Premium course',
           "What's include", 'Rank', 'Created by', 'Program', 'Number of ratings',
           'Price', 'COURSE CATEGORIES'])

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # Check the class, not the instance, so a column that happens to be called
    # 'map' cannot be mistaken for the method.
    strip_cell = lambda x: x.strip() if isinstance(x, str) else x
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(strip_cell)
    else:
        df = df.applymap(strip_cell)
    df = df.drop_duplicates(subset=['Title', 'Short Intro'])

    translations = {
        '计算机科学': 'Computer Science',
        'Ciencia de Datos': 'Data Science',
        'Negocios': 'Business',
        'Ciencias de la Computación': 'Computer Science',
        'Negócios': 'Business',
        'データサイエンス': 'Data Science',
        'Tecnologia da informação': 'Information Technology'
    }
    df['Category'] = df['Category'].replace(translations)

    # Remove the textual decoration so only the numeric part is left to parse.
    df['Rating'] = df['Rating'].str.replace('stars', '', regex=False)
    df['Number of viewers'] = df['Number of viewers'].str.replace(r'\D+', '', regex=True)

    # 'Title' is filled as well: NaN + str is NaN, which would otherwise blank
    # out the whole combined text of a row that merely lacks a title.
    df['combined'] = df['Title'].fillna('') + ' ' + df['Short Intro'].fillna('') + ' ' + df['Skills'].fillna('') + ' ' + df['Category'].fillna('') + ' ' + df['Sub-Category'].fillna('')
    df['combined'] = df['combined'].apply(preprocess_text_simple)

    df['Number of viewers'] = pd.to_numeric(df['Number of viewers'], errors='coerce').fillna(0).astype(int)
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0)

    # Token lists are the training sentences for Word2Vec.
    df['Tokenized'] = df['combined'].apply(word_tokenize)

    return df

def train_word2vec(df, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a gensim Word2Vec model on the token lists in ``df['Tokenized']``.

    The defaults reproduce the previously hard-coded configuration
    (``seed=1`` is gensim's own default), so existing calls are unaffected.

    Reproducibility note: per the gensim documentation, a fixed ``seed`` alone
    does not make training deterministic. Multi-threaded training introduces
    ordering jitter, so pass ``workers=1`` (and fix ``PYTHONHASHSEED``) when
    identical vectors are needed across runs. This is why the recommendation
    scores printed for the same test case can differ between kernel runs.

    Args:
        df: frame with a 'Tokenized' column of token lists.
        vector_size: embedding dimensionality.
        window: context window size.
        min_count: ignore tokens rarer than this.
        workers: number of training threads.
        seed: random seed forwarded to gensim.

    Returns:
        The trained ``Word2Vec`` model.
    """
    model = Word2Vec(
        sentences=df['Tokenized'].tolist(),
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )
    return model

def get_document_vector(doc, model):
    """Embed ``doc`` as the mean of its in-vocabulary word vectors.

    ``doc`` is tokenised with ``word_tokenize``; tokens missing from
    ``model.wv`` are skipped. If no token is known, a zero vector of length
    ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in word_tokenize(doc):
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def vectorize_text(df, model):
    """Embed every ``df['combined']`` text with ``get_document_vector``.

    Returns:
        A NumPy array with one document vector per row of ``df``, in row order.
    """
    vectors = []
    for text in df['combined']:
        vectors.append(get_document_vector(text, model))
    return np.array(vectors)

def imdb_score(df, q=0.95):
    """Add an IMDB-style weighted rating column named 'score'.

    For each row with rating ``R`` and viewer count ``v``::

        score = (R * v + C * m) / (v + m)

    where ``m`` is the ``q``-quantile of 'Number of viewers' and ``C`` is the
    viewer-weighted mean rating, both computed over ``df`` itself.

    Degenerate inputs no longer produce NaN from 0/0:
      * if nobody viewed anything, ``C`` falls back to the plain mean rating
        (0.0 for an empty frame);
      * a row whose ``v + m`` is 0 receives ``C``.

    Args:
        df: frame with numeric 'Rating' and 'Number of viewers' columns.
        q: quantile of the viewer counts used as the prior weight ``m``.

    Returns:
        A copy of ``df`` with the extra 'score' column; ``df`` is not modified.
    """
    scored = df.copy()
    ratings = scored['Rating']
    viewers = scored['Number of viewers']

    m = viewers.quantile(q)
    total_viewers = viewers.sum()
    if total_viewers > 0:
        c = (ratings * viewers).sum() / total_viewers
    elif len(scored) > 0:
        c = ratings.mean()
    else:
        c = 0.0

    # Vectorised form of the original row-wise apply; zero denominators are
    # swapped for 1 during the division and then overwritten with the prior C.
    denominator = viewers + m
    has_weight = denominator != 0
    raw = (ratings * viewers + c * m) / denominator.where(has_weight, 1)
    scored["score"] = raw.where(has_weight, c)
    return scored

def recommend_course(user_input, df, model, doc_vectors,
                     percentile_threshold=95, similarity_weight=0.5):
    """Rank courses for a free-text query using Word2Vec similarity plus popularity.

    Pipeline:
      1. Normalise the query and embed it with ``get_document_vector``.
      2. Keep the courses whose cosine similarity is at or above the
         ``percentile_threshold`` percentile of all similarities.
      3. Score the survivors with ``imdb_score``.
      4. Min-max scale similarity and score within the survivors and blend
         them: ``Final = w * similarity + (1 - w) * score``.

    Args:
        user_input: raw query text.
        df: course frame whose row order matches ``doc_vectors``.
        model: trained Word2Vec model.
        doc_vectors: one embedding per row of ``df``.
        percentile_threshold: similarity percentile used as the cut-off
            (default 95, the previously hard-coded value).
        similarity_weight: ``w`` above (default 0.5, the previous value).

    Returns:
        DataFrame sorted by 'Final' descending, with 'cosine_similarity',
        'score' and 'Final' columns added. It is empty when no query token is
        in the model vocabulary; previously that case returned every course
        with NaN scores.
    """
    def _min_max(values):
        # A constant column has zero range; map it to 0.0 instead of 0/0 = NaN.
        low = values.min()
        span = values.max() - low
        if not span > 0:
            return pd.Series(0.0, index=values.index)
        return (values - low) / span

    user_input_processed = preprocess_text_simple(user_input)
    user_vector = get_document_vector(user_input_processed, model)

    # get_document_vector returns all zeros when nothing in the query is known;
    # such a vector is equally (dis)similar to every course.
    if not np.any(user_vector):
        no_match = df.iloc[0:0].copy()
        for column in ('cosine_similarity', 'score', 'Final'):
            no_match[column] = np.array([], dtype=float)
        return no_match

    cosine_similarities = cosine_similarity([user_vector], doc_vectors).flatten()

    df_temp = df.copy()
    # Assigning an ndarray is positional, so df's non-contiguous index is fine.
    df_temp['cosine_similarity'] = cosine_similarities

    threshold_value = np.percentile(df_temp['cosine_similarity'], percentile_threshold)
    stage1 = df_temp[df_temp['cosine_similarity'] >= threshold_value]

    stage2 = imdb_score(stage1)
    stage2['score'] = _min_max(stage2['score'])
    stage2['cosine_similarity'] = _min_max(stage2['cosine_similarity'])

    stage2['Final'] = (similarity_weight * stage2['cosine_similarity']
                       + (1 - similarity_weight) * stage2['score'])
    stage2 = stage2.sort_values(by='Final', ascending=False)

    return stage2

def run_recommendation_for_test_cases(course_data_file, test_cases_df, num_cases, model, doc_vectors):
    """Run and time the Word2Vec recommender on the first ``num_cases`` test cases.

    The catalogue is reloaded from ``course_data_file`` (outside the timed
    region); only the ``recommend_course`` call for each case is timed. For
    every case the top 10 titles with their 'Final' score are printed,
    followed by the per-case and total durations.

    Args:
        course_data_file: path of the course CSV.
        test_cases_df: frame with 'Case' (query text) and 'No' (label) columns.
        num_cases: how many leading rows to run. Values larger than the table
            are clamped to its length instead of raising ``IndexError``.
        model: trained Word2Vec model.
        doc_vectors: embeddings previously computed for the same catalogue.

    Raises:
        ValueError: if ``doc_vectors`` does not have one row per course in the
            reloaded catalogue.
    """
    df = load_and_preprocess_course_data(course_data_file)

    # doc_vectors was built from an earlier load; fail early with a clear
    # message if it no longer lines up with this one.
    if len(df) != len(doc_vectors):
        raise ValueError(
            f"doc_vectors has {len(doc_vectors)} rows but the course data has {len(df)}"
        )

    # Never index past the end of the test-case table.
    num_cases = max(0, min(num_cases, len(test_cases_df)))
    total_duration = 0.0

    for i in range(num_cases):
        user_input = test_cases_df['Case'].iloc[i]

        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        recommendations = recommend_course(user_input, df, model, doc_vectors)
        duration = time.perf_counter() - start_time
        total_duration += duration

        print(f"\nRecommendations for Test Case {test_cases_df['No'].iloc[i]}:")
        if not recommendations.empty:
            for idx, row in recommendations.head(10).iterrows():
                print(f"Title: {row['Title']} | Final Score: {row['Final']:.4f}")
        else:
            print("No relevant courses found.")

        print(f"Execution Time: {duration:.4f} seconds")

    print(f"\nTotal Duration for {num_cases} User Preferences: {total_duration:.4f} seconds")

# Build the shared Word2Vec artifacts once: load the catalogue, train the model
# on its 'Tokenized' column, then embed every course's 'combined' text.
# NOTE: run_recommendation_for_test_cases() reloads the CSV itself, so
# doc_vectors must stay row-aligned with that reload (same file, same cleaning).
df = load_and_preprocess_course_data(course_data_file)
word2vec_model = train_word2vec(df)
doc_vectors = vectorize_text(df, word2vec_model)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [37]:
# Time the Word2Vec pipeline on the first test case only, reusing the model and
# document vectors built above (5-argument Word2Vec variant of the runner).
print("Running recommendation for 1 user preference:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 1, word2vec_model, doc_vectors)

Running recommendation for 1 user preference:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Convolutional Neural Networks | Final Score: 0.8508
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.7825
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.7331
Title: Introduction to Machine Learning in Production | Final Score: 0.7287
Title: Advanced Data Science Capstone | Final Score: 0.7127
Title: AI Product Management Specialization | Final Score: 0.6643
Title: Become a Natural Language Processing Expert | Final Score: 0.6604
Title: Optimize ML Models and Deploy Human-in-the-Loop Pipelines | Final Score: 0.6551
Title: Introduction to Artificial Intelligence and Machine Learning | Final Score: 0.6510
Title: Introduction to Artificial Intelligence (AI) | Final Score: 0.6311
Execution Time: 0.0313 seconds

Total Duration for 1 User Preferences: 0.0313 seconds


In [11]:
# Time the Word2Vec pipeline (5-argument variant) on the first 5 test cases.
print("\nRunning recommendation for 5 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 5, word2vec_model, doc_vectors)


Running recommendation for 5 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Convolutional Neural Networks | Final Score: 0.8515
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.8150
Title: Introduction to Machine Learning in Production | Final Score: 0.7985
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.7607
Title: Optimize ML Models and Deploy Human-in-the-Loop Pipelines | Final Score: 0.7223
Title: Advanced Data Science Capstone | Final Score: 0.7192
Title: Introduction to Artificial Intelligence and Machine Learning | Final Score: 0.6884
Title: AI Product Management Specialization | Final Score: 0.6872
Title: Ethical AI | Final Score: 0.6725
Title: Become a Natural Language Processing Expert | Final Score: 0.6647
Execution Time: 0.0312 seconds

Recommendations for Test Case 2:
Title: Mergers and Acquisitions  Specialization | Final Score: 0.7844
Title: Foundations of Marketing: How to Build a Modern Marketing Plan | Final Score: 

In [12]:
# Time the Word2Vec pipeline (5-argument variant) on the first 10 test cases.
print("\nRunning recommendation for 10 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 10, word2vec_model, doc_vectors)


Running recommendation for 10 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Convolutional Neural Networks | Final Score: 0.8515
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.8150
Title: Introduction to Machine Learning in Production | Final Score: 0.7985
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.7607
Title: Optimize ML Models and Deploy Human-in-the-Loop Pipelines | Final Score: 0.7223
Title: Advanced Data Science Capstone | Final Score: 0.7192
Title: Introduction to Artificial Intelligence and Machine Learning | Final Score: 0.6884
Title: AI Product Management Specialization | Final Score: 0.6872
Title: Ethical AI | Final Score: 0.6725
Title: Become a Natural Language Processing Expert | Final Score: 0.6647
Execution Time: 0.0316 seconds

Recommendations for Test Case 2:
Title: Mergers and Acquisitions  Specialization | Final Score: 0.7844
Title: Foundations of Marketing: How to Build a Modern Marketing Plan | Final Score: 

In [13]:
# Time the Word2Vec pipeline (5-argument variant) on the first 20 test cases.
print("\nRunning recommendation for 20 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 20, word2vec_model, doc_vectors)


Running recommendation for 20 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Convolutional Neural Networks | Final Score: 0.8515
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.8150
Title: Introduction to Machine Learning in Production | Final Score: 0.7985
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.7607
Title: Optimize ML Models and Deploy Human-in-the-Loop Pipelines | Final Score: 0.7223
Title: Advanced Data Science Capstone | Final Score: 0.7192
Title: Introduction to Artificial Intelligence and Machine Learning | Final Score: 0.6884
Title: AI Product Management Specialization | Final Score: 0.6872
Title: Ethical AI | Final Score: 0.6725
Title: Become a Natural Language Processing Expert | Final Score: 0.6647
Execution Time: 0.0305 seconds

Recommendations for Test Case 2:
Title: Mergers and Acquisitions  Specialization | Final Score: 0.7844
Title: Foundations of Marketing: How to Build a Modern Marketing Plan | Final Score: 

In [14]:
# Time the Word2Vec pipeline (5-argument variant) on the first 50 test cases.
print("\nRunning recommendation for 50 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 50, word2vec_model, doc_vectors)


Running recommendation for 50 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Convolutional Neural Networks | Final Score: 0.8515
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.8150
Title: Introduction to Machine Learning in Production | Final Score: 0.7985
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.7607
Title: Optimize ML Models and Deploy Human-in-the-Loop Pipelines | Final Score: 0.7223
Title: Advanced Data Science Capstone | Final Score: 0.7192
Title: Introduction to Artificial Intelligence and Machine Learning | Final Score: 0.6884
Title: AI Product Management Specialization | Final Score: 0.6872
Title: Ethical AI | Final Score: 0.6725
Title: Become a Natural Language Processing Expert | Final Score: 0.6647
Execution Time: 0.0326 seconds

Recommendations for Test Case 2:
Title: Mergers and Acquisitions  Specialization | Final Score: 0.7844
Title: Foundations of Marketing: How to Build a Modern Marketing Plan | Final Score: 

# BERT

In [27]:
# Module-level BERT tokenizer and encoder; get_bert_embedding() below reads
# these two globals directly. Both are loaded from the 'bert-base-uncased'
# checkpoint (the download progress appears in this cell's output).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def preprocess_text_simple(text):
    """Clean ``text`` for embedding; missing values become ``""``.

    The cleaning is: lower-case, delete ``string.punctuation`` characters,
    delete digit runs, trim surrounding whitespace.
    """
    if pd.isna(text):
        return ""
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = cleaned.strip()
    return cleaned

def load_and_preprocess_course_data(file_path):
    """Load the course catalogue CSV and prepare it for BERT matching.

    Steps, in order:
      1. Read ``file_path`` and drop the columns the recommender does not use
         (a missing column raises ``KeyError``).
      2. Strip surrounding whitespace from every string cell.
      3. Drop rows that duplicate an earlier ('Title', 'Short Intro') pair.
      4. Map the known non-English 'Category' labels to English.
      5. Build 'combined' (title + intro + skills + category + sub-category)
         and normalise it with ``preprocess_text_simple``.
      6. Convert 'Rating' (float) and 'Number of viewers' (int) to numbers;
         unparsable or missing values become 0.

    Returns:
        The cleaned DataFrame. Its index is NOT reset after de-duplication,
        so callers should address rows by position.
    """
    df = pd.read_csv(file_path)
    df = df.drop(columns=['Unnamed: 0', 'Program Type', 'Courses', 'Level', 'Number of Reviews',
           'Unique Projects', 'Prequisites', 'What you learn', 'Related Programs',
           'Monthly access', '6-Month access', '4-Month access', '3-Month access',
           '5-Month access', '2-Month access', 'School', 'Topics related to CRM',
           'ExpertTracks', 'FAQs', 'Course Title', 'Course URL',
           'Course Short Intro', 'Weekly study', 'Premium course',
           "What's include", 'Rank', 'Created by', 'Program', 'Number of ratings',
           'Price', 'COURSE CATEGORIES'])

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # Check the class, not the instance, so a column that happens to be called
    # 'map' cannot be mistaken for the method.
    strip_cell = lambda x: x.strip() if isinstance(x, str) else x
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(strip_cell)
    else:
        df = df.applymap(strip_cell)
    df = df.drop_duplicates(subset=['Title', 'Short Intro'])

    translations = {
        '计算机科学': 'Computer Science',
        'Ciencia de Datos': 'Data Science',
        'Negocios': 'Business',
        'Ciencias de la Computación': 'Computer Science',
        'Negócios': 'Business',
        'データサイエンス': 'Data Science',
        'Tecnologia da informação': 'Information Technology'
    }
    df['Category'] = df['Category'].replace(translations)

    # Remove the textual decoration so only the numeric part is left to parse.
    df['Rating'] = df['Rating'].str.replace('stars', '', regex=False)
    df['Number of viewers'] = df['Number of viewers'].str.replace(r'\D+', '', regex=True)

    # 'Title' is filled as well: NaN + str is NaN, which would otherwise blank
    # out the whole combined text of a row that merely lacks a title.
    df['combined'] = df['Title'].fillna('') + ' ' + df['Short Intro'].fillna('') + ' ' + df['Skills'].fillna('') + ' ' + df['Category'].fillna('') + ' ' + df['Sub-Category'].fillna('')
    df['combined'] = df['combined'].apply(preprocess_text_simple)

    df['Number of viewers'] = pd.to_numeric(df['Number of viewers'], errors='coerce').fillna(0).astype(int)
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0)

    return df

def get_bert_embedding(text):
    """Embed ``text`` as the attention-masked mean of BERT's last hidden states.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than 512
    tokens is truncated. Only positions whose attention mask is 1 contribute
    to the average, so padding never dilutes the embedding. For a single
    string the tokenizer adds no padding and the result matches a plain mean
    over tokens; for a list of strings the pooling is now correct per item.

    Returns:
        A NumPy array: shape ``(hidden_size,)`` for a single string, or
        ``(batch, hidden_size)`` for a list of several strings.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # (batch, seq, 1) mask broadcast over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against an all-zero mask
    return (token_sum / token_count).squeeze().cpu().numpy()

def vectorize_text(df):
    """Embed every ``df['combined']`` text with ``get_bert_embedding``.

    A tqdm progress bar labelled "Vectorizing documents" tracks the loop.

    Returns:
        A NumPy array with one embedding per row of ``df``, in row order.
    """
    progress = tqdm(df['combined'], desc="Vectorizing documents")
    return np.array([get_bert_embedding(text) for text in progress])

def imdb_score(df, q=0.95):
    """Add an IMDB-style weighted rating column named 'score'.

    For each row with rating ``R`` and viewer count ``v``::

        score = (R * v + C * m) / (v + m)

    where ``m`` is the ``q``-quantile of 'Number of viewers' and ``C`` is the
    viewer-weighted mean rating, both computed over ``df`` itself.

    Degenerate inputs no longer produce NaN from 0/0:
      * if nobody viewed anything, ``C`` falls back to the plain mean rating
        (0.0 for an empty frame);
      * a row whose ``v + m`` is 0 receives ``C``.

    Args:
        df: frame with numeric 'Rating' and 'Number of viewers' columns.
        q: quantile of the viewer counts used as the prior weight ``m``.

    Returns:
        A copy of ``df`` with the extra 'score' column; ``df`` is not modified.
    """
    scored = df.copy()
    ratings = scored['Rating']
    viewers = scored['Number of viewers']

    m = viewers.quantile(q)
    total_viewers = viewers.sum()
    if total_viewers > 0:
        c = (ratings * viewers).sum() / total_viewers
    elif len(scored) > 0:
        c = ratings.mean()
    else:
        c = 0.0

    # Vectorised form of the original row-wise apply; zero denominators are
    # swapped for 1 during the division and then overwritten with the prior C.
    denominator = viewers + m
    has_weight = denominator != 0
    raw = (ratings * viewers + c * m) / denominator.where(has_weight, 1)
    scored["score"] = raw.where(has_weight, c)
    return scored

def recommend_course(user_input, df, doc_vectors,
                     percentile_threshold=95, similarity_weight=0.5):
    """Rank courses for a free-text query using BERT similarity plus popularity.

    Pipeline:
      1. Normalise the query and embed it with ``get_bert_embedding``.
      2. Keep the courses whose cosine similarity is at or above the
         ``percentile_threshold`` percentile of all similarities.
      3. Score the survivors with ``imdb_score``.
      4. Min-max scale similarity and score within the survivors and blend
         them: ``Final = w * similarity + (1 - w) * score``.

    Args:
        user_input: raw query text.
        df: course frame whose row order matches ``doc_vectors``.
        doc_vectors: one embedding per row of ``df``.
        percentile_threshold: similarity percentile used as the cut-off
            (default 95, the previously hard-coded value).
        similarity_weight: ``w`` above (default 0.5, the previous value).

    Returns:
        DataFrame sorted by 'Final' descending, with 'cosine_similarity',
        'score' and 'Final' columns added.
    """
    def _min_max(values):
        # A constant column (e.g. a single surviving course) has zero range;
        # map it to 0.0 instead of 0/0 = NaN.
        low = values.min()
        span = values.max() - low
        if not span > 0:
            return pd.Series(0.0, index=values.index)
        return (values - low) / span

    user_input_processed = preprocess_text_simple(user_input)
    user_vector = get_bert_embedding(user_input_processed)

    cosine_similarities = cosine_similarity([user_vector], doc_vectors).flatten()

    df_temp = df.copy()
    # Assigning an ndarray is positional, so df's non-contiguous index is fine.
    df_temp['cosine_similarity'] = cosine_similarities

    threshold_value = np.percentile(df_temp['cosine_similarity'], percentile_threshold)
    stage1 = df_temp[df_temp['cosine_similarity'] >= threshold_value]

    stage2 = imdb_score(stage1)
    stage2['score'] = _min_max(stage2['score'])
    stage2['cosine_similarity'] = _min_max(stage2['cosine_similarity'])

    stage2['Final'] = (similarity_weight * stage2['cosine_similarity']
                       + (1 - similarity_weight) * stage2['score'])
    stage2 = stage2.sort_values(by='Final', ascending=False)

    return stage2

def run_recommendation_for_test_cases(course_data_file, test_cases_df, num_cases, doc_vectors):
    """Run and time the BERT recommender on the first ``num_cases`` test cases.

    The catalogue is reloaded from ``course_data_file`` (outside the timed
    region); only the ``recommend_course`` call for each case is timed. For
    every case the top 10 titles with their 'Final' score are printed,
    followed by the per-case and total durations.

    Args:
        course_data_file: path of the course CSV.
        test_cases_df: frame with 'Case' (query text) and 'No' (label) columns.
        num_cases: how many leading rows to run. Values larger than the table
            are clamped to its length instead of raising ``IndexError``.
        doc_vectors: embeddings previously computed for the same catalogue.

    Raises:
        ValueError: if ``doc_vectors`` does not have one row per course in the
            reloaded catalogue.
    """
    df = load_and_preprocess_course_data(course_data_file)

    # doc_vectors was built from an earlier load; fail early with a clear
    # message if it no longer lines up with this one.
    if len(df) != len(doc_vectors):
        raise ValueError(
            f"doc_vectors has {len(doc_vectors)} rows but the course data has {len(df)}"
        )

    # Never index past the end of the test-case table.
    num_cases = max(0, min(num_cases, len(test_cases_df)))
    total_duration = 0.0

    for i in range(num_cases):
        user_input = test_cases_df['Case'].iloc[i]

        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        recommendations = recommend_course(user_input, df, doc_vectors)
        duration = time.perf_counter() - start_time
        total_duration += duration

        print(f"\nRecommendations for Test Case {test_cases_df['No'].iloc[i]}:")
        if not recommendations.empty:
            for idx, row in recommendations.head(10).iterrows():
                print(f"Title: {row['Title']} | Final Score: {row['Final']:.4f}")
        else:
            print("No relevant courses found.")

        print(f"Execution Time: {duration:.4f} seconds")

    print(f"\nTotal Duration for {num_cases} User Preferences: {total_duration:.4f} seconds")

# Embed every course once up front so the timed runs below only embed the query.
# This is the expensive step: the progress bar in this cell's output shows
# 4988 documents taking 08:42.
# NOTE: run_recommendation_for_test_cases() reloads the CSV itself, so
# doc_vectors must stay row-aligned with that reload (same file, same cleaning).
df = load_and_preprocess_course_data(course_data_file)
doc_vectors = vectorize_text(df)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
Vectorizing documents: 100%|██████████| 4988/4988 [08:42<00:00,  9.55it/s]


In [28]:
# Time the BERT pipeline on the first test case only, reusing the precomputed
# document vectors (4-argument BERT variant of the runner).
print("Running recommendation for 1 user preference:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 1, doc_vectors)

Running recommendation for 1 user preference:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Applied Data Science for Data Analysts | Final Score: 0.7209
Title: Advanced Data Science Capstone | Final Score: 0.6842
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6810
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.6703
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.6677
Title: Introduction to Machine Learning in Production | Final Score: 0.6677
Title: Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning | Final Score: 0.6485
Title: Deploying Machine Learning Models in Production | Final Score: 0.6405
Title: Cloud Machine Learning Engineering and MLOps | Final Score: 0.6244
Title: Web Applications and Command-Line Tools for Data Engineering | Final Score: 0.6142
Execution Time: 0.1829 seconds

Total Duration for 1 User Preferences: 0.1829 seconds


In [29]:
# Time the BERT pipeline (4-argument variant) on the first 5 test cases.
print("\nRunning recommendation for 5 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 5, doc_vectors)


Running recommendation for 5 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Applied Data Science for Data Analysts | Final Score: 0.7209
Title: Advanced Data Science Capstone | Final Score: 0.6842
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6810
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.6703
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.6677
Title: Introduction to Machine Learning in Production | Final Score: 0.6677
Title: Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning | Final Score: 0.6485
Title: Deploying Machine Learning Models in Production | Final Score: 0.6405
Title: Cloud Machine Learning Engineering and MLOps | Final Score: 0.6244
Title: Web Applications and Command-Line Tools for Data Engineering | Final Score: 0.6142
Execution Time: 0.1532 seconds

Recommendations for Test Case 2:
Title: Corporate Finance I: Measuring and Promoting Value Cre

In [30]:
# Time the BERT pipeline (4-argument variant) on the first 10 test cases.
print("\nRunning recommendation for 10 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 10, doc_vectors)


Running recommendation for 10 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Applied Data Science for Data Analysts | Final Score: 0.7209
Title: Advanced Data Science Capstone | Final Score: 0.6842
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6810
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.6703
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.6677
Title: Introduction to Machine Learning in Production | Final Score: 0.6677
Title: Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning | Final Score: 0.6485
Title: Deploying Machine Learning Models in Production | Final Score: 0.6405
Title: Cloud Machine Learning Engineering and MLOps | Final Score: 0.6244
Title: Web Applications and Command-Line Tools for Data Engineering | Final Score: 0.6142
Execution Time: 0.1465 seconds

Recommendations for Test Case 2:
Title: Corporate Finance I: Measuring and Promoting Value Cre

In [31]:
# Time the BERT pipeline (4-argument variant) on the first 20 test cases.
print("\nRunning recommendation for 20 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 20, doc_vectors)


Running recommendation for 20 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Applied Data Science for Data Analysts | Final Score: 0.7209
Title: Advanced Data Science Capstone | Final Score: 0.6842
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6810
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.6703
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.6677
Title: Introduction to Machine Learning in Production | Final Score: 0.6677
Title: Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning | Final Score: 0.6485
Title: Deploying Machine Learning Models in Production | Final Score: 0.6405
Title: Cloud Machine Learning Engineering and MLOps | Final Score: 0.6244
Title: Web Applications and Command-Line Tools for Data Engineering | Final Score: 0.6142
Execution Time: 0.1484 seconds

Recommendations for Test Case 2:
Title: Corporate Finance I: Measuring and Promoting Value Cre

In [32]:
# Time the BERT pipeline (4-argument variant) on the first 50 test cases.
print("\nRunning recommendation for 50 user preferences:")
run_recommendation_for_test_cases(course_data_file, test_cases_df, 50, doc_vectors)


Running recommendation for 50 user preferences:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



Recommendations for Test Case 1:
Title: Applied Data Science for Data Analysts | Final Score: 0.7209
Title: Advanced Data Science Capstone | Final Score: 0.6842
Title: Machine Learning Foundations for Product Managers | Final Score: 0.6810
Title: Machine Learning Modeling Pipelines in Production | Final Score: 0.6703
Title: Improving Deep Neural Networks: Hyperparameter Tuning, Regularization and Optimization | Final Score: 0.6677
Title: Introduction to Machine Learning in Production | Final Score: 0.6677
Title: Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning | Final Score: 0.6485
Title: Deploying Machine Learning Models in Production | Final Score: 0.6405
Title: Cloud Machine Learning Engineering and MLOps | Final Score: 0.6244
Title: Web Applications and Command-Line Tools for Data Engineering | Final Score: 0.6142
Execution Time: 0.1459 seconds

Recommendations for Test Case 2:
Title: Corporate Finance I: Measuring and Promoting Value Cre