Import required packages

In [107]:
import pandas as pd
import re
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from googletrans import Translator

Read the file

In [3]:
file = pd.read_excel('shortcourses2566.xlsx')

Language convert function

In [704]:
import translators as ts

def translate_eng(text):
    return ts.translate_text(text)

def is_english(text):
    for char in text:
        if char.isalpha() and char.isascii():
            return True
    return False
            

Clean title with regular expresion

In [5]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

Create dataframe of courses

In [745]:
# Take the series of courses from dataset column
content = file['หลักสูตรอบรมระยะสั้น'].drop_duplicates().fillna('')
courses = content.sort_values().set_axis(range(0,len(content)))

# Check if the course is in Thai language or not
is_english_courses = courses.apply(is_english)
thai_courses_not_trans = courses[is_english_courses == False]

# Translate courses in a thai language to en english language
thai_courses = thai_courses_not_trans.apply(translate_eng)
english_courses = courses[is_english_courses == True]

# Combine 2 series into a single series
combined_courses = thai_courses._append(english_courses)
combined_courses.sort_index()

# Convert combined courses to be in form of regular expression
courses_clean = combined_courses.apply(clean_title)

Create tfidf matrix

In [198]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(courses_clean)

Create cosine similarities

In [8]:
cosine_similarities = linear_kernel(tfidf_matrix)

Create recommendation system function

In [91]:
def recommender_tfidf(course_name, limit):
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.
    
    Args:
        courses (object): Pandas Series containing the text data. 
        column (string): Name of column used, i.e. 'title'. 
        course_name (string): Name of title to get recommendations for, i.e. 1982 Ferrari 308 GTSi For Sale by Auction
        cosine_similarities (array): Cosine similarities matrix from linear_kernel
        limit (int, optional): Optional limit on number of recommendations to return. 
        
    Returns: 
        Pandas dataframe. 
    """

    # Return indices for the target dataframe column and drop any duplicates
    indices = pd.Series(courses).drop_duplicates()
    
    # Get the index for the target course_name
    count = 0
    for name in indices:
        if name == course_name:
            break
        else:
            count = count + 1
    target_index = count

    # Get the cosine similarity scores for the target course_name
    cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))
    
    # Sort the cosine similarities in order of closest similarity
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Return tuple of the requested closest scores excluding the target item and index
    cosine_similarity_scores = cosine_similarity_scores[1:limit+1]
    
    # Extract the tuple course_names
    index = (x[0] for x in cosine_similarity_scores)
    scores = (x[1] for x in cosine_similarity_scores)
    
    # Get the indices for the closest items
    recommendation_indices = [i[0] for i in cosine_similarity_scores]
    
    # Get the actual recommendations
    recommendations = courses.iloc[recommendation_indices]
    
    # Return a recommendations
    recommendations = pd.DataFrame(tuple(zip(index, recommendations, scores)),
                      columns=['Index','Course', 'Cosine Similarity Score'])
    
    # Take index from column 'index'
    idx = recommendations['Index']
    
    # Set and sort index
    recommendations = recommendations.set_axis(idx).drop(columns='Index')
    return recommendations

Test recommendation system using TF-IDF

In [92]:
recommender_tfidf('Cancer Epidemiology and Prevention', 10)

Unnamed: 0_level_0,Course,Cosine Similarity Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
132,หลักวิทยาการระบาด (Principles of Epidemiology)...,0.355453
1,Coaching Skill : Crafting a New You สำหรับนักศ...,0.0
2,Coaching Skill : Crafting a New You สำหรับบุคล...,0.0
3,Development Studies & Social Research Speciali...,0.0
4,Digital Transformation: e-Document,0.0
5,Environmental Economics (USAC programs) - Fall...,0.0
6,Global Citizenship – โรงเรียนมงฟอร์ตวิทยาลัย ส...,0.0
7,Global Citizenship – โรงเรียนมงฟอร์ตวิทยาลัย ส...,0.0
8,Global Citizenship – โรงเรียนสันป่าตองวิทยาคม ...,0.0
9,Global Citizenship – โรงเรียนแม่ริมวิทยาคม 2566,0.0


Predata for hybrid recommendation

In [12]:
def recommender_tfidf_all_courses(course_name):
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.
    
    Args:
        courses (object): Pandas Series containing the text data. 
        column (string): Name of column used, i.e. 'title'. 
        course_name (string): Name of title to get recommendations for, i.e. 1982 Ferrari 308 GTSi For Sale by Auction
        cosine_similarities (array): Cosine similarities matrix from linear_kernel
        
    Returns: 
        Pandas dataframe. 
    """

    # Return indices for the target dataframe column and drop any duplicates
    indices = pd.Series(courses).drop_duplicates()

    # Get the index for the target course_name
    count = 0
    for name in indices:
        if name == course_name:
            break
        else:
            count = count + 1
    target_index = count

    # Get the cosine similarity scores for the target course_name
    cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))

    # Sort the cosine similarities in order of closest similarity
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)

    # Return tuple of the requested closest scores excluding the target item and index
    cosine_similarity_scores = cosine_similarity_scores[1:len(courses)]

    # Extract the tuple course_names
    index = (x[0] for x in cosine_similarity_scores)
    scores = (x[1] for x in cosine_similarity_scores)

    # Get the indices for the closest items
    recommendation_indices = [i[0] for i in cosine_similarity_scores]

    # Get the actual recommendations
    recommendations = courses.iloc[recommendation_indices]

    # Return a recommendations
    recommendations = pd.DataFrame(tuple(zip(index, recommendations, scores)),
                                   columns=['Index','Course', 'Score'])
    
    # Take index from column 'index'
    idx = recommendations['Index']
    
    # Set and sort index 
    recommendations = recommendations.set_axis(idx).drop(columns='Index').sort_index()
    return recommendations

In [13]:
recommender_tfidf_all_courses('การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnostic Radiology of Traumatic Emergency)')

Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Cancer Epidemiology and Prevention,0.0
1,Coaching Skill : Crafting a New You สำหรับนักศ...,0.0
2,Coaching Skill : Crafting a New You สำหรับบุคล...,0.0
3,Development Studies & Social Research Speciali...,0.0
4,Digital Transformation: e-Document,0.0
...,...,...
163,เวชศาสตร์ครอบครัวขั้นสูง,0.0
164,เวชศาสตร์ฟื้นฟูสำหรับแพทย์ฝึกอบรมเวชศาสตร์ฟื้น...,0.0
165,แปลงจุดแข็ง เป็นคุณค่า สร้างชีวิตสู่ความสุข (...,0.0
166,แปลงจุดแข็ง เป็นคุณค่า สร้างชีวิตสู่ความสุข (S...,0.0


In [14]:
def recommender_tfidf_by_user(user_name, n_recommendations):
    df = {
        'User': pd.Series(file['ชื่อ-นามสกุล (อังกฤษ)']),
        'Course': pd.Series(file['หลักสูตรอบรมระยะสั้น'])
    }

    user_course = pd.DataFrame(df)
    selected_user_name = user_course.loc[user_course['User'] == user_name]
    selected_courses = selected_user_name['Course']

    recommended_courses = [ recommender_tfidf_all_courses(x) for x in selected_courses]

    # pre dataframe
    df = pd.DataFrame({
        'Course': [],
        'Score': []
    }).rename_axis('Index')

    for x in recommended_courses:
        df = df._append(x)
    df =  df.sort_values('Score', ascending=False).drop_duplicates('Course')
    return df.head(n_recommendations)

In [15]:
recommender_tfidf_by_user('PORPHAING JANTIP', 10)

Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
43,การวินิจฉัยภาวะฉุกเฉินที่ไม่ได้เกิดจากอุบัติเห...,0.697794
42,การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnosti...,0.697794
131,หลักการและพื้นฐานของเครื่องมือทางรังสีวิทยา (B...,0.360553
152,เตรียมความพร้อมทางรังสีวิทยาสำหรับบุคลากรทางกา...,0.249083
140,หลักเศรษฐศาสตร์สาธารณสุข (Principle of Health ...,0.221962
130,หลักการบริหารงานสาธารณสุข (Principle of Public...,0.193087
133,หลักสูตรย่อยที่ 1 พื้นฐานการดูแลสุขภาพสัตว์เลี...,0.191831
32,การตรวจวิเคราะห์พื้นฐานทางพิษวิทยาและการตรวจวิ...,0.180715
154,เทคโนโลยีก๊าซชีวภาพ : หลักการการออกแบบ และการใ...,0.157094
161,เรียนรู้การคิดเชิงออกแบบ (Design Thinking) : ท...,0.0


References

https://practicaldatascience.co.uk/data-science/how-to-create-content-recommendations-using-tf-idf
https://lukkiddd.com/tf-idf-%E0%B8%84%E0%B8%B3%E0%B9%84%E0%B8%AB%E0%B8%99%E0%B8%AA%E0%B8%B3%E0%B8%84%E0%B8%B1%E0%B8%8D%E0%B8%99%E0%B8%B0-dd1e1568312e

Training Test Part

Generate label in each course

In [16]:
def generate_labels(matrix, threshold):
    num_courses = len(matrix)
    labels = np.zeros((num_courses, num_courses), dtype=int)
    
    for i in range(num_courses):
        for j in range(i+1, num_courses):
            similarity = matrix[i][j]
            if similarity >= threshold:
                labels[i][j] = 1
                labels[j][i] = 1
    return labels

In [17]:
threshold_value = 0.2  # Assume the similarity is symmetric
labels = generate_labels(cosine_similarities, threshold_value)
print("Labels:")
print(labels)

Labels:
[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


In [18]:
def extract_similar_items(item_list, labels, number_of_courses):
    similar_items = []
    
    for i in range(number_of_courses):
        arr = []
        for j, label in enumerate(labels[i]):
            if label == 1:
                arr.append(item_list[j])
        similar_items.append(pd.Series(arr))

    return pd.Series(similar_items)

In [19]:
number_of_courses = len(labels)

similar_items = extract_similar_items(courses, labels, number_of_courses)

# Show the statistics
example_idx = 0  # change index of similar_item to observe similar items
example = similar_items[example_idx]
considered_course = courses[example_idx]

if len(example) == 1:
    print(f"Similar item to {considered_course} is:")
    print(example)
else:
    print(f"Similar items to {considered_course} are:")
    print(example)

Similar item to Cancer Epidemiology and Prevention is:
0    หลักวิทยาการระบาด (Principles of Epidemiology)...
dtype: object


Functions for label calculation

In [522]:
def most_weighted_word(labels):
    # Convert document and labels into a regular expression form
    labels = labels.str.lower()
    
    # TF-IDF Vectorization using the fit transform of all courses
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=labels)
    tfidf_matrix = vectorizer.fit_transform(courses_clean)
    
    # Sum up TF-IDF scores for each label across all documents
    total_tfidf_scores = np.sum(tfidf_matrix, axis=0)
    
    # Get the most weighted label index
    most_weighted_label_index = np.argmax(total_tfidf_scores)
    
    # Get the most weighted label
    most_weighted_label = labels[most_weighted_label_index]
    
    return most_weighted_label

def find_common_words(documents):
    # Step 1: Tokenize the items
    tokenized_items = [pd.Series(item.split()) for item in documents]

    # Step 2: Create lists of words for each item
    flattened_series = pd.concat(tokenized_items, ignore_index=True, names='word')
    flattened_series = [ element for element in flattened_series if element.isnumeric() == False] # Remove number element
    
    # Step 3: Find common words
    words_count = Counter(flattened_series)

    # Step 4: Get the keys for dictionary's max values
    max_words = [key for key, value in words_count.items() if value == max(words_count.values())]  
    max_words = pd.Series(max_words)
    
    # Step 5: Take the most weighted word
    if len(max_words) is 0:
        return documents[0]
    # result = most_weighted_word(max_words)
    # 
    # return result

  if len(max_words) is 0:


In [523]:
# Append the course to their similar courses 
item_list = []
for idx in range(number_of_courses):
    # Assuming similar_items and courses are predefined lists
    df = similar_items[idx]._append(pd.Series(courses[idx]))

    # Reindex the DataFrame with consecutive integers starting from 1
    df = df.reset_index(drop=True)
    
    # Rename the column properly
    df = df.rename('similar courses')
    
    # Apply the clean title
    df = df.apply(clean_title)
    
    # Find the most weighted word from similar courses
    most_similar_courses = find_common_words(df)
    
    # Append the modified DataFrame to item_list
    item_list.append(most_similar_courses)

item_list
# item_list = pd.Series(item_list, name='all similar courses')

# # Find duplicated word(s) from similar courses
# label = find_common_words(item_list)
# 
# print("duplicated words across different items:")
# label

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '',
 None,
 None,
 None,
 '   2567  2',
 '   2567  3',
 '   2567  3',
 None,
 '',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '  1',
 '  2',
 '',
 None,
 None,
 None,
 '  10',
 '  11',
 '  12',
 '  13',
 '  14',
 '  2',
 '  3',
 '  4',
 '  5',
 '  6',
 '  7',
 '  8',
 '  9',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '',
 '',
 '',
 '  ',
 '',
 None,
 None,
 None,
 None,
 None,
 '',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '',
 None,
 None,
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 None,
 None,
 ' ',
 None,
 None,
 None,
 None,
 '  1',
 '  2',
 '  3',
 '  4',
 '  5',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 Non

So far, I'm able to calculate the label in a selected course. The next step is to take a list of labels that are calculated from all the courses.

https://www.datacamp.com/tutorial/naive-bayes-scikit-learn
https://chat.openai.com/share/a3144868-3e0d-4584-b443-b6c49efb9117