Import required packages

In [1]:
import pandas as pd
import re
import numpy as np
import spacy
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import translators as ts

Read the file

In [2]:
file = pd.read_excel('shortcourses2566.xlsx')

Language convert function

In [3]:
def translate_eng(text):
    return ts.translate_text(text)

def is_english(text):
    for char in text:
        if char.isalpha() and char.isascii():
            return True
    return False
            

Clean title with regular expresion

In [4]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

Create dataframe of courses

In [5]:
# Take the series of courses from dataset column
content = file['หลักสูตรอบรมระยะสั้น'].drop_duplicates().fillna('')
courses = content.sort_values().set_axis(range(0,len(content)))

# Check if the course is in Thai language or not
is_english_courses = courses.apply(is_english)
thai_courses_not_trans = courses[is_english_courses == False]

# Translate courses in a thai language to en english language
thai_courses = thai_courses_not_trans.apply(translate_eng)
english_courses = courses[is_english_courses == True]

# Combine 2 series into a single series
combined_courses = thai_courses._append(english_courses)

# Convert combined courses to be in form of regular expression
courses_clean = combined_courses.apply(clean_title).sort_index()

Create tfidf matrix

In [6]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(courses_clean)

Create cosine similarities

In [7]:
cosine_similarities = linear_kernel(tfidf_matrix)

Create recommendation system function

In [8]:
def recommender_tfidf(course_name, limit):
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.
    
    Args:
        courses (object): Pandas Series containing the text data. 
        column (string): Name of column used, i.e. 'title'. 
        course_name (string): Name of title to get recommendations for, i.e. 1982 Ferrari 308 GTSi For Sale by Auction
        cosine_similarities (array): Cosine similarities matrix from linear_kernel
        limit (int, optional): Optional limit on number of recommendations to return. 
        
    Returns: 
        Pandas dataframe. 
    """

    # Return indices for the target dataframe column and drop any duplicates
    indices = pd.Series(courses).drop_duplicates()
    
    # Get the index for the target course_name
    count = 0
    for name in indices:
        if name == course_name:
            break
        else:
            count = count + 1
    target_index = count

    # Get the cosine similarity scores for the target course_name
    cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))
    
    # Sort the cosine similarities in order of closest similarity
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Return tuple of the requested closest scores excluding the target item and index
    cosine_similarity_scores = cosine_similarity_scores[1:limit+1]
    
    # Extract the tuple course_names
    index = (x[0] for x in cosine_similarity_scores)
    scores = (x[1] for x in cosine_similarity_scores)
    
    # Get the indices for the closest items
    recommendation_indices = [i[0] for i in cosine_similarity_scores]
    
    # Get the actual recommendations
    recommendations = courses.iloc[recommendation_indices]
    
    # Return a recommendations
    recommendations = pd.DataFrame(tuple(zip(index, recommendations, scores)),
                      columns=['Index','Course', 'Cosine Similarity Score'])
    
    # Take index from column 'index'
    idx = recommendations['Index']
    
    # Set and sort index
    recommendations = recommendations.set_axis(idx).drop(columns='Index')
    return recommendations

Test recommendation system using TF-IDF

In [9]:
recommender_tfidf('Cancer Epidemiology and Prevention', 10)

Unnamed: 0_level_0,Course,Cosine Similarity Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
132,หลักวิทยาการระบาด (Principles of Epidemiology)...,0.372441
37,การป้องกันการล้มสำหรับผู้สูงอายุ,0.341441
33,การบริบาลทางเภสัชกรรม สาขาผู้ป่วยมะเร็ง (ปี 25...,0.258951
1,Coaching Skill : Crafting a New You สำหรับนักศ...,0.0
2,Coaching Skill : Crafting a New You สำหรับบุคล...,0.0
3,Development Studies & Social Research Speciali...,0.0
4,Digital Transformation: e-Document,0.0
5,Environmental Economics (USAC programs) - Fall...,0.0
6,Global Citizenship – โรงเรียนมงฟอร์ตวิทยาลัย ส...,0.0
7,Global Citizenship – โรงเรียนมงฟอร์ตวิทยาลัย ส...,0.0


Predata for hybrid recommendation

In [10]:
def recommender_tfidf_all_courses(course_name):
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.
    
    Args:
        courses (object): Pandas Series containing the text data. 
        column (string): Name of column used, i.e. 'title'. 
        course_name (string): Name of title to get recommendations for, i.e. 1982 Ferrari 308 GTSi For Sale by Auction
        cosine_similarities (array): Cosine similarities matrix from linear_kernel
        
    Returns: 
        Pandas dataframe. 
    """

    # Return indices for the target dataframe column and drop any duplicates
    indices = pd.Series(courses).drop_duplicates()

    # Get the index for the target course_name
    count = 0
    for name in indices:
        if name == course_name:
            break
        else:
            count = count + 1
    target_index = count

    # Get the cosine similarity scores for the target course_name
    cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))

    # Sort the cosine similarities in order of closest similarity
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)

    # Return tuple of the requested closest scores excluding the target item and index
    cosine_similarity_scores = cosine_similarity_scores[1:len(courses)]

    # Extract the tuple course_names
    index = (x[0] for x in cosine_similarity_scores)
    scores = (x[1] for x in cosine_similarity_scores)

    # Get the indices for the closest items
    recommendation_indices = [i[0] for i in cosine_similarity_scores]

    # Get the actual recommendations
    recommendations = courses.iloc[recommendation_indices]

    # Return a recommendations
    recommendations = pd.DataFrame(tuple(zip(index, recommendations, scores)),
                                   columns=['Index','Course', 'Score'])
    
    # Take index from column 'index'
    idx = recommendations['Index']
    
    # Set and sort index 
    recommendations = recommendations.set_axis(idx).drop(columns='Index').sort_index()
    return recommendations

In [11]:
recommender_tfidf_all_courses('การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnostic Radiology of Traumatic Emergency)')

Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Cancer Epidemiology and Prevention,0.0
1,Coaching Skill : Crafting a New You สำหรับนักศ...,0.0
2,Coaching Skill : Crafting a New You สำหรับบุคล...,0.0
3,Development Studies & Social Research Speciali...,0.0
4,Digital Transformation: e-Document,0.0
...,...,...
163,เวชศาสตร์ครอบครัวขั้นสูง,0.0
164,เวชศาสตร์ฟื้นฟูสำหรับแพทย์ฝึกอบรมเวชศาสตร์ฟื้น...,0.0
165,แปลงจุดแข็ง เป็นคุณค่า สร้างชีวิตสู่ความสุข (...,0.0
166,แปลงจุดแข็ง เป็นคุณค่า สร้างชีวิตสู่ความสุข (S...,0.0


In [12]:
def recommender_tfidf_by_user(user_name, n_recommendations):
    df = {
        'User': pd.Series(file['ชื่อ-นามสกุล (อังกฤษ)']),
        'Course': pd.Series(file['หลักสูตรอบรมระยะสั้น'])
    }

    user_course = pd.DataFrame(df)
    selected_user_name = user_course.loc[user_course['User'] == user_name]
    selected_courses = selected_user_name['Course']

    recommended_courses = [ recommender_tfidf_all_courses(x) for x in selected_courses]

    # pre dataframe
    df = pd.DataFrame({
        'Course': [],
        'Score': []
    }).rename_axis('Index')

    for x in recommended_courses:
        df = df._append(x)
    df =  df.sort_values('Score', ascending=False).drop_duplicates('Course')
    return df.head(n_recommendations)

In [13]:
recommender_tfidf_by_user('PORPHAING JANTIP', 10)

Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
43,การวินิจฉัยภาวะฉุกเฉินที่ไม่ได้เกิดจากอุบัติเห...,0.676301
42,การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnosti...,0.676301
110,รังสีวิทยาวินิจฉัย,0.655652
131,หลักการและพื้นฐานของเครื่องมือทางรังสีวิทยา (B...,0.348124
152,เตรียมความพร้อมทางรังสีวิทยาสำหรับบุคลากรทางกา...,0.277816
29,การขับร้องประสานเสียงขั้นพื้นฐาน,0.256033
117,วิทยาศาสตร์การแพทย์คลินิก สาขาวิชาเวชศาสตร์ฉุก...,0.231116
140,หลักเศรษฐศาสตร์สาธารณสุข (Principle of Health ...,0.226498
130,หลักการบริหารงานสาธารณสุข (Principle of Public...,0.196069
133,หลักสูตรย่อยที่ 1 พื้นฐานการดูแลสุขภาพสัตว์เลี...,0.183915


References

https://practicaldatascience.co.uk/data-science/how-to-create-content-recommendations-using-tf-idf
https://lukkiddd.com/tf-idf-%E0%B8%84%E0%B8%B3%E0%B9%84%E0%B8%AB%E0%B8%99%E0%B8%AA%E0%B8%B3%E0%B8%84%E0%B8%B1%E0%B8%8D%E0%B8%99%E0%B8%B0-dd1e1568312e

Training Test Part

Generate label in each course

In [14]:
def generate_labels(matrix, threshold):
    num_courses = len(matrix)
    labels = np.zeros((num_courses, num_courses), dtype=int)
    
    for i in range(num_courses):
        for j in range(i+1, num_courses):
            similarity = matrix[i][j]
            if similarity >= threshold:
                labels[i][j] = 1
                labels[j][i] = 1
    return labels

In [15]:
threshold_value = 0.2  # Assume the similarity is symmetric
labels = generate_labels(cosine_similarities, threshold_value)
print("Labels:")
print(labels)

Labels:
[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
def extract_similar_items(item_list, labels, number_of_courses):
    similar_items = []
    
    for i in range(number_of_courses):
        arr = []
        for j, label in enumerate(labels[i]):
            if label == 1:
                arr.append(item_list[j])
        similar_items.append(pd.Series(arr))

    return pd.Series(similar_items)

Prepare for data

In [17]:
number_of_courses = len(labels)
similar_items = extract_similar_items(courses, labels, number_of_courses)

en = spacy.load('en_core_web_lg')  # Load the English model
stopwords = en.Defaults.stop_words  # Get the default stop words
courses_stopwords = courses_clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
courses_stopwords

0                         Cancer Epidemiology Prevention
1                        Coaching Skill Crafting New You
2                        Coaching Skill Crafting New You
3      Development Studies Social Research Specializa...
4                       Digital Transformation eDocument
                             ...                        
163                             Advanced Family Medicine
164    Rehabilitation Rehabilitation Training Chiang ...
165      Strengths Spotting Turn value bring happiness 1
166      Strengths Spotting Turn value bring happiness 2
167                                          Skills4Life
Name: หลักสูตรอบรมระยะสั้น, Length: 168, dtype: object

Show the statistics

In [18]:
example_idx = 16  # change index of similar_item to observe similar items
example = similar_items[example_idx]
considered_course = courses[example_idx]

if len(example) == 1:
    print(f"Similar item to {considered_course} is:")
    print(example)
else:
    print(f"Similar items to {considered_course} are:")
    print(example)

Similar items to Political Science and Public Administration for International Students (USAC programs) – Fall Semester 2023 are:
0     Environmental Economics (USAC programs) - Fall...
1     Political Science and Public Administration fo...
2     Sociology and Anthropology of Northern Thailan...
3     Sociology and Anthropology of Northern Thailan...
4     Sustainable Agriculture Development (USAC prog...
5     Sustainable Agriculture Development (USAC prog...
6     Thai Cultural Education for International Stud...
7     Thai Cultural Education for International Stud...
8     Thai Cultural Education for International Stud...
9     Thai Cultural Education for International Stud...
10    World Economic Issues (USAC program) – Spring ...
11    สารสนเทศทางสาธารณสุข 2566 (Public Health Infor...
12    หลักการบริหารงานสาธารณสุข (Principle of Public...
13              เทคโนโลยีและการค้าระหว่างประเทศ ปี 2566
dtype: object


Functions for label calculation

In [19]:
def most_weighted_word(labels):
    # Convert document and labels into a regular expression form
    labels = labels.str.lower()
    
    # TF-IDF Vectorization using the fit transform of all courses
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=labels)
    tfidf_matrix = vectorizer.fit_transform(courses_stopwords)
    
    # Sum up TF-IDF scores for each label across all documents
    total_tfidf_scores = np.sum(tfidf_matrix, axis=0)
    
    # Get the most weighted label index
    most_weighted_label_index = np.argmax(total_tfidf_scores)
    
    # Get the most weighted label
    most_weighted_label = labels[most_weighted_label_index]
    
    return most_weighted_label

def find_common_words(documents):
    # Step 1: Tokenize the items
    tokenized_items = [pd.Series(item.split()) for item in documents]

    # Step 2: Create lists of words for each item
    flattened_series = pd.Series(pd.concat(tokenized_items, ignore_index=True, names='word'))
    flattened_series = flattened_series[~flattened_series.str.isnumeric()]  # Remove numbers
    
    # Step 3: Find common words
    words_count = Counter(flattened_series)

    # Step 4: Get the keys for dictionary's max values
    max_words = [key for key, value in words_count.items() if value == max(words_count.values())]  
    max_words = pd.Series(max_words)
    
    # Step 5: Take the most weighted word
    result = most_weighted_word(max_words)
    return result

In [20]:
# Append the course to their similar courses
item_list = []
for idx in range(number_of_courses):
    # Assuming similar_items and courses are predefined lists
    df = pd.Series(courses_stopwords[idx])._append(similar_items[idx])

    # Reindex the DataFrame with consecutive integers starting from 1
    df = df.reset_index(drop=True)
    
    # Rename the column properly
    df = df.rename('similar courses')
    
    # Apply the clean title
    df = df.apply(clean_title)
    
    # Find the most weighted word from similar courses
    most_similar_courses = find_common_words(df)
    
    # Append the modified DataFrame to item_list
    item_list.append(most_similar_courses)

item_list = pd.Series(item_list, name='all similar courses')

So far, I'm able to calculate the label in a selected course. The next step is to take a list of labels that are calculated from all the courses.

https://www.datacamp.com/tutorial/naive-bayes-scikit-learn
https://chat.openai.com/share/a3144868-3e0d-4584-b443-b6c49efb9117