Import required packages

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split

Read the file

In [None]:
file = pd.read_excel('shortcourses2566.xlsx')

Count the number of courses

In [None]:
course_counts = pd.DataFrame(file)['หลักสูตรอบรมระยะสั้น'].value_counts()
courses = pd.Series(course_counts.index)
courses = courses.sort_values().set_axis(range(0,len(courses)))

Create Series of users

In [None]:
s_name = file.loc[:, 'ชื่อ-นามสกุล (อังกฤษ)']
Users = pd.Series(s_name, name='User')
Users

Create Series of emails

In [None]:
s_email = file.loc[:, 'อีเมล'].fillna("")
Emails = pd.Series(s_email ,name='Email')
Emails

Provide a score to each user based on their email domain

In [None]:
email_score = []
for data in Emails:
    if data != '':
        if data.split('@')[1] == 'cmu.ac.th':
            email_score.append(2)
        else:
            email_score.append(1)
    else:
        email_score.append(0)
email_score = pd.Series(email_score, name='Score Email')

email_score

Email Score Statistic

In [None]:
zero_score_count = email_score.where(email_score == 0).count()
one_score_count = email_score.where(email_score == 1).count()
two_score_count = email_score.where(email_score == 2).count()

print("Number of students who fill cmu email:", two_score_count)
print("Number of students who fill other email:", one_score_count)
print("Number of students who do not fill email:", zero_score_count)

Create function to calculate age-education score

In [None]:
def getAgeEducationScore(age, limit_age):
    if age <= limit_age:
        score = 1
    elif limit_age == 0:
        score = 0
    else:
        score = 3
    return score

Create set of the educational range

In [None]:
set_nan = {'อื่นๆ (-)', np.nan}
set_primaryschool = {'ประถมศึกษา', 'อื่นๆ (ป.4)', 'อื่นๆ (ป.7)', 'อื่นๆ (ป7)'}
set_middleschool = {'มัธยมศึกษาตอนต้น', 'Secondary school', 'อื่นๆ (มศ.3)'}
set_highschool = {'มัธยมศึกษาตอนปลาย', 'High school', 'Vocational', 'การศึกษานอกระบบ', 
                  'ประกาศนียบัตรวิชาชีพ (ปวช.)', 'อื่นๆ (ม.ปลาย จบหลักสูตรEMR เป็นจนท.ปฏิบัติการ)',
                  'อื่นๆ (กำลังศึกษาชั้นมัธยมศึกษาตอนปลาย)', 'อื่นๆ (กำลังศึกษาชั้นมัธยมศึกษาปีที่6)', 
                  'อื่นๆ (มศ.5)'}
set_bachelor = {'ปริญญาตรี', 'Bachelor degree', 'Diploma', 'High Vocational', 
                'ประกาศนียบัตรวิชาชีพชั้นสูง (ปวส.)', 'อื่นๆ (กำลังศึกษาในระดับปริญญาตรี)', 
                'อื่นๆ (กำลังศึกษาปริญญาตรี สาขารังสีเทคนิค)', 'อื่นๆ (ปริญญาแพทยศาสตร์บัณฑิต)', 
                'อื่นๆ (นักศึกษาแพทย์ปี 5)', 'อื่นๆ (นักศึกษาแพทย์ มช ปี4 ศูนย์เชียงราย)', 
                'อื่นๆ (แพทยศาสตร์บัณฑิต)', 'อื่นๆ (แพทย์)', 'อื่นๆ (ประกาศณียบัตรผู้ช่วยพยาบาล)', 
                'อนุปริญญา', 'อื่นๆ (ป.ตรี)', 'อื่นๆ (ผู้ช่วยพยาบาล)'}
set_masterdocter = {'ปริญญาโท', 'ปริญญาเอก', "Master's degree", 'Other (OBGYN specalist lavel 1)', 
                    'Other (Residency)', 'Ph.D.', 'อื่นๆ (Internal Medicine)', 
                    'อื่นๆ (เฉพาะทาง)', 'อื่นๆ (วุฒิบัตร)', 'อื่นๆ (วว.ออร์โธปิดิกส์)', 
                    'อื่นๆ (วุฒิบัตรแสดงความรู้ความชำนาญในการประกอบวิชาชีพเภสัชกรรม สาขาเภสัชบำบัด)', 
                    'อื่นๆ (วุฒิบัตรผู้เชี่ยวชาญสาขาทันตกรรมทั่วไป)', 'อื่นๆ (วุฒิบัตรศัลยศาสตร์และแม็กซิลโลเฟเชียล)'}

list_degree = ((set_nan, 0), (set_primaryschool, 16), (set_middleschool, 19), 
               (set_highschool, 22), (set_bachelor,26), (set_masterdocter,40))

Create Series of Age-Education

In [None]:
ages = file.loc[:, 'อายุ']
educations = file.loc[:, 'วุฒิการศึกษา']
age_education_scores = []

for i,x in enumerate(educations):
    for y in list_degree:
        if x in y[0]:
            age_education_scores.append(getAgeEducationScore(ages[i], y[1]))
            
age_education_scores = pd.Series(age_education_scores, name='Age Education Score')

Age-Education Score Statistic

In [None]:
zero_score_count = age_education_scores.where(age_education_scores == 0).count()
one_score_count = age_education_scores.where(age_education_scores == 1).count()
three_score_count = age_education_scores.where(age_education_scores == 3).count()

print("Number of students who do not specify education:", zero_score_count)
print("Number of students who do not specify education:", one_score_count)
print("Number of students who do not specify education:", three_score_count)

Create Series of status

In [None]:
status = file.loc[:, 'สถานะ'].fillna("")
status = pd.Series(status ,name='Status')
status

Provide a score to each user based on their purchase status

In [None]:
status_score = []
for x in status:
    if x == 'ชำระเงิน':
        status_score.append(8)
    if x == 'ไม่ผ่านการอนุมัติ':
        status_score.append(7)
    if x == 'ค้างชำระ':
        status_score.append(5)
status_score = pd.Series(status_score)

Purchase Status Score Statistics

In [None]:
five_score_count = status_score.where(status_score == 5).count()
seven_score_count = status_score.where(status_score == 7).count()
eight_score_count = status_score.where(status_score == 8).count()

print("Number of students who are in arrears:", five_score_count)
print("Number of students whose payment was not approved:", seven_score_count)
print("Number of students with payment approval:", eight_score_count)

Create Series of address

In [None]:
address = file.loc[:, 'ที่อยู่'].fillna("")
address = pd.Series(address ,name='Status')
address

Provide a score to each user based on whether they provide address information or not

In [None]:
address_score = [ 1 if x == '' else 2 for x in address]
address_score = pd.Series(address_score)

Address Score Statistic

In [None]:
three_score_count = address_score.where(address_score == 1).count()
four_score_count = address_score.where(address_score == 2).count()

print("Number of students who did not fill address:", three_score_count)
print("Number of students who filled address:", four_score_count)

Convert list to pandas series

In [None]:
email_score = pd.Series(email_score)
age_education_scores = pd.Series(age_education_scores)
status_score = pd.Series(status_score)
address_score = pd.Series(address_score)

Create DataFrame by merging these 4 Series and calculate impressive level

In [None]:
d = {
    'Email Score': email_score,
    'Age Education Score': age_education_scores,
    'Payment Score': status_score,
    'Address Score': address_score,
    'Point': email_score + status_score + address_score + age_education_scores,
    'Impressive Level': ( email_score + status_score + address_score + age_education_scores ) / 17
}
df = pd.DataFrame(d)
df

Create user-course table

In [None]:
user = file.loc[:, 'ชื่อ-นามสกุล (อังกฤษ)']
course = file.loc[:, 'หลักสูตรอบรมระยะสั้น']
score = df['Impressive Level']
# all user, course, score have the same length
data = {
    'User': user,
    'Course': course,
    'Score': score,
}

predata = pd.DataFrame(data)
predata

Calculate sparsity and csr matrix

In [None]:
# Pivot table by rotating course
data = predata.pivot_table(index='Course', columns='User', values='Score').fillna(0)
data.head()

In [None]:
# Item-User Rated Matrix
data_mtx = csr_matrix(data.values)

Euclidean Distance & Cosine Similarity

In [None]:
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=int(np.around(math.sqrt(len(courses))))).fit(data_mtx)
model_knn = NearestNeighbors(metric='cosine').fit(data_mtx)
model_knn

In [None]:
def recommender_knn(course_name, n_recommendations):
    idx = process.extractOne(course_name, courses)[2]
    print('Selected movie:', courses[idx], 'Index:', idx)
    distances, indices = model_knn.kneighbors(data_mtx[idx], n_neighbors=n_recommendations, return_distance=True)
    recommendations = [courses[i].where(i!=idx) for i in indices]
    recommended_courses = recommendations[0][1:]
    course_distances = distances[0][1:]
    d = {
        'Course': recommended_courses,
        'Cosine Distance': course_distances
    }
    results = pd.DataFrame(data=d)
    return results

In [None]:
recommender_knn('การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnostic Radiology of Traumatic Emergency)', 100)

Predata for hybrid recommendation

In [None]:
def recommender_knn_all_courses(course_name):
    model_knn.fit(data_mtx)
    idx = process.extractOne(course_name, courses)[2]
    print('Selected movie:', courses[idx], 'Index:', idx)
    distances, indices = model_knn.kneighbors(data_mtx[idx], n_neighbors=len(courses))
    recommendations = [courses[i].where(i!=idx) for i in indices]
    recommended_courses = recommendations[0][1:]
    scores = 1 - distances
    course_distances = scores[0][1:]
    d = {
        'Course': recommended_courses,
        'Score': course_distances
    }
    results = pd.DataFrame(data=d)
    results = results.sort_index().rename_axis('Index')
    return results

In [None]:
recommender_knn_all_courses('การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnostic Radiology of Traumatic Emergency)')

In [213]:
def recommender_knn_by_user(user_name, n_recommendations):
    df = {
        'User': pd.Series(file['ชื่อ-นามสกุล (อังกฤษ)']),
        'Course': pd.Series(file['หลักสูตรอบรมระยะสั้น'])
    }
    
    user_course = pd.DataFrame(df)
    selected_user_name = user_course.loc[user_course['User'] == user_name]
    selected_courses = selected_user_name['Course']
    
    recommended_courses = [ recommender_knn_all_courses(x) for x in selected_courses]
    
    # pre dataframe
    df = pd.DataFrame({
        'Course': [],
        'Score': []
    }).rename_axis('Index')
    
    for x in recommended_courses:
        df = df._append(x)
    df =  df.sort_values('Score', ascending=False).drop_duplicates('Course')
    return df.head(n_recommendations)

Items show permanence whereas, people change with time
Items are fewer in numbers to deal with. Which leads to smaller similarity matrix. Amazon and Netflix use it!
Better for New users:
— Him selecting just one item will let us provide recommendations
— But for user based, new user has to wait until next build of similarity matrix (which is the only computational part of the framework)

In [214]:
recommender_knn_by_user('PORPHAING JANTIP', 10)

Selected movie: หลักการและพื้นฐานของเครื่องมือทางรังสีวิทยา (Basic Principle of Diagnostic Radiology Imaging Instruments) Index: 131
Selected movie: การวินิจฉัยภาวะฉุกเฉินที่ไม่ได้เกิดจากอุบัติเหตุ (Diagnostic Radiology of Non-Traumatic Emergency) Index: 43
Selected movie: การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnostic Radiology of Traumatic Emergency) Index: 42


Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
43,การวินิจฉัยภาวะฉุกเฉินที่ไม่ได้เกิดจากอุบัติเห...,0.754585
42,การวินิจฉัยภาวะฉุกเฉินจากอุบัติเหตุ (Diagnosti...,0.754585
152,เตรียมความพร้อมทางรังสีวิทยาสำหรับบุคลากรทางกา...,0.326572
131,หลักการและพื้นฐานของเครื่องมือทางรังสีวิทยา (B...,0.279851
110,รังสีวิทยาวินิจฉัย,0.258932
124,สารสนเทศทางสาธารณสุข 2566 (Public Health Infor...,0.06961
163,เวชศาสตร์ครอบครัวขั้นสูง,0.061968
41,การวิจัยแบบผสมผสานทางสุขภาพ (Mixed Methods Res...,0.032926
85,ความรู้พื้นฐานในการทำวิจัยเพื่อพัฒนางาน (Resea...,0.027183
117,วิทยาศาสตร์การแพทย์คลินิก สาขาวิชาเวชศาสตร์ฉุก...,0.026566
