Import required packages

In [6]:
import os
import pickle
import re
import sys

import numpy as np
import pandas as pd
import translators as ts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Get filename from pickle

In [7]:
# Receive the user name from app.py
# NOTE(review): the value is hard-coded here; presumably app.py supplies the real
# one at run time — confirm.
user_input = 'James Hill'

# Get filename from pickle
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so this
# file must only ever come from a trusted source.
# NOTE(review): `filename` is not referenced again in the visible cells; the cell
# that reads the dataset uses a hard-coded path instead.
with open('./pickle/filename.pickle', 'rb') as f:
    filename = pickle.load(f)

Read the file

In [8]:
# Load the dataset into a DataFrame; later cells read its 'course' and 'username' columns.
# NOTE(review): the path is hard-coded even though `filename` was just loaded from
# './pickle/filename.pickle' — confirm which of the two is meant to be used.
file_name = './uploads/Sample Dataset.xlsx'
file = pd.read_excel(file_name)

Language convert function

In [9]:
def translate_eng(text):
    """Pass ``text`` to ``ts.translate_text`` with its default settings.

    Args:
        text (str): Text to translate.

    Returns:
        Whatever ``ts.translate_text`` returns for ``text`` (presumably the
        English translation — confirm the library's default target language).
    """
    translated = ts.translate_text(text)
    return translated

def is_english(text):
    """Report whether ``text`` contains at least one ASCII letter.

    A single ASCII alphabetic character is enough, so text that mixes ASCII
    letters with another script is also reported as English.

    Args:
        text (str): Text to inspect.

    Returns:
        bool: True if any character is both alphabetic and ASCII, else False
        (including for the empty string).
    """
    return any(ch.isalpha() and ch.isascii() for ch in text)

Clean title with a regular expression

In [10]:
def clean_title(title):
    """Remove every character that is not an ASCII letter, a digit or a space.

    Args:
        title (str): Title to clean.

    Returns:
        str: ``title`` with all other characters deleted; remaining characters
        keep their order and spaces are not collapsed.
    """
    disallowed = "[^a-zA-Z0-9 ]"
    cleaned = re.sub(disallowed, "", title)
    return cleaned

Create dataframe of courses

In [11]:
# Build the course list: one row per distinct title, sorted and re-indexed 0..n-1.
# Missing titles are filled with '' BEFORE de-duplicating so that NaN and ''
# collapse into a single entry instead of producing two empty titles.
content = file['course'].fillna('').drop_duplicates()
courses = content.sort_values().reset_index(drop=True)

# Flag the titles that contain at least one ASCII letter; the rest are treated as Thai.
# astype(bool) keeps the mask boolean even when `courses` is empty.
is_english_courses = courses.apply(is_english).astype(bool)
thai_courses_not_trans = courses[~is_english_courses]

# Translate the Thai titles into English
thai_courses = thai_courses_not_trans.apply(translate_eng)
english_courses = courses[is_english_courses]

# Combine the two series into a single series. pd.concat is the public API;
# Series._append is private and may change without notice.
combined_courses = pd.concat([thai_courses, english_courses])

# Strip everything except ASCII letters, digits and spaces, then restore the
# original row order so positions line up with `courses`.
# NOTE(review): `courses_clean` is not read by any later visible cell — the
# vectorizer is fitted on `courses` — confirm which series is intended.
courses_clean = combined_courses.apply(clean_title).sort_index()

In [12]:
# Number of distinct course titles; read by the recommender functions in later cells
number_of_courses = len(courses)

Create tfidf matrix

In [97]:
# Show the sorted, de-duplicated course titles that are fed to the vectorizer
print(courses)

0    Basic Computer Programming for ISNE
1                             Calculus 1
2         Data Structures and Algorithms
3                    Engineering Drawing
4                  Fundamental English 1
5                   Introduction to ISNE
Name: course, dtype: object


In [110]:
# Vectorizer with every TfidfVectorizer parameter left at its default
tfidf = TfidfVectorizer()

# Learn the vocabulary and compute the tf-idf weights of every course title
tfidf_matrix = tfidf.fit_transform(courses)

# One "term : idf" line per vocabulary entry
print('\nidf values:')
for term, idf_value in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    print(term, ':', idf_value)

# Mapping from each term to its column number in the matrix
print('\nWord indexes:')
print(tfidf.vocabulary_)

# Sparse view: "(row, column)  weight" for every non-zero entry
print('\ntf-idf value:')
print(tfidf_matrix)

# Dense view of the same matrix
print('\ntf-idf values in matrix form:')
print(tfidf_matrix.toarray())


idf values:
algorithms : 2.252762968495368
and : 2.252762968495368
basic : 2.252762968495368
calculus : 2.252762968495368
computer : 2.252762968495368
data : 2.252762968495368
drawing : 2.252762968495368
engineering : 2.252762968495368
english : 2.252762968495368
for : 2.252762968495368
fundamental : 2.252762968495368
introduction : 2.252762968495368
isne : 1.8472978603872037
programming : 2.252762968495368
structures : 2.252762968495368
to : 2.252762968495368

Word indexes:
{'basic': 2, 'computer': 4, 'programming': 13, 'for': 9, 'isne': 12, 'calculus': 3, 'data': 5, 'structures': 14, 'and': 1, 'algorithms': 0, 'engineering': 7, 'drawing': 6, 'fundamental': 10, 'english': 8, 'introduction': 11, 'to': 15}

tf-idf value:
  (0, 12)	0.37935894668725584
  (0, 9)	0.4626247911559474
  (0, 13)	0.4626247911559474
  (0, 4)	0.4626247911559474
  (0, 2)	0.4626247911559474
  (1, 3)	1.0
  (2, 0)	0.5
  (2, 1)	0.5
  (2, 14)	0.5
  (2, 5)	0.5
  (3, 6)	0.7071067811865476
  (3, 7)	0.7071067811865476
  (4

Create path

In [111]:
# Folder that receives the pickled artifacts written by the export cells.
# NOTE(review): this is an absolute, environment-specific path, whereas the
# filename pickle is read from the relative './pickle' folder — confirm that both
# are meant to point at the same directory.
folder_path = '/workspaces/recommendation-system/pickle'

# exist_ok=True makes the call idempotent and removes the check-then-create race
# of testing os.path.exists() first.
os.makedirs(folder_path, exist_ok=True)

Export variable

In [112]:
# Write the tf-idf matrix to <folder_path>/tfidf_matrix.pickle
file_path = os.path.join(folder_path, 'tfidf_matrix.pickle')
with open(file_path, 'wb') as pickle_out:
    pickle.dump(tfidf_matrix, pickle_out)

Create cosine similarities

In [114]:
# Pairwise dot products between all course vectors: entry [i][j] compares course i
# with course j. The tf-idf rows printed earlier have unit length, so the dot
# product is the cosine similarity (hence the 1.0 diagonal in the output).
cosine_similarities = linear_kernel(tfidf_matrix)
cosine_similarities

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.19029138],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.19029138, 0.        , 0.        , 0.        , 0.        ,
        1.        ]])

Create recommendation system function

In [12]:
def recommender_tfidf(course_name):
    """Return the courses most similar to ``course_name``, best match first.

    Reads the module-level ``courses`` Series and ``cosine_similarities``
    matrix; row/column ``i`` of the matrix is taken to describe
    ``courses.iloc[i]``.

    Args:
        course_name (str): Exact course title as it appears in ``courses``.

    Returns:
        pandas.DataFrame: Indexed by the position of each recommended course in
        ``courses`` (index name ``'Index'``) with the columns ``'Course'`` and
        ``'Cosine Similarity Score'``. Rows are ordered by descending score;
        courses whose score is 0 are left out, so the frame may be empty.

    Raises:
        IndexError: If ``course_name`` is not present in ``courses``.
    """
    # Locate the target by position, because the similarity matrix is positional
    matches = [pos for pos, name in enumerate(courses) if name == course_name]
    if not matches:
        raise IndexError(f"course {course_name!r} not found in courses")
    target_index = matches[0]

    # Exclude the target by its own index rather than by slicing off the
    # top-ranked entry: when another course ties with the target (identical
    # tf-idf vector) or the target's vector is all zeros, the target is not
    # guaranteed to sort first and would otherwise be recommended to itself.
    candidates = [
        (i, score)
        for i, score in enumerate(cosine_similarities[target_index])
        if i != target_index and score != 0
    ]

    # list.sort is stable, so equal scores keep ascending index order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    recommendations = pd.DataFrame(
        [(i, courses.iloc[i], score) for i, score in candidates],
        columns=['Index', 'Course', 'Cosine Similarity Score'],
    )
    return recommendations.set_index('Index')

In [13]:
def recommender_tfidf_all_courses(course_name):
    """Return the similarity of every other course to ``course_name``.

    Reads the module-level ``courses`` Series and ``cosine_similarities``
    matrix; row/column ``i`` of the matrix is taken to describe
    ``courses.iloc[i]``.

    Args:
        course_name (str): Exact course title as it appears in ``courses``.

    Returns:
        pandas.DataFrame: One row per course other than the target, indexed by
        the course's position in ``courses`` (index name ``'Index'``, ascending)
        with the columns ``'Course'`` and ``'Score'``. Courses with a score of 0
        are included.

    Raises:
        IndexError: If ``course_name`` is not present in ``courses``.
    """
    # Locate the target by position, because the similarity matrix is positional
    matches = [pos for pos, name in enumerate(courses) if name == course_name]
    if not matches:
        raise IndexError(f"course {course_name!r} not found in courses")
    target_index = matches[0]

    # Exclude the target by its own index rather than by dropping the top-ranked
    # entry: with tied scores (or an all-zero target vector) the target is not
    # guaranteed to rank first, and a different course would be dropped instead.
    # Rows are produced in ascending index order, which is the order returned.
    rows = [
        (i, courses.iloc[i], score)
        for i, score in enumerate(cosine_similarities[target_index])
        if i != target_index
    ]

    recommendations = pd.DataFrame(rows, columns=['Index', 'Course', 'Score'])
    return recommendations.set_index('Index')

In [14]:
def recommender_tfidf_by_user(user_name, n_recommendations=None):
    """Recommend courses for a user from the courses recorded for them in ``file``.

    For every course listed for ``user_name`` the similarity to all other
    courses is fetched with ``recommender_tfidf_all_courses``; the results are
    pooled, and each candidate course keeps its highest score.

    Args:
        user_name (str): Value to match against the ``'username'`` column.
        n_recommendations (int, optional): Maximum number of rows to return.
            Defaults to ``number_of_courses - 1``.

    Returns:
        pandas.DataFrame: Columns ``'Course'`` and ``'Score'``, index named
        ``'Index'``, ordered by descending score. Empty when the user has no
        courses.
    """
    # Honour the caller's value; fall back to "all other courses" only when omitted
    if n_recommendations is None:
        n_recommendations = number_of_courses - 1

    # Missing course cells cannot be looked up by name, and repeated courses
    # would only repeat the same lookup.
    selected_courses = (
        file.loc[file['username'] == user_name, 'course']
        .dropna()
        .drop_duplicates()
    )

    # NOTE(review): each lookup only leaves out its own query course, so other
    # courses the user already has can appear in the result — confirm intended.
    recommended_courses = [recommender_tfidf_all_courses(title) for title in selected_courses]

    # pd.concat rejects an empty list, so return the empty result frame directly
    if not recommended_courses:
        return pd.DataFrame({'Course': [], 'Score': []}).rename_axis('Index')

    # Concatenate once instead of growing a frame inside a loop
    pooled = pd.concat(recommended_courses)
    ranked = pooled.sort_values('Score', ascending=False).drop_duplicates('Course')
    return ranked.head(n_recommendations)

Export to web page

In [16]:
# print(recommender_tfidf_by_user(user_input).to_html(index=False))

References

https://practicaldatascience.co.uk/data-science/how-to-create-content-recommendations-using-tf-idf
https://lukkiddd.com/tf-idf-%E0%B8%84%E0%B8%B3%E0%B9%84%E0%B8%AB%E0%B8%99%E0%B8%AA%E0%B8%B3%E0%B8%84%E0%B8%B1%E0%B8%8D%E0%B8%99%E0%B8%B0-dd1e1568312e

Training Test Part

Define the similarity threshold above which a course is labelled as recommended

In [17]:
# Cut-off on the mean cosine similarity: get_label marks a course 'recommended'
# only when its mean score is strictly greater than this value.
threshold_value = 0.5  # Assume the similarity is symmetric

Calculate X and y

In [18]:
# Dense copy of the tf-idf matrix; pickled in a later cell as the feature matrix X
X = tfidf_matrix.toarray()

def get_mean(item):
    """Return the mean 'Cosine Similarity Score' of the recommendations for ``item``.

    Args:
        item (str): Course title passed on to ``recommender_tfidf``.

    Returns:
        The mean of the score column, or 0 when ``recommender_tfidf`` returns
        no rows.
    """
    scores = recommender_tfidf(item)['Cosine Similarity Score']
    return 0 if scores.empty else scores.mean()
    

def get_label(item):
    """Label ``item`` by comparing its mean similarity score with ``threshold_value``.

    Args:
        item (str): Course title passed on to ``get_mean``.

    Returns:
        str: ``'recommended'`` when the mean is strictly greater than
        ``threshold_value``, otherwise ``'not recommended'``.
    """
    is_recommended = get_mean(item) > threshold_value
    return 'recommended' if is_recommended else 'not recommended'
    
# One label per course, in the same order as `courses`
y = [ get_label(course) for course in courses ]

Export variable

In [19]:
# Write the feature matrix and the labels to <folder_path>/tfidf_X.pickle and
# <folder_path>/tfidf_y.pickle, in that order
for pickle_name, payload in (('tfidf_X.pickle', X), ('tfidf_y.pickle', y)):
    file_path = os.path.join(folder_path, pickle_name)
    with open(file_path, 'wb') as f:
        pickle.dump(payload, f)

Reference

https://www.datacamp.com/tutorial/naive-bayes-scikit-learn
https://chat.openai.com/share/a3144868-3e0d-4584-b443-b6c49efb9117