# Import libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import numpy as np

# Read datasets

In [2]:
i_data = pd.read_excel('dataset/i_data.xlsx')
ui_data = pd.read_excel('dataset/ui_data.xlsx')

# Evaluation

In [36]:
# Assuming your dataset is stored in a pandas DataFrame called 'ui_data'
# and 'username' is the column containing names of users, 'course' is the column containing course name

# The names of users who have taken more than one course
usernames = ui_data['username'].value_counts()
usernames = usernames[usernames > 1].index

# Drop courses that have been taken more than 1 time by the same user
ui_data_drop = ui_data.drop_duplicates(subset=['username', 'course'])

# Calculate the number of courses each user has taken
courses_per_user = ui_data_drop.groupby('username')['course'].count().reset_index()
courses_per_user.columns = ['username', 'course_count']

# Merge the courses_per_user DataFrame back to the original DataFrame
merged_user_courses = ui_data_drop.merge(courses_per_user, on='username')

# Sort the DataFrame by username and course to ensure consistent train-test split
user_courses = merged_user_courses.sort_values(by=['username', 'course'])

# Initialize a counter variable to keep track of the number of courses for each user
course_counter = 1

# Create a list to store the split information (True for training, False for testing)
split_list = []

# Iterate through each row to determine the split
for index, row in user_courses.iterrows():
    if course_counter < row['course_count']:
        split_list.append(True)  # Training data
    else:
        split_list.append(False)  # Testing data
        course_counter = 1  # Reset the counter for the next user
    course_counter += 1

# Add the split information to the DataFrame
user_courses['split'] = split_list

# Split the DataFrame into training and testing sets based on the 'split' column
train_ui_data = user_courses[user_courses['split']]
test_ui_data = user_courses[~user_courses['split']]

# Drop the auxiliary columns used for splitting
train_ui_data = train_ui_data.drop(['course_count', 'split'], axis=1)
test_ui_data = test_ui_data.drop(['course_count', 'split'], axis=1)

train_ui_data = train_ui_data.set_axis(range(len(train_ui_data)))
test_ui_data = test_ui_data.set_axis(range(len(test_ui_data)))

# Now train_ui_data contains rows where users learned n - 1 courses for training,
# and test_ui_data contains rows where users learned course n for testing

### TF-IDF and Linear Kernel Performance

In [64]:
from src.isne_recommendation.TfidfLinearKernel import get_recommendations

In [65]:
hit = []
for name in usernames:
    predictions = get_recommendations(name, i_data, train_ui_data, 10)
    if type(predictions) is str:
        isHit = False
        hit.append(isHit)
    else:
        predictions = predictions['Course'].tolist()
        results = test_ui_data[test_ui_data['username'] == name]['course'].iloc[0]
        isHit = results in predictions
        hit.append(isHit)

In [67]:
hits = np.count_nonzero(hit)
accuracy = hits / len(usernames)
accuracy

0.07083333333333333

### Feature Ratings and KNN Performance

In [61]:
from src.isne_recommendation.FeatureRatingsKNN import get_recommendations

In [17]:
get_recommendations('Martha Long', ui_data, 10)

Unnamed: 0,Course,Cosine Distance,Score
0,Addiction Treatment: Clinical Skills for Healt...,0.670731,0.329269


In [62]:
hit = []
for name in usernames:
    predictions = get_recommendations(name, train_ui_data, 10)
    if type(predictions) is str:
        isHit = False
        hit.append(isHit)
    else:
        predictions = predictions['Course'].tolist()
        results = test_ui_data[test_ui_data['username'] == name]['course'].iloc[0]
        isHit = results in predictions
        hit.append(isHit)

In [63]:
hits = np.count_nonzero(hit)
accuracy = hits / len(usernames)
accuracy

0.21770833333333334

### Hybrid

In [59]:
from src.isne_recommendation.Hybrid import get_recommendations

In [19]:
get_recommendations('Billy Carter', i_data, ui_data, 10)

Unnamed: 0_level_0,Course,Score
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
54,Non-Equilibrium Applications of Statistical Th...,0.655798
5,Build Basic Generative Adversarial Networks (G...,0.246585
1,Addiction Treatment: Clinical Skills for Healt...,0.170167
15,Culminating Project in Health Informatics,0.072155
8,COVID-19 - A clinical update,0.067086
0,A Crash Course in Data Science,0.048736
63,Qualitative Research Methods,0.042523
79,Write A Feature Length Screenplay For Film Or ...,0.036554
2,Agile Meets Design Thinking,0.027061
68,Social Media Management,0.024238


In [60]:
hit = []
for name in usernames:
    predictions = get_recommendations(name, i_data, train_ui_data, 10)['Course'].tolist()
    results = test_ui_data[test_ui_data['username'] == name]['course'].iloc[0]
    isHit = results in predictions
    hit.append(isHit)

In [57]:
hits = np.count_nonzero(hit)
accuracy = hits / len(usernames)
accuracy

0.275

### Results 

- **The first approach**: _TF-IDF_ and _Linear Kernel_ Performance has an accuracy score of 7.08 percentage
- **The second approach**: _Feature Ratings_ and _KNN Performance_ has an accuracy score of 21.77 percentage
- **The third approach**: _Hybrid_ has an accuracy score of 27.5 percentage