# Import libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import numpy as np

# Read datasets

In [2]:
i_data = pd.read_excel('dataset/i_data.xlsx')
ui_data = pd.read_excel('dataset/ui_data.xlsx')

# Evaluation

In [3]:
from src.isne_recommendation.TfidfLinearKernel import get_recommendations

In [4]:
# Assuming your dataset is stored in a pandas DataFrame called 'ui_data'
# and 'username' is the column containing names of users, 'course' is the column containing course name

# Calculate the number of courses each user has taken
courses_per_user = ui_data.groupby('username')['course'].count().reset_index()
courses_per_user.columns = ['username', 'course_count']

# Merge the courses_per_user DataFrame back to the original DataFrame
merged_user_courses = ui_data.merge(courses_per_user, on='username')

# Sort the DataFrame by username and course to ensure consistent train-test split
user_courses = merged_user_courses.sort_values(by=['username', 'course'])

# Initialize a counter variable to keep track of the number of courses for each user
course_counter = 1

# Create a list to store the split information (True for training, False for testing)
split_list = []

# Iterate through each row to determine the split
for index, row in user_courses.iterrows():
    if course_counter < row['course_count']:
        split_list.append(True)  # Training data
    else:
        split_list.append(False)  # Testing data
        course_counter = 1  # Reset the counter for the next user
    course_counter += 1

# Add the split information to the DataFrame
user_courses['split'] = split_list

# Split the DataFrame into training and testing sets based on the 'split' column
train_ui_data = user_courses[user_courses['split']]
test_ui_data = user_courses[~user_courses['split']]

# Drop the auxiliary columns used for splitting
train_ui_data = train_ui_data.drop(['course_count', 'split'], axis=1)
test_ui_data = test_ui_data.drop(['course_count', 'split'], axis=1)

# Now train_ui_data contains rows where users learned n - 1 courses for training,
# and test_ui_data contains rows where users learned course n for testing

# Recall @K

In [6]:
recommendations = get_recommendations('Martha Long', i_data, train_ui_data, 10)
recommendations

Unnamed: 0,Course,Score
5,Build Basic Generative Adversarial Networks (G...,1.0
54,Non-Equilibrium Applications of Statistical Th...,1.0
56,Physics of silicon solar cells,0.210851
52,Medical Applications of Particle Accelerators ...,0.185854
31,Foundations of Public Health Practice: Behavio...,0.175814
62,Python and Machine Learning for Asset Management,0.17054
3,Biomedical Visualisation,0.169872
43,Introduction to International Criminal Law,0.163321
57,Political Governance and Public Policy in Russia,0.154583
21,Disease Screening in Public Health,0.152959
