## Model  and Model Eval

### Feature Engineering

In [1]:
import pandas as pd
import numpy as np

Load the modified datasets again

In [2]:
userdata = pd.read_csv('C:/Users/Avani/Documents/MScProj/Data/userdata.csv')
coursedata = pd.read_csv('C:/Users/Avani/Documents/MScProj/Data/coursedata.csv')
ratingdata = pd.read_csv('C:/Users/Avani/Documents/MScProj/Data/ratingdata.csv')

In [3]:
ratingdata.head()

Unnamed: 0,emp_id,emp_status,emp_path,course_name,course_id,course_rating
0,02-7699215,Active,Big Data,Azure PaaS_Beginner,3959,1
1,28-3031936,Transition Out,DS,Databases_Beginner,4329,4
2,53-5840917,Active,Cloud,Azure PaaS_Beginner,5016,3
3,58-3106403,Transition In,Web Dev,R Language_Expert,1533,3
4,75-3586738,Active,Big Data,AWS_Advanced,5392,4


Filling the missing values

In [4]:
# Impute missing course_rating with median rating
median_rating = coursedata['course_rating'].median()
coursedata['course_rating'].fillna(median_rating, inplace=True)

Splitting the data

In [5]:
from sklearn.model_selection import train_test_split

# Separating rows where `emp_status` is 'Transition In'
transition_in_data = ratingdata[ratingdata['emp_status'] == 'Transition In']
remaining_data = ratingdata[ratingdata['emp_status'] != 'Transition In']

# Calculating the required test size to achieve an overall 70:30 split when 
# including 'Transition In' rows
total_data = len(ratingdata)
total_test_size = int(0.3 * total_data)  # 30% of total data
# Subtracting the 'Transition In' rows
remaining_test_size = total_test_size - len(transition_in_data)  

# Splitting the remaining data to achieve the desired test size
train_data, test_data_temp = train_test_split(remaining_data, 
                                              test_size=remaining_test_size, 
                                              random_state=42)

# Adding the 'Transition In' rows to the test set
test_data = pd.concat([test_data_temp, transition_in_data], axis=0)

train_data.shape, test_data.shape


((55643, 6), (23846, 6))

### Encoding 

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# One-hot encoding the relevant features from userdata
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(userdata[['emp_path', 'emp_level', 'edu_level']])
encoded_features_df = pd.DataFrame(encoded_features.toarray(), index=userdata['emp_id'])




### User similarity calculation

In [7]:
# Compute cosine similarities between users based on these features
user_feature_similarity = cosine_similarity(encoded_features_df)
user_feature_similarity_df = pd.DataFrame(user_feature_similarity, 
                                          index=encoded_features_df.index, 
                                          columns=encoded_features_df.index)

user_feature_similarity_df.head()

emp_id,02-7699215,28-3031936,53-5840917,58-3106403,75-3586738,79-2925423,08-3543359,94-2433520,37-0865020,17-1643005,...,49-1835007,05-5730112,20-5059638,41-1004569,45-8573486,39-8214360,14-9541015,17-7308435,53-3899747,48-3200612
emp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02-7699215,1.0,0.333333,0.0,0.666667,0.666667,0.666667,0.333333,0.666667,0.333333,0.666667,...,0.0,0.333333,0.333333,0.333333,0.666667,0.0,0.0,0.333333,0.0,0.333333
28-3031936,0.333333,1.0,0.0,0.333333,0.333333,0.333333,1.0,0.333333,0.666667,0.333333,...,0.0,0.0,0.0,0.0,0.666667,0.333333,0.0,0.333333,0.0,0.333333
53-5840917,0.0,0.0,1.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,...,0.333333,0.0,0.0,0.333333,0.0,0.0,0.666667,0.333333,0.333333,0.666667
58-3106403,0.666667,0.333333,0.0,1.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333
75-3586738,0.666667,0.333333,0.333333,0.333333,1.0,0.666667,0.333333,1.0,0.333333,0.666667,...,0.0,0.333333,0.333333,0.333333,0.666667,0.0,0.333333,0.333333,0.0,0.666667


### Model Build

#### Sampling users for testing:

In [8]:
#Sampling a 'transistion in' status employee(user)
sample_transition_in_user = transition_in_data['emp_id'].iloc[9]
userdata[userdata['emp_id'] == sample_transition_in_user]

Unnamed: 0,emp_id,first_name,last_name,email,emp_dob,edu_level,emp_status,emp_level,emp_path,time_per_week,asp_alt_path,skill_1,skill_2,skill_1_1evel,skill_2_1evel
52,28-1178027,Mitchael,Colbourn,mcolbourn1g@w3.org,3/9/1979,Bachelors,Transition In,7,Big Data,5,Big Data,Hadoop,Python,Expert,Expert


In [9]:
#Sampling a 'active' status employee(user)
sample_active_user = remaining_data['emp_id'].iloc[7]
userdata[userdata['emp_id'] == sample_active_user]

Unnamed: 0,emp_id,first_name,last_name,email,emp_dob,edu_level,emp_status,emp_level,emp_path,time_per_week,asp_alt_path,skill_1,skill_2,skill_1_1evel,skill_2_1evel
8,37-0865020,Killy,Cloughton,kcloughton8@harvard.edu,23/06/1992,Bachelors,Active,2,DS,5,DS,R Language,AWS,Intermediate,Intermediate


#### Essential Learning recommendations

In [10]:
# Defining a function that generates recommendations for the first category
def feature_based_recommendations(user_id, top_n_users=60, top_n_courses=6):
    
    # Getting top N most similar users
    similar_users = user_feature_similarity_df[user_id].sort_values(ascending=False)[1:top_n_users+1].index.tolist()
    
    # Getting courses rated by these users
    similar_users_ratings = train_data[train_data['emp_id'].isin(similar_users)]
    top_rated_courses = similar_users_ratings[similar_users_ratings['course_rating'] >= 4]
    
    # Recommending top N courses based on frequency among similar users
    recommended_courses = top_rated_courses['course_name'].value_counts().head(top_n_courses).index.tolist()
    
    return recommended_courses



#### Generating Recommendations for Essential Learning

In [11]:
# Generating recommendations for the sample 'sample_active_user' user
feature_based_recom_actuser = feature_based_recommendations(sample_active_user)

feature_based_recom_actuser

['Machine Learning_Beginner',
 'Machine Learning_Intermediate',
 'R Language_Intermediate',
 'Deep Learning_Beginner',
 'Hadoop_Beginner',
 'R Language_Beginner']

In [12]:
# Generating recommendations for the sample 'Transition In' user
feature_based_recom_transinuser = feature_based_recommendations(sample_transition_in_user)

feature_based_recom_transinuser

['Hadoop_Expert',
 'Hadoop_Intermediate',
 'Hadoop_Advanced',
 'Hadoop_Beginner',
 'Databases_Expert',
 'Azure IaaS_Expert']

#### Comparison of recommendations

In [13]:
# Getting the courses that the sample 'Active' user has actually completed from the testing data
actual_courses_completed_actuser = remaining_data[remaining_data['emp_id'] == sample_active_user]['course_name'].tolist()

# Checking if the recommended courses match with the actual courses
matching_courses = set(feature_based_recom_actuser) & set(actual_courses_completed_actuser)

matching_courses

{'R Language_Beginner', 'R Language_Intermediate'}

In [14]:
# Getting the courses that the sample 'Transition In' user has actually completed from the testing data
actual_courses_completed_transinuser = transition_in_data[transition_in_data['emp_id'] == sample_transition_in_user]['course_name'].tolist()

# Checking if the recommended courses match with the actual courses
matching_courses = set(feature_based_recom_transinuser) & set(actual_courses_completed_transinuser)

matching_courses


{'Hadoop_Advanced', 'Hadoop_Expert'}

#### Popular Learning Recommendations

In [15]:
# Defining a function that generates recommendations of the second category
def popularity_based_recommendations(user_id, top_n=6):
    
    # Getting the user's emp_path
    user_emp_path = userdata[userdata['emp_id'] == user_id]['emp_path'].values[0]
    
    # Filtering ratingdata by emp_path
    same_path_data = train_data[train_data['emp_path'] == user_emp_path]
    
    # Ranking courses by popularity (number of ratings and average rating)
    popular_courses = same_path_data.groupby('course_name').agg({'course_rating': ['count', 'mean']})
    popular_courses.columns = ['number_of_ratings', 'average_rating']
    popular_courses['popularity_score'] = popular_courses['number_of_ratings'] * popular_courses['average_rating']
    
    # Recommending top N courses based on popularity
    recommendations = popular_courses.sort_values(by='popularity_score', ascending=False).head(top_n).index.tolist()
    
    return recommendations



#### Generating Recommendations for Popular Learning

In [16]:

# Generating recommendations for the sample 'Active' user
popularity_recom_sample_actuser = popularity_based_recommendations(sample_active_user)

popularity_recom_sample_actuser

['Machine Learning_Beginner',
 'Machine Learning_Intermediate',
 'Machine Learning_Expert',
 'Machine Learning_Advanced',
 'Python_Beginner',
 'R Language_Intermediate']

In [17]:

# Generating recommendations for the sample 'Transition In' user
popularity_recom_transinuser = popularity_based_recommendations(sample_transition_in_user)

popularity_recom_transinuser

['Hadoop_Intermediate',
 'Hadoop_Beginner',
 'Hadoop_Expert',
 'Hadoop_Advanced',
 'SQL_Advanced',
 'Python_Beginner']

In [18]:
# Getting the courses that the sample 'Active' user has actually completed from the testing data
actual_courses_completed_actuser = remaining_data[remaining_data['emp_id'] == sample_active_user]['course_name'].tolist()

# Checking if the recommended courses match with the actual courses
matching_courses = set(popularity_recom_sample_actuser) & set(actual_courses_completed_actuser)

matching_courses

{'R Language_Intermediate'}

In [19]:
# Getting the courses that the sample 'Transition In' user has actually completed from the testing data
actual_courses_completed_transinuser = transition_in_data[transition_in_data['emp_id'] == sample_transition_in_user]['course_name'].tolist()

# Checking if the recommended courses match with the actual courses
matching_courses = set(popularity_recom_transinuser) & set(actual_courses_completed_transinuser)

matching_courses

{'Hadoop_Advanced', 'Hadoop_Expert', 'Python_Beginner'}

#### Advanced Learning Recommendations

In [20]:
# Defining the function that generates recommendations of the third category
def next_level_recommendations(user_id, top_n=6):
       
    # Got the user's current emp_level
    current_emp_level = userdata[userdata['emp_id'] == user_id]['emp_level'].values[0]
    
    # Determining the next emp_level (assuming emp_level is a continuous range from 1 to 7)
    next_emp_level = min(current_emp_level + 1, 7)  # 7 being the highest emp_level
    
    # Retrieving users of the next emp_level
    users_next_level = userdata[userdata['emp_level'] == next_emp_level]['emp_id'].tolist()
    
    # Filtering ratingdata by users of the next emp_level
    next_level_data = train_data[train_data['emp_id'].isin(users_next_level)]
    
    # Ranking the courses by popularity (number of ratings and average rating)
    popular_courses_next_level = next_level_data.groupby('course_name').agg({'course_rating': ['count', 'mean']})
    popular_courses_next_level.columns = ['number_of_ratings', 'average_rating']
    popular_courses_next_level['popularity_score'] = popular_courses_next_level['number_of_ratings'] * popular_courses_next_level['average_rating']
    
    # Recommending the top N courses based on popularity among the next level users
    recommendations = popular_courses_next_level.sort_values(by='popularity_score', ascending=False).head(top_n).index.tolist()
    
    return recommendations



#### Generating Recommendations for Advanced Learning

In [21]:
# Generating recommendations for the sample 'Active' user
next_level_recom_sample_actuser = next_level_recommendations(sample_active_user)

next_level_recom_sample_actuser


['Python_Beginner',
 'Hadoop_Beginner',
 'SQL_Beginner',
 'Hadoop_Intermediate',
 'Databases_Beginner',
 'Azure IaaS_Beginner']

In [22]:
# Generating recommendations for the sample 'Transition In' user
next_level_recom_sample_transinuser = next_level_recommendations(sample_transition_in_user)

next_level_recom_sample_transinuser

['Hadoop_Expert',
 'SQL_Advanced',
 'Hadoop_Intermediate',
 'Hadoop_Advanced',
 'Databases_Expert',
 'Power BI_Expert']

#### Alternate Learning Recommendations

In [23]:
def asp_alt_path_recommendations(user_id, top_n=6):
   
    # Getting user's asp_alt_Path
    user_asp_alt_path = userdata[userdata['emp_id'] == user_id]['asp_alt_path'].values[0]
    
    # Retrieving users whose emp_path matches the user's asp_alt_path
    users_asp_path = userdata[userdata['emp_path'] == user_asp_alt_path]['emp_id'].tolist()
    
    # Filtering ratingdata by these users
    asp_path_data = train_data[train_data['emp_id'].isin(users_asp_path)]
    
    # Ranking courses by popularity (number of ratings and average rating)
    popular_courses_asp_path = asp_path_data.groupby('course_name').agg({'course_rating': ['count', 'mean']})
    popular_courses_asp_path.columns = ['number_of_ratings', 'average_rating']
    popular_courses_asp_path['popularity_score'] = popular_courses_asp_path['number_of_ratings'] * popular_courses_asp_path['average_rating']
    
    # Recommending top N courses based on popularity
    recommendations = popular_courses_asp_path.sort_values(by='popularity_score', ascending=False).head(top_n).index.tolist()
    
    return recommendations




#### Generating Recommendations for Aspirational Learning

In [24]:
# Generating recommendations for the sample 'Active' user
asp_path_recom_sample_actuser = asp_alt_path_recommendations(sample_active_user)

asp_path_recom_sample_actuser

['Machine Learning_Beginner',
 'Machine Learning_Intermediate',
 'Machine Learning_Expert',
 'Machine Learning_Advanced',
 'Python_Beginner',
 'R Language_Intermediate']

In [25]:
# Generating recommendations for the sample 'Transition In' user
asp_path_recom_sample_transinuser = asp_alt_path_recommendations(sample_transition_in_user)

asp_path_recom_sample_transinuser

['Hadoop_Intermediate',
 'Hadoop_Beginner',
 'Hadoop_Expert',
 'Hadoop_Advanced',
 'SQL_Advanced',
 'Python_Beginner']

## Model Evaluation

In [26]:
# Generating feature-based recommendations for test users
test_users = test_data['emp_id'].unique()
feature_based_rec = {user: feature_based_recommendations(user) for user in test_users}


In [27]:
# The function to calculate metrics
def compute_metrics_for_recommendations(recommendations, actual_courses):
    
    TP = len(set(recommendations) & set(actual_courses))
    FP = len(set(recommendations) - set(actual_courses))
    FN = len(set(actual_courses) - set(recommendations))
    
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    return precision, recall, f1_score

# Calculating accuracy metrics for all users in the test data for feature-based recommendations
precisions = []
recalls = []
f1_scores = []

for user, rec_courses in feature_based_rec.items():
    actual_courses = test_data[test_data['emp_id'] == user]['course_name'].tolist()
    precision, recall, f1 = compute_metrics_for_recommendations(rec_courses, actual_courses)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculating average precision, recall, and F1-score
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1_score = sum(f1_scores) / len(f1_scores)

avg_precision, avg_recall, avg_f1_score


(0.10644893021849232, 0.24899124086014565, 0.1361221614000017)

In [28]:
# Defining the process to compute popular courses for each emp_path
def compute_popular_courses_for_each_path(top_n=6):
    
    # Ranking courses by popularity (number of ratings and average rating) within each emp_path
    popular_courses = train_data.groupby(['emp_path', 'course_name']).agg({'course_rating': ['count', 'mean']})
    popular_courses.columns = ['number_of_ratings', 'average_rating']
    popular_courses['popularity_score'] = popular_courses['number_of_ratings'] * popular_courses['average_rating']
    
    # Getting top N courses for each emp_path based on popularity score
    recommendations = popular_courses.groupby(level=0).apply(lambda x: x.nlargest(top_n, 'popularity_score')).reset_index(level=0, drop=True)
    return recommendations

popularity_rankings = compute_popular_courses_for_each_path()

In [29]:
# optimizing the previously defined popularity recommendations generation function in order to calculate the metrics

def optimized_popularity_based_recommendations(user_id, top_n=6):
       
    # Getting user's emp_path
    user_emp_path = userdata[userdata['emp_id'] == user_id]['emp_path'].values[0]
    
    # Getting the precomputed popular courses for the user's emp_path
    popular_courses_for_path = popularity_rankings.loc[user_emp_path].index.tolist()[:top_n]
    
    return popular_courses_for_path

# Generating popularity-based recommendations for the subset of users in the test data
# as calculating for all the users has been overloading the kernel calculation power.
subset_test_users = test_users[:500]
subset_popularity_based_rec = {user: optimized_popularity_based_recommendations(user) for user in subset_test_users}

# Calculating metrics for this subset's popularity-based recommendations
subset_popularity_precisions = []
subset_popularity_recalls = []
subset_popularity_f1_scores = []

for user, rec_courses in subset_popularity_based_rec.items():
    actual_courses = test_data[test_data['emp_id'] == user]['course_name'].tolist()
    precision, recall, f1 = compute_metrics_for_recommendations(rec_courses, actual_courses)
    subset_popularity_precisions.append(precision)
    subset_popularity_recalls.append(recall)
    subset_popularity_f1_scores.append(f1)

# Calculating average precision, recall, and F1-score for popularity-based recommendations for this test subset
avg_subset_popularity_precision = sum(subset_popularity_precisions) / len(subset_popularity_precisions)
avg_subset_popularity_recall = sum(subset_popularity_recalls) / len(subset_popularity_recalls)
avg_subset_popularity_f1_score = sum(subset_popularity_f1_scores) / len(subset_popularity_f1_scores)

avg_subset_popularity_precision, avg_subset_popularity_recall, avg_subset_popularity_f1_score


(0.09366666666666654, 0.2666333333333333, 0.13403434343434337)