# **Data Pre-Processing**

In [6]:
import pandas as pd

In [7]:
role_skill=pd.read_csv('role_skill.csv')
user_certification=pd.read_csv('user_certification.csv')
user_skill=pd.read_csv('user_skill.csv')

In [8]:
print(role_skill.shape)
print(user_certification.shape)
print(user_skill.shape)

(150, 19)
(0, 30)
(300000, 23)


In [9]:
print(role_skill.columns)


Index(['Unnamed: 0', 'roleId_x', 'skillId', 'name_x', 'role_name',
       'skill_name', '_id_x', 'name_y', 'desc_x', 'created_at_x',
       'updated_at_x', 'roleId_y', 'skill_count', '_id_y', 'name',
       'created_at_y', 'updated_at_y', 'desc_y', 'user_count'],
      dtype='object')


In [10]:
print(user_certification.columns)


Index(['Unnamed: 0.1', '_id_x', 'userId_x', 'certificationId',
       'certificationName', 'started_at', 'completed_at', 'competency',
       'isVerified', 'imageData', 'user_name', 'total_duration', 'count',
       'name_x', 'role_id', 'joining_date', 'department', 'mail', 'created_at',
       'updated_at', 'password', 'profileImage', 'role_name', 'Unnamed: 0',
       '_id', 'name_y', 'issued_by', 'is_certificate', 'role', 'user_count'],
      dtype='object')


In [11]:
print(user_skill.columns)

Index(['Unnamed: 0.1', '_id_x', 'userId_x', 'skillId', 'score', 'user_name',
       'skill_name', 'total_duration', 'count', 'name_x', 'role_id',
       'joining_date', 'department', 'mail', 'created_at_x', 'updated_at_x',
       'password', 'profileImage', 'role_name', 'Unnamed: 0', '_id', 'desc',
       'user_count'],
      dtype='object')


NEW_TRIAL

In [12]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
unique_competency_levels = user_certification['competency'].unique()
competency_mapping = {level: i+1 for i, level in enumerate(sorted(unique_competency_levels))}
print("Dynamic Competency Mapping:", competency_mapping)

# Apply mapping to 'competency' column
user_certification['competency'] = user_certification['competency'].map(competency_mapping)

# Handle missing values after mapping
# user_certification['competency'].fillna(0, inplace=True)

# Verify the transformation
print(user_certification[['competency']].head())
print(user_certification[['competency']].shape)

# Check for duplicates and remove if necessary
user_certification = user_certification.drop_duplicates()

# ------------- Feature Engineering --------------
# Select relevant features for clustering (e.g., user_id and their certification details)
features = user_certification[['userId_x', 'certificationId', 'competency']]

# Pivot the data to create a user-certification matrix
user_cert_matrix = features.pivot_table(index='userId_x', columns='certificationId', values='competency', fill_value=0)

# Standardize the data for clustering
scaler = StandardScaler()
scaled_user_cert_matrix = scaler.fit_transform(user_cert_matrix)

Dynamic Competency Mapping: {}
Empty DataFrame
Columns: [competency]
Index: []
(0, 1)


ValueError: at least one array or dtype is required

In [35]:
user_id = '0f988104-a7b9-43e6-8cd1-1cb63d1a68fa'

# **KNN Role Based recommendation**

In [36]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
le_role = LabelEncoder()
user_skill['role_id'] = le_role.fit_transform(user_skill['role_id'])
role_skill['RoleId'] = le_role.fit_transform(role_skill['roleId_x'])

# Summarize user certifications
user_cert_summary = user_certification.groupby('userId_x')['certificationId'].count().reset_index()
user_cert_summary.columns = ['userId_x', 'certification_count']

# Summarize user skills
user_skill_summary = user_skill.groupby('userId_x')['skillId'].count().reset_index()
user_skill_summary.columns = ['userId_x', 'skill_count']

# Merge with user_certifications
user_features = pd.merge(user_skill_summary, user_cert_summary, on='userId_x', how='left').fillna(0)

# Display the user features
print(user_features.head())

                               userId_x  skill_count  certification_count
0  0001cd17-b1ef-481e-8f16-b86c03222af3           10                    5
1  000897ae-ccc9-469e-92d6-e3ed79b4b369           10                    5
2  000a146b-94c2-45e6-99ac-eb62ac2b022e           10                    5
3  000b0fad-ede6-4786-bee6-b8cefba98cf3           10                    5
4  000b87bb-2672-49e4-a4c5-ceb032b0395c           10                    5


In [5]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load user_skill, role_skill, and user_certification data
user_skill_df = pd.read_csv("user_skill.csv")
role_skill_df = pd.read_csv("role_skill.csv")
user_certification_df = pd.read_csv("user_certification.csv")

# Create user-skill matrix
user_skill_matrix = user_skill_df.pivot_table(index='userId_x', columns='skill_name', values='score', fill_value=0)

# Map users to roles (optional, if needed for further analysis)
user_role_mapping = user_certification_df[['userId_x', 'role_name']]

# Merge user skills with role information
user_features = user_skill_matrix.merge(user_role_mapping, left_index=True, right_on='userId_x', how='left')

# Fill missing values
user_features.fillna(0, inplace=True)

# One-hot encode the 'role_name' column to convert it to numeric values
encoder = OneHotEncoder(sparse_output=False)
role_encoded = encoder.fit_transform(user_features[['role_name']])

# Create a new DataFrame with the encoded roles
role_encoded_df = pd.DataFrame(role_encoded, columns=encoder.get_feature_names_out(['role_name']))

# Concatenate the original features with the encoded role features
user_features = pd.concat([user_features.drop(columns=['role_name', 'userId_x']), role_encoded_df], axis=1)

# Make sure there are no remaining NaN values
if user_features.isnull().any().any():
    print("There are still NaN values. Investigating further.")
else:
    print("No NaN values found.")

# Apply KNN clustering (set n_neighbors to the number of similar users to find)
knn = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(user_features)  # Assuming we want to find 5 similar users
distances, indices = knn.kneighbors(user_features)

# Recommend certifications based on similar users
def recommend_certifications(user_id, top_n=5):
    # Ensure the user ID exists in the dataset
    if user_id not in user_role_mapping['userId_x'].values:
        print(f"User ID {user_id} not found.")
        return []

    # Find the index of the user in the user features DataFrame
    user_index_list = user_role_mapping.index[user_role_mapping['userId_x'] == user_id].tolist()
    
    if len(user_index_list) == 0:
        print(f"User ID {user_id} not found in user features.")
        return []
    
    user_index = user_index_list[0]
    
    # Get the indices of the nearest neighbors (excluding the user themselves)
    similar_user_indices = indices[user_index][1:]  # Exclude the first element (which is the user itself)
    
    # Get the certifications of similar users
    similar_users = user_certification_df[user_certification_df['userId_x'].isin(user_role_mapping.iloc[similar_user_indices]['userId_x'])]
    
    # Recommend the most common certifications among similar users
    recommended_certifications = similar_users['certification_name'].value_counts().head(top_n).index.tolist()
    
    return recommended_certifications

# Example recommendation
user_id = '4cae88bd-a7b8-4090-a135-606f4046e89e'  # Replace with an actual user ID from your dataset
recommended_certifications = recommend_certifications(user_id)
print("Recommended Certifications for User:", recommended_certifications)


  user_features.fillna(0, inplace=True)


There are still NaN values. Investigating further.


ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [10]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load user_skill, role_skill, and user_certification data
user_skill_df = pd.read_csv("user_skill.csv")
role_skill_df = pd.read_csv("role_skill.csv")
user_certification_df = pd.read_csv("user_certification.csv")

# Create user-skill matrix
user_skill_matrix = user_skill_df.pivot_table(index='userId_x', columns='skill_name', values='score', fill_value=0)

# Create role-skill matrix
role_skill_matrix = role_skill_df.pivot_table(index='role_name', columns='skill_name', values='skill_count', fill_value=0)

# Map users to roles
user_role_mapping = user_certification_df[['userId_x', 'role_name']]

# Merge user skills with role information
user_features = user_skill_matrix.merge(user_role_mapping, left_index=True, right_on='userId_x', how='left')

# Fill missing values
user_features.fillna(0, inplace=True)

# One-hot encode the 'role_name' column to convert it to numeric values
encoder = OneHotEncoder(sparse_output=False)
role_encoded = encoder.fit_transform(user_features[['role_name']])

# Create a new DataFrame with the encoded roles
role_encoded_df = pd.DataFrame(role_encoded, columns=encoder.get_feature_names_out(['role_name']))

# Concatenate the original features with the encoded role features
user_features = pd.concat([user_features.drop(columns=['role_name', 'userId_x']), role_encoded_df], axis=1)  # Exclude 'userId_x'

# Apply KNN clustering (set n_neighbors based on the number of unique roles)
nbrs = NearestNeighbors(n_neighbors=user_certification_df['role_name'].nunique(), algorithm='auto').fit(user_features)  # Ensure numeric data
distances, indices = nbrs.kneighbors(user_features)

# Recommend certifications based on user role



Recommended Certifications for User: ['Certified Ethical Hacker', 'CompTIA Security+', 'Certified Information Systems Security Professional', 'Certified Cloud Security Professional', 'CompTIA Cybersecurity Analyst']


In [11]:
def recommend_certifications(user_id, top_n=5):
    # Get the user's role
    user_role = user_role_mapping[user_role_mapping['userId_x'] == user_id]['role_name'].values[0]
    
    # Get certifications for that role (assuming role_certification_schema is defined elsewhere)
    certifications = role_certification_schema.get(user_role, [])
    
    return certifications[:top_n]

# Example recommendation
user_id = 'd8880c16-445c-4985-871d-2dc2c0777803'  # Replace with an actual user ID
recommended_certifications = recommend_certifications(user_id)
print("Recommended Certifications for User:", recommended_certifications)

Recommended Certifications for User: ['Google Data Analytics Professional Certificate', 'AWS Certified Machine Learning - Specialty', 'IBM Certified Data Scientist', 'Certified Analytics Professional', 'Certified Ethical Hacker']


Collaborative Filtering

In [39]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Apply SVD
user_item_matrix = user_certification.pivot_table(index='userId_x', columns='certificationId',
                                                   values='competency', fill_value=0)

svd = TruncatedSVD(n_components=10)
user_item_matrix_svd = svd.fit_transform(user_item_matrix)

# Cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix_svd)

# Recommend certifications based on similar users
def recommend_collaborative(user_id, top_n=5):
    try:
        user_idx = user_item_matrix.index.get_loc(user_id)  # Get user index
    except KeyError:
        print(f"User ID {user_id} not found.")
        return []

    similar_users = user_similarity[user_idx].argsort()[-top_n:][::-1]  # Get indices of similar users

    # Get certifications done by similar users
    similar_user_ids = user_item_matrix.index[similar_users]
    recommended_certifications = user_certification[user_certification['userId_x'].isin(similar_user_ids)]

    # Return top N unique certification names
    return recommended_certifications['certificationName'].value_counts().head(top_n)

# Example usage
user_id = '0f988104-a7b9-43e6-8cd1-1cb63d1a68fa'  # Replace with an actual user ID
recommended_certifications = recommend_collaborative(user_id)
print("Recommended Certifications:", recommended_certifications)

User ID 0f988104-a7b9-43e6-8cd1-1cb63d1a68fa not found.
Recommended Certifications: []


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Create a user-item matrix (users and their certifications)
user_item_matrix = user_certification.pivot_table(index='userId_x', columns='certificationId', values='competency', fill_value=0)

# Use SVD to determine the optimal number of components
explained_variance = []
components_range = range(1, min(user_item_matrix.shape) + 1)

for n_components in components_range:
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(user_item_matrix)
    explained_variance.append(svd.explained_variance_ratio_.sum())

# Plot the explained variance
plt.figure(figsize=(10, 6))
plt.plot(components_range, explained_variance, marker='o')
plt.title('Explained Variance by Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.grid()
plt.axhline(y=0.95, color='r', linestyle='--')  # Line at 95% explained variance
plt.show()

# Choose the number of components dynamically
optimal_n_components = next(x[0] for x in enumerate(explained_variance) if x[1] >= 0.95) + 1

# Apply SVD with the optimal number of components
svd = TruncatedSVD(n_components=optimal_n_components)
user_item_matrix_svd = svd.fit_transform(user_item_matrix)

# Cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix_svd)

# Recommend certifications based on similar users
def recommend_collaborative(user_id, top_n=5):
    user_idx = user_item_matrix.index.get_loc(user_id)
    similar_users = user_similarity[user_idx].argsort()[-top_n:][::-1]

    # Get certifications done by similar users
    similar_user_ids = user_item_matrix.index[similar_users]
    recommended_certifications = user_certification[user_certification['userId_x'].isin(similar_user_ids)]

    return recommended_certifications['certificationName'].value_counts().head(top_n)

# Example usage
user_id = '4cae88bd-a7b8-4090-a135-606f4046e89e'
recommended_courses = recommend_collaborative(user_id)
print(recommended_courses)


Content based filtering

Based on Certification

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Create feature matrix from user certifications
# Assuming competency is numeric, otherwise you'll need to convert it to a suitable format
certification_features = user_certification[['certificationId', 'competency']].groupby('certificationId').mean()

# Cosine similarity for certifications
certification_similarity = cosine_similarity(certification_features)

# Create a mapping from certificationId to index in the feature matrix
certification_index = {certification: index for index, certification in enumerate(certification_features.index)}

# Recommend based on content similarity
def recommend_content_based(user_id, top_n=5):
    user_certifications = user_certification[user_certification['userId_x'] == user_id]['certificationId'].unique()

    # Get the indices of user certifications in the similarity matrix
    indices = [certification_index[cert] for cert in user_certifications if cert in certification_index]

    if not indices:
        print("No valid certifications found for the user.")
        return []

    # Calculate the mean similarity scores for the user's certifications
    similar_certs = certification_similarity[indices].mean(axis=0)

    # Recommend top certifications
    top_cert_indices = similar_certs.argsort()[-top_n:][::-1]
    recommended_certifications = certification_features.index[top_cert_indices]

    return user_certification[user_certification['certificationId'].isin(recommended_certifications)]['certificationName'].unique()

# Example usage
# user_id = '0006406b-24cd-4b7b-896a-20e600045c96'  # Replace with an actual user ID
recommended_certifications = recommend_content_based(user_id)
print("Recommended Certifications for User:", recommended_certifications)


No valid certifications found for the user.
Recommended Certifications for User: []
