<h2>Hybrid Recommendation Model (Content based + Collaborative Filtering)<h2>

In [1]:
import pandas as pd
import numpy as np
import joblib
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df = pd.read_csv("rwanda_students_final_v3.csv")

rf_model = joblib.load("random_forest_model.pkl")  # Classification model

In [3]:
df.head()

Unnamed: 0,student_id,age,gender,school_type,location,math_score,english_score,science_score,history_score,attendance_rate,study_hours_per_week,parental_education_level,household_income,internet_access,recommended_stream,parental_career,extracurricular_activity,interest
0,S0001,18,Male,Private,Urban,88,81,98,71,83,21,Primary,9270,Yes,STEM,Technology,Sports,Humanities
1,S0002,19,Female,Public,Urban,80,63,55,48,70,28,Secondary,9603,Yes,Healthcare,Arts,,Healthcare
2,S0003,17,Female,Private,Urban,41,45,54,77,59,24,Secondary,2860,No,Humanities,Healthcare,,Healthcare
3,S0004,19,Male,Private,Urban,85,77,61,86,63,4,Secondary,7390,No,Healthcare,Technology,Entrepreneurship Club,Healthcare
4,S0005,19,Male,Public,Urban,61,90,91,51,64,21,Secondary,7226,No,Business,Technology,,Business


In [4]:
df["extracurricular_activity"].fillna("None", inplace=True) #replace NaN with None
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["extracurricular_activity"].fillna("None", inplace=True) #replace NaN with None


Unnamed: 0,student_id,age,gender,school_type,location,math_score,english_score,science_score,history_score,attendance_rate,study_hours_per_week,parental_education_level,household_income,internet_access,recommended_stream,parental_career,extracurricular_activity,interest
0,S0001,18,Male,Private,Urban,88,81,98,71,83,21,Primary,9270,Yes,STEM,Technology,Sports,Humanities
1,S0002,19,Female,Public,Urban,80,63,55,48,70,28,Secondary,9603,Yes,Healthcare,Arts,,Healthcare
2,S0003,17,Female,Private,Urban,41,45,54,77,59,24,Secondary,2860,No,Humanities,Healthcare,,Healthcare
3,S0004,19,Male,Private,Urban,85,77,61,86,63,4,Secondary,7390,No,Healthcare,Technology,Entrepreneurship Club,Healthcare
4,S0005,19,Male,Public,Urban,61,90,91,51,64,21,Secondary,7226,No,Business,Technology,,Business


- label Encoding mappings

In [5]:
categorical_cols = ["gender", "school_type", "location", "parental_education_level", 
                    "internet_access", "parental_career", "extracurricular_activity", 'interest', 'recommended_stream']
label_mappings = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_mappings[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

-  Scale features

In [6]:
for col, mapping in label_mappings.items():
    print(f"Label mapping for {col}: {mapping}")

df.head()

Label mapping for gender: {'Female': 0, 'Male': 1}
Label mapping for school_type: {'Private': 0, 'Public': 1}
Label mapping for location: {'Rural': 0, 'Urban': 1}
Label mapping for parental_education_level: {'Primary': 0, 'Secondary': 1, 'Tertiary': 2}
Label mapping for internet_access: {'No': 0, 'Yes': 1}
Label mapping for parental_career: {'Arts': 0, 'Business': 1, 'Education': 2, 'Healthcare': 3, 'Technology': 4}
Label mapping for extracurricular_activity: {'Entrepreneurship Club': 0, 'Music': 1, 'None': 2, 'Science Club': 3, 'Sports': 4}
Label mapping for interest: {'Arts': 0, 'Business': 1, 'Healthcare': 2, 'Humanities': 3, 'STEM': 4}
Label mapping for recommended_stream: {'Arts': 0, 'Business': 1, 'Healthcare': 2, 'Humanities': 3, 'STEM': 4}


Unnamed: 0,student_id,age,gender,school_type,location,math_score,english_score,science_score,history_score,attendance_rate,study_hours_per_week,parental_education_level,household_income,internet_access,recommended_stream,parental_career,extracurricular_activity,interest
0,S0001,18,1,0,1,88,81,98,71,83,21,0,9270,1,4,4,4,3
1,S0002,19,0,1,1,80,63,55,48,70,28,1,9603,1,2,0,2,2
2,S0003,17,0,0,1,41,45,54,77,59,24,1,2860,0,3,3,2,2
3,S0004,19,1,0,1,85,77,61,86,63,4,1,7390,0,2,4,0,2
4,S0005,19,1,1,1,61,90,91,51,64,21,1,7226,0,1,4,2,1


- Compute similarity matrix for collaborative filtering and train k-NN for collaborative filtering

In [7]:
features = df.drop(columns=["student_id", "recommended_stream"])
target = df["recommended_stream"]

In [8]:
# scaler = StandardScaler()
# features_scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

In [13]:
similarity_matrix = cosine_similarity(features)
knn_model = NearestNeighbors(n_neighbors=5, metric="cosine")
knn_model.fit(features)
indices, _ = knn_model.kneighbors(features, n_neighbors=5)

In [14]:
#save models
joblib.dump(indices, "similarity_indices.pkl")
joblib.dump(knn_model, "knn_recommender_model.pkl")
joblib.dump(similarity_matrix, "similarity_matrix.pkl")

['similarity_matrix.pkl']

- Hybrid Recommendation Function : Generates top-3 recommended streams using a hybrid approach. <br>
    1. Content-Based Filtering (Random Forest model) <br>
    2. Collaborative Filtering (k-NN) <br>
    3. Hybrid Ranking

In [10]:
def hybrid_recommend(student_input):
    student_df = pd.DataFrame([student_input])  

    for col in categorical_cols: #encode categorical features
        if col in student_df.columns:
            student_df[col] = student_df[col].map(label_mappings[col]).fillna(-1).astype(int)

    missing_cols = [col for col in features.columns if col not in student_df.columns] #to ensure student_df has the same columns as training features
    for col in missing_cols:
        student_df[col] = 0  

    student_df = student_df[features.columns] #reorder columns to match training order
    # student_scaled = pd.DataFrame(scaler.transform(student_df), columns=features.columns) #scale features

    predicted_stream = rf_model.predict(student_df)[0] #predict stream using content-based filtering (Random Forest)
    distances, indices = knn_model.kneighbors(student_df, n_neighbors=4)  #find similar students using collaborative filtering (k-NN)
    collaborative_recs = target.iloc[indices[0][1:]].tolist()

    #hybrid ranking: ensure 3 unique recommendations
    final_recommendations = []
    for rec in collaborative_recs:
        if rec != predicted_stream and rec not in final_recommendations:
            final_recommendations.append(rec)
        if len(final_recommendations) == 3:
            break

    # if still not up to 3 recommendations, add random one
    if len(final_recommendations) < 3:
        all_possible_streams = target.unique().tolist()
        additional_recs = [s for s in all_possible_streams if s not in final_recommendations]
        random.shuffle(additional_recs)
        final_recommendations.extend(additional_recs[: (3 - len(final_recommendations))])

    return predicted_stream, final_recommendations

- example

In [11]:
new_student = {
    "age": 19,
    "math_score": 50,
    "english_score": 85,
    "science_score": 75,
    "history_score": 90,
    "attendance_rate": 90,
    "study_hours_per_week": 15,
    "household_income": 30000,
    "gender": "Male",
    "school_type": "Private",
    "location": "Urban",
    "parental_education_level": "Tertiary",
    "internet_access": "Yes",
    "parental_career": "Business",
    "extracurricular_activity": "Music",
    "interest": "STEM"
}

# Get hybrid recommendations
predicted, recommendations = hybrid_recommend(new_student)
print(f"Predicted Track: {predicted}")
print(f"Recommended Streams: {recommendations}")

Predicted Track: 2
Recommended Streams: [2, 1, 0]


In [12]:
# Check the type of similarity_matrix
print(f"Type of similarity_matrix: {type(similarity_matrix)}")

# If it's a pandas DataFrame, print the first few rows to inspect
if isinstance(similarity_matrix, pd.DataFrame):
    print(similarity_matrix.head())

# If it's a numpy ndarray, print the shape
elif isinstance(similarity_matrix, np.ndarray):
    print(f"Shape of similarity_matrix: {similarity_matrix.shape}")


Type of similarity_matrix: <class 'numpy.ndarray'>
Shape of similarity_matrix: (1500, 1500)
