In [5]:
import os
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
pip install -U scikit-learn imbalanced-learn


Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/96/a2/cbfb5743de748d574ffdfd557e9cb29ba4f8b8a3e07836c6c176f713de2f/scikit_learn-1.6.0-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.6.0-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/d8/0d/c3bfccc5d460eec8ff56889802aa88f5d07280d5282b307a74558e6edc44/imbalanced_learn-0.12.4-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=3.1.0 from https://files.pythonhosted.org/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl.metadata
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'd:\\anaconda\\lib\\site-packages\\sklearn\\__check_build\\_check_build.cp311-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [3]:
# Show which scikit-learn version the kernel actually imports.
import sklearn
print(sklearn.__version__)


1.3.2


In [6]:
# Load dataset
# `df1` holds the table exactly as read from disk; `df` is an independent deep
# copy that the following cells are free to modify.
df1 = pd.read_csv(filepath_or_buffer="student-scores.csv")
df = df1.copy(deep=True)

In [7]:
# Feature engineering
# `df` is rebuilt from the untouched `df1` rather than mutated in place, so this
# cell can be re-run without a KeyError from columns that were already dropped.
score_columns = [
    "math_score", "history_score", "physics_score", "chemistry_score",
    "biology_score", "english_score", "geography_score",
]

# Identifier columns are removed; everything else is kept.
df = df1.drop(columns=['id', 'first_name', 'last_name', 'email'])

# skipna=False keeps the behaviour of plain `+`: a missing score makes the
# total (and therefore the average) missing instead of silently shrinking it.
df["total_score"] = df[score_columns].sum(axis=1, skipna=False)
df["average_score"] = df["total_score"] / len(score_columns)

In [8]:
# Define mapping dictionaries for categorical features
# Each dictionary translates a raw category value into its integer code.
gender_map = dict(male=0, female=1)
part_time_job_map = dict(zip((False, True), (0, 1)))
extracurricular_activities_map = dict(zip((False, True), (0, 1)))

# A career's code is its position in this list.
career_aspiration_map = {
    career: code
    for code, career in enumerate([
        'Lawyer',
        'Doctor',
        'Government Officer',
        'Artist',
        'Unknown',
        'Software Engineer',
        'Teacher',
        'Business Owner',
        'Scientist',
        'Banker',
        'Writer',
        'Accountant',
        'Designer',
        'Construction Engineer',
        'Game Developer',
        'Stock Investor',
        'Real Estate Developer',
    ])
}

In [9]:
# Apply mapping
# Series.map() turns every value that has no dictionary entry into NaN without
# complaint (for example a differently-cased label, or this cell being run a
# second time on already-encoded data). All columns are therefore encoded and
# validated first, and `df` is only modified once every column is clean.
_categorical_maps = {
    'gender': gender_map,
    'part_time_job': part_time_job_map,
    'extracurricular_activities': extracurricular_activities_map,
    'career_aspiration': career_aspiration_map,
}

_encoded_columns = {
    _col: df[_col].map(_mapping) for _col, _mapping in _categorical_maps.items()
}

for _col, _codes in _encoded_columns.items():
    # Values that were already missing in the input are left as missing;
    # only values that *became* missing through the mapping are errors.
    _unmapped = df.loc[_codes.isna() & df[_col].notna(), _col].unique()
    if len(_unmapped):
        raise ValueError(
            f"Column {_col!r} contains values with no entry in its mapping: "
            f"{list(_unmapped)!r}. Was this cell already run on this DataFrame?"
        )

for _col, _codes in _encoded_columns.items():
    df[_col] = _codes

In [10]:
# Handling class imbalance using SMOTE
# Target is the encoded career aspiration; every other column is a feature.
y = df['career_aspiration']
X = df.drop(columns='career_aspiration')

# NOTE(review): this oversamples the complete dataset. If these resampled
# arrays are what gets split into train/test, synthetic points interpolated
# from test rows end up in the training data; resampling only the training
# portion avoids that.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [11]:
# Split the dataset into training and testing sets
# The ORIGINAL rows are split first and only the training portion is
# oversampled. Splitting data that was already oversampled would put synthetic
# rows in the test set and let points interpolated from test rows into
# training, which inflates any score measured on the test set.
# `X_resampled` / `y_resampled` (the full-data oversample) are deliberately
# not used here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y,  # keep every career represented in both parts
)

# NOTE(review): SMOTE needs more samples per class than its neighbour count
# in the training part; confirm the rarest career still satisfies that.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)


In [12]:
# Standardize the features
# The scaling statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
# Train the model
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
model

In [14]:
# Save the model, scaler, and metadata (class names)
# The class-name list is derived from `career_aspiration_map` so that
# class_names[code] is always the career that was encoded as `code`; the two
# can no longer drift apart.
metadata = {
    'class_names': sorted(career_aspiration_map, key=career_aspiration_map.get)
}

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("Models", exist_ok=True)

# `with` closes each file (and flushes it to disk) as soon as it is written.
for _artifact, _path in (
    (scaler, "Models/scaler.pkl"),
    (model, "Models/model.pkl"),
    (metadata, "Models/metadata.pkl"),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_artifact, _fh)

print("Model, scaler, and metadata saved successfully.")

Model, scaler, and metadata saved successfully.


In [15]:
# Hybrid Recommendation System
def hybrid_recommendations(features, content_based_data):
    """Merge classifier-based and content-based career recommendations.

    Parameters
    ----------
    features : sequence with at least 15 entries
        [0]       gender as a string; 'female' (any casing) encodes to 1,
                  anything else to 0
        [1]       part-time-job flag (truthy -> 1)
        [2]       extracurricular-activities flag (truthy -> 1)
        [3]-[13]  eleven numeric model inputs, passed through unchanged
        [14]      index into the saved class names identifying the user's
                  current career aspiration; used only for the content-based
                  lookup, never as a model input
    content_based_data : mapping of career name -> pandas Series
        ``content_based_data[name]`` must return a Series of scores indexed by
        career name (for example a column of a similarity DataFrame).

    Returns
    -------
    list of (str, float)
        Up to five (career, probability) pairs from the classifier, highest
        probability first, followed by those of the five best content-based
        (career, score) pairs whose career is not already in the list.

    Raises
    ------
    ValueError
        If fewer than 15 features are supplied.
    """
    if len(features) < 15:
        raise ValueError("Not enough features provided. Expected 15 features.")

    def _load(path):
        # NOTE: pickle.load can execute arbitrary code; only load artifacts
        # that were written by this notebook.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    scaler = _load("Models/scaler.pkl")
    model = _load("Models/model.pkl")
    metadata = _load("Models/metadata.pkl")

    class_names = metadata['class_names']

    # Encode categorical variables
    gender_encoded = 1 if features[0].lower() == 'female' else 0
    part_time_job_encoded = 1 if features[1] else 0
    extracurricular_activities_encoded = 1 if features[2] else 0

    # Same column order as before, but features[14] (the career index, i.e.
    # the label itself) is no longer appended: the model input has 14 columns.
    feature_array = np.array([[
        gender_encoded, part_time_job_encoded, features[3],
        extracurricular_activities_encoded, *features[4:14],
    ]], dtype=float)

    scaled_features = scaler.transform(feature_array)

    # Classifier part: the five most probable careers. predict_proba columns
    # follow model.classes_, so go through it to get the encoded label
    # instead of assuming column position == label.
    probs = model.predict_proba(scaled_features)[0]
    top_positions = np.argsort(-probs)[:5]
    recommendations = [
        (class_names[int(model.classes_[pos])], probs[pos])
        for pos in top_positions
    ]

    # Content-based part: the five best-scoring careers for the user's
    # current aspiration.
    user_career = class_names[features[14]]
    content_recommendations = (
        content_based_data[user_career].sort_values(ascending=False).head(5)
    )

    # Add content-based recommendations and ensure no duplication
    seen = {name for name, _ in recommendations}
    for class_name, score in content_recommendations.items():
        if class_name not in seen:
            recommendations.append((class_name, score))
            seen.add(class_name)

    return recommendations

In [19]:
# Example input, laid out the way hybrid_recommendations indexes it:
#   [0] gender, [1] part-time-job flag, [2] extracurricular-activities flag,
#   [3]-[13] the eleven numeric model inputs (the last two are total_score and
#   average_score: 534 is the sum of the seven scores before it, 76.29 is 534 / 7),
#   [14] career-aspiration index.
# The list previously had no extracurricular flag, so the numeric 2 at index 2
# was read as that flag and every later value landed one column too early.
# True keeps the encoded value (1) that the truthy 2 used to produce.
features = ['female', False, True, 2, 7, 65, 60, 97, 94, 71, 81, 66, 534, 76.29, 0]

# Ensure all necessary features are present
if len(features) < 15:
    raise ValueError("Not enough features provided. Expected 15 features.")

# Example feature extraction and preprocessing
gender_encoded = 1 if features[0].lower() == 'female' else 0  # 'female' -> 1, anything else -> 0
part_time_job_encoded = 1 if features[1] else 0  # Boolean -> 1 or 0
extracurricular_activities_encoded = 1 if features[2] else 0  # Boolean -> 1 or 0

# Model input: 14 values. features[14] (the career index) is deliberately left out.
feature_array = np.array([[
    gender_encoded, part_time_job_encoded, features[3],
    extracurricular_activities_encoded, *features[4:14],
]])

# Scale the features using StandardScaler
scaled_features = scaler.transform(feature_array)






In [31]:
# Show the current contents of the example feature list.
print("Features:", features)


Features: ['female', False, 2, 7, 65, 60, 97, 94, 71, 81, 66, 534, 76.29]


In [39]:
# Features list: gender, part-time-job flag, two numeric inputs, seven subject
# scores, then total_score (534 = sum of the seven) and average_score (534 / 7).
features = ['female', False, 2, 7, 65, 60, 97, 94, 71, 81, 66, 534, 76.29]

# `features` has no extracurricular-activities entry, so it is stated here
# explicitly instead of reusing an encoded value left over from another cell.
# True reproduces the 1 shown in this cell's recorded output.
extracurricular_activities = True

# Encode the categorical inputs from THIS cell's values only.
gender_encoded = 1 if features[0].lower() == 'female' else 0
part_time_job_encoded = 1 if features[1] else 0
extracurricular_activities_encoded = 1 if extracurricular_activities else 0

# Feature extraction: one row of 14 model inputs, with the extracurricular
# flag in the fourth position.
feature_array = np.array([[
    gender_encoded, part_time_job_encoded, features[2],
    extracurricular_activities_encoded, *features[3:13],
]])

print("Feature array:", feature_array)



Feature array: [[  1.     0.     2.     1.     7.    65.    60.    97.    94.    71.
   81.    66.   534.    76.29]]


In [36]:
# Show the current contents of the example feature list.
print(f"Features List: {features}")


Features List: ['female', False, 2, 7, 65, 60, 97, 94, 71, 81, 66, 534, 76.29]
