1. Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


2. Load Dataset

In [2]:
# Location of the course-recommendation CSV; point this at your own copy if it lives elsewhere.
file_path = "course_recommendation_dataset.csv"

# Read the whole file into a DataFrame that the rest of the notebook works from.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as this cell's output.
df.head(n=5)

Unnamed: 0,Area of Interest,Career Goal,Preferred Learning Style,Recommended Courses
0,Cybersecurity,AR/VR Development,Visual,"Networking Basics, Encryption"
1,E-Commerce Development,Data Engineering,Reading/Writing,Payment Gateway Integration
2,DevOps,Cybersecurity,Collaborative,"CI/CD Basics, Docker, Kubernetes"
3,DevOps,AR/VR Development,Reading/Writing,"CI/CD Basics, Docker, Kubernetes"
4,Computer Vision,E-Commerce Development,Reading/Writing,"Image Processing, Object Detection"


3. Data Preprocessing

In [3]:
# The three categorical inputs used as model features.
feature_names = ['Area of Interest', 'Career Goal', 'Preferred Learning Style']

# Target: map each distinct 'Recommended Courses' string to an integer class id.
# The fitted encoder is kept in `label_encoder` so the ids can be decoded later.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df['Recommended Courses'])
df['Recommended Courses Encoded'] = encoded_target

# Features: one indicator column per (feature, category) pair via one-hot encoding.
X = pd.get_dummies(df[feature_names], columns=feature_names)

# Target vector, read back from the column added above.
y = df['Recommended Courses Encoded']


In [4]:
# Collect the distinct values found in the 'Area of Interest' column and show them.
interest_column = df['Area of Interest']
unique_interests = interest_column.unique()
print(f"Unique Areas of Interest: {unique_interests}")

Unique Areas of Interest: ['Cybersecurity' 'E-Commerce Development' 'DevOps' 'Computer Vision'
 'Cloud Computing' 'Quantum Computing' 'App Development'
 'Data Engineering' 'Data Science' 'Artificial Intelligence'
 'Big Data Analytics' 'Game Development' 'Natural Language Processing'
 'Full-Stack Development' 'Machine Learning' 'IoT Development'
 'Networking' 'Blockchain' 'UI/UX Design' 'Embedded Systems'
 'AR/VR Development' 'Business Intelligence' 'Web Development'
 'Software Testing' 'Digital Marketing']


In [5]:
# Collect the distinct values found in the 'Career Goal' column and show them.
goal_column = df['Career Goal']
unique_goals = goal_column.unique()
print(f"Unique Career Goals: {unique_goals}")

Unique Career Goals: ['AR/VR Development' 'Data Engineering' 'Cybersecurity'
 'E-Commerce Development' 'IoT Development' 'Big Data Analytics'
 'Networking' 'Web Development' 'Digital Marketing' 'Blockchain'
 'Software Testing' 'Embedded Systems' 'Artificial Intelligence'
 'Game Development' 'Natural Language Processing' 'App Development'
 'Business Intelligence' 'Machine Learning' 'UI/UX Design'
 'Full-Stack Development' 'DevOps' 'Cloud Computing' 'Computer Vision'
 'Quantum Computing' 'Data Science']


In [6]:
# Collect the distinct values found in the 'Preferred Learning Style' column and show them.
learning_style_column = df['Preferred Learning Style']
unique_learning_styles = learning_style_column.unique()
print(f"Unique Learning Styles: {unique_learning_styles}")

Unique Learning Styles: ['Visual' 'Reading/Writing' 'Collaborative' 'Hands-on']


In [7]:
# Print how often each category occurs, one feature column at a time,
# to check how evenly the categories are represented.
for column_name in ('Area of Interest', 'Career Goal', 'Preferred Learning Style'):
    print(df[column_name].value_counts())

Area of Interest
AR/VR Development              1073
Business Intelligence          1040
Natural Language Processing    1032
Quantum Computing              1028
IoT Development                1023
Embedded Systems               1020
Machine Learning               1019
Web Development                1017
Data Engineering               1014
E-Commerce Development         1010
Data Science                   1010
Software Testing               1010
App Development                1008
Cloud Computing                1003
Blockchain                     1002
Cybersecurity                   995
Big Data Analytics              990
Artificial Intelligence         988
Game Development                984
Computer Vision                 983
UI/UX Design                    981
Digital Marketing               980
DevOps                          946
Networking                      944
Full-Stack Development          900
Name: count, dtype: int64
Career Goal
DevOps                         1047
Digital M

4. Split the Dataset

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

5. Train and Evaluate Models
Model 1: Logistic Regression

In [9]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 200 iterations. fit() returns the estimator itself.
logistic_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Score the held-out rows: fraction of test samples whose class is predicted exactly.
logistic_predictions = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_true=y_test, y_pred=logistic_predictions)
print(f"Logistic Regression Accuracy: {logistic_accuracy:.2f}")


Logistic Regression Accuracy: 1.00


Model 2: Random Forest

In [10]:
# Random forest of 100 trees; random_state is pinned so the fitted forest is reproducible.
forest_params = {'n_estimators': 100, 'random_state': 42}
random_forest_model = RandomForestClassifier(**forest_params)
random_forest_model.fit(X_train, y_train)

# Score the held-out rows: fraction of test samples whose class is predicted exactly.
rf_predictions = random_forest_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")


Random Forest Accuracy: 1.00


Model 3: Support Vector Machine (SVM)

In [11]:
# Support vector classifier with a linear kernel, fitted on the training split.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out rows: fraction of test samples whose class is predicted exactly.
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print(f"Support Vector Machine Accuracy: {svm_accuracy:.2f}")


Support Vector Machine Accuracy: 1.00


Model 4: K-Nearest Neighbors (KNN)

In [12]:
# k-nearest-neighbours classifier that votes among the 5 closest training rows.
neighbor_count = 5
knn_model = KNeighborsClassifier(n_neighbors=neighbor_count)
knn_model.fit(X_train, y_train)

# Score the held-out rows: fraction of test samples whose class is predicted exactly.
knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_predictions)
print(f"K-Nearest Neighbors Accuracy: {knn_accuracy:.2f}")


K-Nearest Neighbors Accuracy: 1.00


6. Save the Best Model

In [13]:
# Fitted estimators, keyed by the display name used in messages and in the file name.
trained_models = {
    'Logistic Regression': logistic_model,
    'Random Forest': random_forest_model,
    'SVM': svm_model,
    'KNN': knn_model
}

# Test-set accuracy of each estimator, under the same keys and in the same order.
accuracies = {
    'Logistic Regression': logistic_accuracy,
    'Random Forest': rf_accuracy,
    'SVM': svm_accuracy,
    'KNN': knn_accuracy
}

# Pick the name with the highest accuracy. max() keeps the first maximal entry it
# meets, so when several models tie the one listed earliest in `accuracies` wins.
best_model_name = max(accuracies, key=lambda name: accuracies[name])
best_model = trained_models[best_model_name]

# Serialize the winning estimator; its display name is embedded in the file name.
model_filename = f"best_course_recommendation_model_{best_model_name}.joblib"
joblib.dump(best_model, model_filename)
print(f"Best model '{best_model_name}' saved successfully.")

Best model 'Logistic Regression' saved successfully.


In [14]:
# Extra artifacts needed at prediction time, each as (object, file name, confirmation):
#   - the fitted label encoder, to turn predicted class ids back into course strings;
#   - the one-hot feature columns, so new inputs can be aligned to the training layout.
artifacts = (
    (label_encoder, 'label_encoder.joblib', "Label encoder saved successfully."),
    (X.columns, 'feature_columns.joblib', "Feature columns saved successfully."),
)

# Write each artifact with joblib and confirm it, in the order listed above.
for artifact, artifact_path, confirmation in artifacts:
    joblib.dump(artifact, artifact_path)
    print(confirmation)


Label encoder saved successfully.
Feature columns saved successfully.
