### Load Dataset

In [None]:
import pandas as pd # type: ignore
df1 = pd.read_csv("student-scores.csv")
df = df1.copy()
df.head()



Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


### Drop irrelevat columns

In [2]:
df.drop(columns=['id','first_name','last_name','email'],axis=1, inplace=True)

### Create new features from all score

In [4]:
df["total_score"] = df["math_score"] + df["history_score"] + df["physics_score"] + df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
df["average_score"] = df["total_score"] / 7
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


In [5]:
len(df['career_aspiration'].unique())

17

### Encoding categorical columns

In [6]:
gender_map = {'male' : 0,'female' : 1}
part_time_job_map = {False: 0, True: 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
    'Lawyer' : 0,'Doctor' : 1, 'Government Officer' : 2, 'Artist' : 3, 'Unknown' : 4,
    'Software Engineer' : 5,'Teacher' : 6, 'Business Owner' : 7, 'Scientist' : 8,
    'Banker' : 9, 'Writer' : 10, 'Accountant' : 11, 'Designer' : 12,
    'Construction Engineer' : 13, 'Game Developer' : 14, 'Stock Investor' : 15,
    'Real Estate Developer' : 16
}

df['gender'] = df['gender'].map(gender_map)
df['part_time_job'] = df['part_time_job'].map(part_time_job_map)
df['extracurricular_activities'] = df['extracurricular_activities'].map(extracurricular_activities_map)
df['career_aspiration'] = df['career_aspiration'].map(career_aspiration_map)

In [7]:
df

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,0,0,3,0,27,0,73,81,93,97,63,80,87,574,82.000000
1,1,0,2,0,47,1,90,86,96,100,90,88,90,640,91.428571
2,1,0,9,1,13,2,81,97,95,96,65,77,94,605,86.428571
3,1,0,5,0,3,3,71,74,88,80,89,63,86,551,78.714286
4,0,0,5,0,10,4,84,77,65,65,80,74,76,521,74.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,2,0,30,13,83,77,84,73,75,84,82,558,79.714286
1996,0,0,2,0,20,5,89,65,73,80,87,67,73,534,76.285714
1997,1,0,5,0,14,5,97,85,63,93,68,94,78,578,82.571429
1998,1,1,10,1,5,7,51,96,72,89,95,88,75,566,80.857143


In [8]:
df.shape

(2000, 15)

### Balance dataset

In [9]:
df['career_aspiration'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [10]:
df['career_aspiration'].value_counts()

career_aspiration
5     315
7     309
4     223
9     169
0     138
11    126
1     119
16     83
15     73
13     68
3      67
14     63
2      61
6      59
12     56
8      39
10     32
Name: count, dtype: int64

In [None]:
from imblearn.over_sampling import SMOTE # type: ignore
smote = SMOTE(random_state=42)
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']
X_resampled, y_resampled = smote.fit_resample(X,y)

In [12]:
y_resampled.value_counts()

career_aspiration
0     315
1     315
2     315
3     315
4     315
5     315
6     315
7     315
8     315
9     315
10    315
11    315
12    315
13    315
14    315
15    315
16    315
Name: count, dtype: int64

### Train test split

In [None]:
from sklearn.model_selection import train_test_split # type: ignore
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.2, random_state=42)

In [14]:
X_train.shape

(4284, 14)

### Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler # type: ignore
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [16]:
X_train_scaled.shape

(4284, 14)

### Models training (multiple models)

In [None]:
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.svm import SVC # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.neighbors import KNeighborsClassifier # type: ignore
from sklearn. tree import DecisionTreeClassifier # type: ignore
from sklearn.naive_bayes import GaussianNB # type: ignore
from sklearn.ensemble import AdaBoostClassifier # type: ignore
from sklearn.ensemble import GradientBoostingClassifier # type: ignore
from xgboost import XGBClassifier # type: ignore
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # type: ignore
import warnings
warnings.filterwarnings("ignore")

models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False,  eval_metric='mlogloss')
}

for name, model in models.items():
    print("="*50)
    print("Model:", name)
    model.fit(X_train_scaled, y_train) # taraining model
    
    y_pred = model.predict(X_test_scaled) # testing model
    
    accuracy = accuracy_score(y_test, y_pred) #calculating
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)



Model: Logistic Regression
Accuracy: 0.48739495798319327
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.54      0.49        68
           1       0.49      0.62      0.55        72
           2       0.42      0.44      0.43        57
           3       0.52      0.57      0.55        58
           4       0.31      0.17      0.22        66
           5       0.32      0.32      0.32        76
           6       0.58      0.92      0.71        71
           7       0.83      0.90      0.87        61
           8       0.41      0.45      0.43        53
           9       0.29      0.10      0.15        61
          10       0.59      0.71      0.65        63
          11       0.44      0.45      0.45        53
          12       0.31      0.16      0.21        68
          13       0.38      0.49      0.43        55
          14       0.61      0.93      0.74        57
          15       0.37      0.24      0.29        63


### Model selection (random forest)

In [18]:
model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("Report: ",classification_report(y_test, y_pred))
print("Confusion Matrix: ",confusion_matrix(y_test, y_pred))

Accuracy:  0.8366013071895425
Report:                precision    recall  f1-score   support

           0       0.77      0.88      0.82        68
           1       0.79      1.00      0.88        72
           2       0.79      0.98      0.88        57
           3       0.89      0.95      0.92        58
           4       0.82      0.42      0.56        66
           5       0.53      0.34      0.42        76
           6       0.93      1.00      0.97        71
           7       0.95      0.93      0.94        61
           8       0.75      0.98      0.85        53
           9       0.69      0.70      0.70        61
          10       0.91      1.00      0.95        63
          11       0.93      0.75      0.83        53
          12       0.92      0.88      0.90        68
          13       0.79      0.89      0.84        55
          14       0.90      1.00      0.95        57
          15       0.91      0.79      0.85        63
          16       0.92      0.83      0.8

### Single input predictions

In [19]:
#test1
print("Actual Label : ",y_test.iloc[10])
print("Model Prediction : ",model.predict(X_test_scaled[10].reshape(1,-1))[0])
if y_test.iloc[10] == model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else: 
    print("not sure..........")

Actual Label :  12
Model Prediction :  12
Wow! Model doing well.....


In [20]:
#test2
print("Actual Label : ",y_test.iloc[300])
print("Model Prediction : ",model.predict(X_test_scaled[300].reshape(1,-1))[0])
if y_test.iloc[10] == model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else: 
    print("not sure..........")

Actual Label :  0
Model Prediction :  0
Wow! Model doing well.....


In [21]:
#test3
#test1
print("Actual Label : ",y_test.iloc[23])
print("Model Prediction : ",model.predict(X_test_scaled[23].reshape(1,-1))[0])
if y_test.iloc[10] == model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else: 
    print("not sure..........")

Actual Label :  3
Model Prediction :  3
Wow! Model doing well.....


### Saving and load files

In [None]:
import pickle
import joblib # type: ignore
pickle.dump(scaler,open("scaler.pkl",'wb'))
pickle.dump(model,open("model_compressed.pkl",'wb'))

In [23]:
scaler = pickle.load(open("scaler.pkl",'rb'))
model = pickle.load(open("model_compressed.pkl",'rb'))

### Recommendation System

In [None]:
import pickle
import numpy as np  # type: ignore
scaler = pickle.load(open("scaler.pkl",'rb'))
model = pickle.load(open("model_compressed.pkl",'rb'))
class_names = ['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
               'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
               'Banker', 'Writer', 'Accountant', 'Designer',
               'Construction Engineer', 'Game Developer', 'Stock Investor',
               'Real Estate Developer']

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score,average_score):

    gender_encoded = 1 if gender.lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0

    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days, extracurricular_activities_encoded,
                               weekly_self_study_hours, math_score, history_score, physics_score,
                               chemistry_score, biology_score, english_score, geography_score,total_score,average_score]])
    
    scaled_features = scaler.transform(feature_array)

    probabilities = model.predict_proba(scaled_features)

    top_classes_idx = np.argsort(-probabilities[0])[:5]
    top_classes_names_probs = [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]

    return top_classes_names_probs

In [25]:
#Example 1
final_recommendations = Recommendations(gender='female',
                                        part_time_job=False,
                                        absence_days=2,
                                        extracurricular_activities=False,
                                        weekly_self_study_hours=7,
                                        math_score=65,
                                        history_score=60,
                                        physics_score=97,
                                        chemistry_score=94,
                                        biology_score=71,
                                        english_score=81,
                                        geography_score=66,
                                        total_score=534,
                                        average_score=76.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")


Top recommended studies with probabilities:
Teacher with probability 0.79
Unknown with probability 0.12
Government Officer with probability 0.02
Business Owner with probability 0.02
Stock Investor with probability 0.02


In [26]:
#Example 2
final_recommendations = Recommendations(gender='female',
                                        part_time_job=False,
                                        absence_days=2,
                                        extracurricular_activities=False,
                                        weekly_self_study_hours=4,
                                        math_score=87,
                                        history_score=73,
                                        physics_score=98,
                                        chemistry_score=91,
                                        biology_score=79,
                                        english_score=60,
                                        geography_score=77,
                                        total_score=583,
                                        average_score=83.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")



Top recommended studies with probabilities:
Artist with probability 0.62
Game Developer with probability 0.23
Real Estate Developer with probability 0.07
Construction Engineer with probability 0.02
Designer with probability 0.01


In [None]:
import sklearn # type: ignore
print(sklearn.__version__)


1.6.1
