# Imports

In [49]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For model evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# Load Dataset

In [42]:
df = pd.read_csv("student-scores.csv")
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


# Data Cleaning and Preprocessing

In [43]:
# drop columns
df.drop(columns=['id','first_name','last_name','email'], inplace=True)
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [44]:
df.isnull().sum()

gender                        0
part_time_job                 0
absence_days                  0
extracurricular_activities    0
weekly_self_study_hours       0
career_aspiration             0
math_score                    0
history_score                 0
physics_score                 0
chemistry_score               0
biology_score                 0
english_score                 0
geography_score               0
dtype: int64

# Create New Feature

In [45]:
df["total_score"] = df["math_score"] + df["history_score"] + df["physics_score"] + df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
df["average_score"] = df["total_score"] / 7
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


# Encoding

In [46]:
cols = ['gender','part_time_job','extracurricular_activities','career_aspiration']

encoder = LabelEncoder()

for col in cols:
    df[col] = encoder.fit_transform(df[col])
    
df


Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,1,0,3,0,27,9,73,81,93,97,63,80,87,574,82.000000
1,0,0,2,0,47,6,90,86,96,100,90,88,90,640,91.428571
2,0,0,9,1,13,8,81,97,95,96,65,77,94,605,86.428571
3,0,0,5,0,3,1,71,74,88,80,89,63,86,551,78.714286
4,1,0,5,0,10,15,84,77,65,65,80,74,76,521,74.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,2,0,30,4,83,77,84,73,75,84,82,558,79.714286
1996,1,0,2,0,20,12,89,65,73,80,87,67,73,534,76.285714
1997,0,0,5,0,14,12,97,85,63,93,68,94,78,578,82.571429
1998,0,1,10,1,5,3,51,96,72,89,95,88,75,566,80.857143


# Scaling: Normalization

In [47]:
# Define the columns to scale
scale_cols = ['absence_days', 'weekly_self_study_hours', 'math_score', 'history_score',
              'physics_score', 'chemistry_score', 'biology_score',
              'english_score', 'geography_score']

scaler = StandardScaler()

df[scale_cols] = scaler.fit_transform(df[scale_cols])
df

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,1,0,-0.253175,0,0.762334,9,-0.790525,0.052463,0.930377,1.331147,-1.208673,-0.106245,0.525321,574,82.000000
1,0,0,-0.633604,0,2.411605,6,0.495250,0.445147,1.169682,1.565986,0.759435,0.559086,0.783168,640,91.428571
2,0,0,2.029397,1,-0.392155,8,-0.185454,1.309054,1.089913,1.252867,-1.062888,-0.355744,1.126964,605,86.428571
3,0,0,0.507682,0,-1.216791,1,-0.941792,-0.497296,0.531536,0.000391,0.686542,-1.520075,0.439372,551,78.714286
4,1,0,0.507682,0,-0.639546,15,0.041447,-0.261685,-1.303134,-1.173804,0.030506,-0.605244,-0.420119,521,74.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,-0.633604,0,1.009725,4,-0.034186,-0.261685,0.212463,-0.547567,-0.333959,0.226421,0.095575,558,79.714286
1996,1,0,-0.633604,0,0.185089,12,0.419616,-1.204128,-0.664988,0.000391,0.540756,-1.187409,-0.677966,534,76.285714
1997,0,0,0.507682,0,-0.309692,12,1.024687,0.366610,-1.462670,1.018028,-0.844209,1.058085,-0.248221,578,82.571429
1998,0,1,2.409825,1,-1.051864,3,-2.454469,1.230517,-0.744756,0.704909,1.123899,0.559086,-0.506068,566,80.857143


# Data Balancing

In [48]:
df['career_aspiration'].value_counts()

career_aspiration
12    315
3     309
15    223
2     169
9     138
0     126
6     119
10     83
13     73
4      68
1      67
7      63
8      61
14     59
5      56
11     39
16     32
Name: count, dtype: int64

In [50]:
# Split the dataframe into separate DataFrames for each class
classes = df['career_aspiration'].unique()
df_list = [df[df['career_aspiration'] == label] for label in classes]

# Find the class with the maximum count
max_size = max([len(sub_df) for sub_df in df_list])

# Resample each class to match the max_size
df_upsampled = [resample(sub_df,
                         replace=True,      # sample with replacement
                         n_samples=max_size, # to match majority class
                         random_state=42)    # reproducible results
                for sub_df in df_list]

# Combine back into a single DataFrame
df = pd.concat(df_upsampled)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['career_aspiration'].value_counts()

career_aspiration
13    315
12    315
6     315
4     315
9     315
16    315
10    315
0     315
1     315
8     315
5     315
11    315
2     315
7     315
3     315
15    315
14    315
Name: count, dtype: int64

# Train Test Split

In [52]:
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (4284, 14)
Test shape: (1071, 14)


# Training, Testing Multiple Classifiers

In [54]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}

# Train and evaluate each model
for name, model in models.items():
    print("="*50)
    print("Model:", name)
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)

Model: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.40149393090569563
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.46      0.44        61
           1       0.41      0.47      0.44        68
           2       0.27      0.13      0.18        60
           3       0.58      0.82      0.68        51
           4       0.40      0.47      0.43        62
           5       0.15      0.07      0.10        70
           6       0.45      0.48      0.47        71
           7       0.57      0.82      0.67        68
           8       0.38      0.27      0.32        67
           9       0.39      0.41      0.40        74
          10       0.31      0.25      0.28        64
          11       0.36      0.42      0.39        62
          12       0.18      0.23      0.20        48
          13       0.36      0.16      0.22        57
          14       0.48      0.77      0.59        57
          15       0.23      0.09      0.13        75
          16       0.39    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.12605042016806722
Classification Report:
               precision    recall  f1-score   support

           0       0.06      0.21      0.09        61
           1       0.00      0.00      0.00        68
           2       0.00      0.00      0.00        60
           3       0.24      0.65      0.35        51
           4       0.00      0.00      0.00        62
           5       0.00      0.00      0.00        70
           6       0.30      0.76      0.43        71
           7       0.00      0.00      0.00        68
           8       0.00      0.00      0.00        67
           9       0.00      0.00      0.00        74
          10       0.00      0.00      0.00        64
          11       0.12      0.24      0.16        62
          12       0.05      0.42      0.09        48
          13       0.00      0.00      0.00        57
          14       0.00      0.00      0.00        57
          15       0.00      0.00      0.00        75
          16       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.20821661998132587
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.00      0.00      0.00        68
           2       0.00      0.00      0.00        60
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        62
           5       0.00      0.00      0.00        70
           6       1.00      0.48      0.65        71
           7       0.00      0.00      0.00        68
           8       0.28      0.43      0.34        67
           9       0.00      0.00      0.00        74
          10       0.15      0.62      0.24        64
          11       0.17      1.00      0.29        62
          12       0.12      0.10      0.11        48
          13       0.00      0.00      0.00        57
          14       0.17      0.61      0.26        57
          15       0.00      0.00      0.00        75
          16       0.40    

# Selecting Best Model

In [55]:
model_rfc = RandomForestClassifier()

model_rfc.fit(X_train,y_train)

y_pred = model_rfc.predict(X_test)



print("confusion matrix \n: ", confusion_matrix(y_test,y_pred))
print("classification report \n: ", classification_report(y_test, y_pred))


confusion matrix 
:  [[59  0  0  0  0  0  0  0  0  0  0  0  1  1  0  0  0]
 [ 0 68  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 50  0  0  0  1  0  0  3  0  1  2  0  1  2  0]
 [ 0  2  0 49  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 62  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 68  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  0 64  0  0  1  0  1  2  0  0  1  0]
 [ 0  3  0  0  0  0  0 65  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 66  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 72  0  0  0  0  0  1  1]
 [ 0  5  0  2  0  0  0  0  0  0 56  0  0  0  0  1  0]
 [ 0  0  0  0  1  0  0  0  0  0  0 61  0  0  0  0  0]
 [ 0  0  5  0  3  0  1  0  0  1  0  0 36  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 55  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 57  0  0]
 [ 0  1  3  2  0  1  0  0  0  2  2  0  5  0  2 55  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 56]]
classification report 
:                precision    recall 

# Saving Model, Encoder, Scaler for production

In [85]:
import pickle

pickle.dump(scaler, open("models/scaler.pkl",'wb'))
pickle.dump(encoder, open("models/encoder.pkl",'wb'))
pickle.dump(model_rfc, open("models/model_rfc.pkl",'wb'))

# Inference (Prediction on new data)

In [83]:
print("Actual Class : ", encoder.inverse_transform([y_test[5010]])[0])
print("Predicted Class : ", encoder.inverse_transform(model_rfc.predict(X_test.loc[5010].values.reshape(1,-1)))[0])


Actual Class :  Construction Engineer
Predicted Class :  Construction Engineer




In [84]:
print("Actual Class : ", encoder.inverse_transform([y_test[401]])[0])
print("Predicted Class : ", encoder.inverse_transform(model_rfc.predict(X_test.loc[401].values.reshape(1,-1)))[0])


Actual Class :  Stock Investor
Predicted Class :  Stock Investor




In [86]:
X_test.columns

Index(['gender', 'part_time_job', 'absence_days', 'extracurricular_activities',
       'weekly_self_study_hours', 'math_score', 'history_score',
       'physics_score', 'chemistry_score', 'biology_score', 'english_score',
       'geography_score', 'total_score', 'average_score'],
      dtype='object')