In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Read the raw export once. `df1` stays untouched as the source of truth, while
# `df` is the working copy that later cells modify.
df1 = pd.read_csv("student-scores.csv")
df = df1.copy(deep=True)
df.head(n=5)

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [2]:
# Identifier / contact columns that are not used as model features.
identifier_columns = ['id', 'first_name', 'last_name', 'email']

# Derive `df` from the untouched raw frame instead of dropping in place, so this
# cell can be re-run without a KeyError on already-removed columns.
# Re-running it resets `df` to the raw columns, so re-run the later cells too.
df = df1.drop(columns=identifier_columns)

# Show the remaining columns (only the last expression of a cell is displayed).
df.columns

In [3]:
# Per-subject score columns that feed the aggregate features.
score_columns = [
    "math_score", "history_score", "physics_score", "chemistry_score",
    "biology_score", "english_score", "geography_score",
]

# Add the subjects left to right, then average over the number of subjects.
df["total_score"] = sum(df[column] for column in score_columns)
df["average_score"] = df["total_score"] / len(score_columns)
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


In [4]:
# List the distinct target labels (in order of first appearance) before encoding.
pd.unique(df["career_aspiration"])

array(['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
       'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
       'Banker', 'Writer', 'Accountant', 'Designer',
       'Construction Engineer', 'Game Developer', 'Stock Investor',
       'Real Estate Developer'], dtype=object)

In [5]:
# Integer codes for every categorical column.
gender_map = {'male': 0, 'female': 1}
part_time_job_map = {False: 0, True: 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
        'Lawyer': 0, 'Doctor': 1, 'Government Officer': 2, 'Artist': 3, 'Unknown': 4,
        'Software Engineer': 5, 'Teacher': 6, 'Business Owner': 7, 'Scientist': 8,
        'Banker': 9, 'Writer': 10, 'Accountant': 11, 'Designer': 12,
        'Construction Engineer': 13, 'Game Developer': 14, 'Stock Investor': 15,
        'Real Estate Developer': 16
    }

categorical_maps = {
    'gender': gender_map,
    'part_time_job': part_time_job_map,
    'extracurricular_activities': extracurricular_activities_map,
    'career_aspiration': career_aspiration_map,
}

# Apply mapping to the DataFrame.
# Encode from the untouched raw frame (`df1`) rather than from `df` itself:
# mapping an already-encoded column a second time turns every value into NaN,
# so the original cell silently destroyed the data when re-run.
# Assumes `df` still shares its row index with `df1` (it is a column-subset copy).
for column, mapping in categorical_maps.items():
    encoded = df1[column].map(mapping)

    # Series.map yields NaN for any value absent from the mapping; fail loudly
    # here instead of passing NaN on to later cells.
    unmapped_values = df1.loc[encoded.isna(), column].unique()
    if len(unmapped_values) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no mapping: {list(unmapped_values)}"
        )

    df[column] = encoded.astype("int64")

In [6]:
# Preview the first rows to confirm the categorical columns now hold integer codes.
df.head(n=5)

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,0,0,3,0,27,0,73,81,93,97,63,80,87,574,82.0
1,1,0,2,0,47,1,90,86,96,100,90,88,90,640,91.428571
2,1,0,9,1,13,2,81,97,95,96,65,77,94,605,86.428571
3,1,0,5,0,3,3,71,74,88,80,89,63,86,551,78.714286
4,0,0,5,0,10,4,84,77,65,65,80,74,76,521,74.428571


In [7]:
# Distinct encoded target values, in order of first appearance.
pd.unique(df['career_aspiration'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [8]:
# Rows per encoded career class, most frequent first; shows how imbalanced the target is.
df['career_aspiration'].value_counts()

career_aspiration
5     315
7     309
4     223
9     169
0     138
11    126
1     119
16     83
15     73
13     68
3      67
14     63
2      61
6      59
12     56
8      39
10     32
Name: count, dtype: int64

In [11]:
smote = SMOTE(random_state=42)

x = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']

x_resampled, y_resampled = smote.fit_resample(x, y)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled,y_resampled,test_size=0.2, random_state=42)

In [13]:
# Sanity check: (rows, feature columns) of the training partition.
x_train.shape

(4284, 14)

In [16]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both the training and the testing data.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [17]:
# Sanity check: (rows, columns) of the scaled training array.
x_train_scaled.shape

(4284, 14)

In [18]:
# Fixed seed so the fitted forest, and therefore the metrics below, are
# reproducible across runs.
model = RandomForestClassifier(random_state=42)

model.fit(x_train_scaled, y_train)
# Predict on test set
y_pred = model.predict(x_test_scaled)

# Calculate metrics
print("Accuracy: ",accuracy_score(y_test, y_pred))
# Start the report on its own line; printing it after the label on the same
# line shifted the header row out of alignment with its columns.
print("Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix: \n",confusion_matrix(y_test, y_pred))

Accuracy:  0.8291316526610645
Report:                precision    recall  f1-score   support

           0       0.76      0.84      0.80        68
           1       0.76      0.99      0.86        72
           2       0.75      0.96      0.85        57
           3       0.90      0.95      0.92        58
           4       0.82      0.48      0.61        66
           5       0.48      0.32      0.38        76
           6       0.91      0.99      0.95        71
           7       0.97      0.95      0.96        61
           8       0.76      0.98      0.86        53
           9       0.73      0.70      0.72        61
          10       0.90      0.97      0.93        63
          11       0.91      0.74      0.81        53
          12       0.89      0.87      0.88        68
          13       0.79      0.89      0.84        55
          14       0.92      1.00      0.96        57
          15       0.91      0.79      0.85        63
          16       0.95      0.81      0.8

In [19]:
# test 1
# Spot-check one test row: predict once and compare against its true label.
sample_idx = 10
actual_label = y_test.iloc[sample_idx]
predicted_label = model.predict(x_test_scaled[sample_idx].reshape(1,-1))[0]

print("Actual Label :", actual_label)
print("Model Prediction :",predicted_label)
if actual_label == predicted_label:
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 12
Model Prediction : 12
Wow! Model doing well.....


In [20]:
# test 2
# Spot-check one test row. A single index drives both the printed values and
# the verdict; the original printed row 300 but compared row 10.
sample_idx = 300
actual_label = y_test.iloc[sample_idx]
predicted_label = model.predict(x_test_scaled[sample_idx].reshape(1,-1))[0]

print("Actual Label :", actual_label)
print("Model Prediction :",predicted_label)
if actual_label == predicted_label:
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 0
Model Prediction : 0
Wow! Model doing well.....


In [21]:
# test 3
# Spot-check one test row. A single index drives both the printed values and
# the verdict; the original printed row 23 but compared row 10.
sample_idx = 23
actual_label = y_test.iloc[sample_idx]
predicted_label = model.predict(x_test_scaled[sample_idx].reshape(1,-1))[0]

print("Actual Label :", actual_label)
print("Model Prediction :",predicted_label)
if actual_label == predicted_label:
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 3
Model Prediction : 3
Wow! Model doing well.....
