Import necessary libraries for model training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


Load data from a CSV file into a pandas DataFrame

In [2]:
# Read the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific path (and the file name ends in
# ".csv.csv") — confirm it is intended; a configurable data directory would be more portable.
df = pd.read_csv("/Users/sanams/Downloads/jobt_Test_2/data/dataset.csv.csv")

# Remove every column whose header starts with 'Unnamed:' before using the frame.
unnamed_cols = [name for name in df.columns if name.startswith('Unnamed:')]
df = df.drop(columns=unnamed_cols)

# Show the distinct values present in the Field_of_Study column.
print(df["Field_of_Study"].unique())

['Arts' 'Law' 'Medicine' 'Computer Science' 'Business' 'Mathematics'
 'Engineering']


Build the preprocessing pipeline and train the model

In [3]:
# Target: the Current_Job_Level column.
y = df['Current_Job_Level']

# Features: every remaining column except Student_ID and the target itself.
X = df.drop(columns=['Student_ID', 'Current_Job_Level'])

# Map the target labels to integer codes; the fitted encoder is kept so the
# codes can be translated back to the original labels later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


# Split the feature columns into two groups: the listed categorical columns,
# and everything else in X (kept in X's original column order).
categorical_cols = ['Gender', 'Field_of_Study']
numerical_cols = [name for name in X.columns if name not in categorical_cols]
# NOTE(review): assumes every non-categorical column in X is numeric — confirm.

# Numeric branch: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy='median')

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


# Random forest with a fixed seed for reproducible fits; class_weight='balanced'
# reweights classes according to their frequency in the training labels.
rf_classifier = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=3,
)

# Full model: preprocessing followed by the classifier, so raw feature frames
# can be passed directly to fit / predict.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])


# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — consider stratify=y_encoded if the
# classes are imbalanced (class_weight='balanced' suggests they may be); confirm
# every class has at least two rows before enabling it.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Fit preprocessing and classifier on the training rows only.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows so the model gets a basic
# sanity check before it is persisted. For a classifier pipeline, score()
# reports mean accuracy.
test_accuracy = pipeline.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")


Create the pkl files (trained pipeline and label encoder)

In [4]:
# Destination files for the fitted pipeline and the target label encoder.
# NOTE(review): absolute, user-specific paths — confirm or make configurable.
model_path = '/Users/sanams/Downloads/jobt_Test_2/model.pkl'
encoder_path = '/Users/sanams/Downloads/jobt_Test_2/label_encoder.pkl'

# Persist both artifacts; the encoder is saved alongside the model so integer
# predictions can be mapped back to the original labels.
joblib.dump(pipeline, model_path)
joblib.dump(label_encoder, encoder_path)

['/Users/sanams/Downloads/jobt_Test_2/label_encoder.pkl']