In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the raw dataset into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, and the file name
# ends in a doubled ".csv.csv" -- confirm both are intentional.
data = pd.read_csv('/Users/sanams/Downloads/jobt_Test/data/dataset.csv.csv')

# Fill missing values column by column: object-dtype columns get their most
# frequent value, every other column gets its mean.
#
# The filled Series is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`. That chained in-place form operates
# on an intermediate object; pandas emits a FutureWarning for it and it stops
# updating `data` altogether in pandas 3.0.
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].fillna(data[col].mode()[0])
    else:
        data[col] = data[col].fillna(data[col].mean())

# Replace each listed categorical column with integer codes, keeping the
# fitted encoder for every column in `label_encoders` (keyed by column name).
label_encoders = {}
for col in ('Gender', 'Field_of_Study'):
    le = label_encoders[col] = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Persist the fitted encoders one directory above the working directory.
with open('../label_encoders.pkl', mode='wb') as f:
    pickle.dump(label_encoders, f)

# Columns used as model inputs, in the order they are fed to the estimators.
feature_order = [
    "University_Ranking",
    "University_GPA",
    "Field_of_Study",
    "Projects_Completed",
    "Internships_Completed",
    "Certifications",
]

# Target column and feature matrix.
y = data['Current_Job_Level']
X = data[feature_order]

# Persist the feature order alongside the other pickled artifacts.
with open('../feature_order.pkl', mode='wb') as f:
    pickle.dump(feature_order, f)

# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.25,
)

# Fit both candidate regressors on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Random forest limited to depth-2 trees, with a fixed seed.
rf = RandomForestRegressor(max_depth=2, random_state=42).fit(X_train, y_train)

# Candidate models keyed by the display name used in the results table.
models = {'Linear Regression': lr, 'Random Forest': rf}

# Score every candidate on the held-out split, one result row per model.
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results.append({'Model': name, 'Test MSE': mse, 'Test R2': r2})

results_df = pd.DataFrame(results)
print(results_df)

# The winner is the row with the smallest test MSE.
lowest_mse_row = results_df['Test MSE'].idxmin()
best_model_name = results_df.loc[lowest_mse_row, 'Model']
print(f"\n✅ Best model: {best_model_name}")
best_model = models[best_model_name]

# Persist the selected estimator one directory above the working directory.
with open('../model.pkl', mode='wb') as f:
    pickle.dump(best_model, f)

print("✅ Best model saved as model.pkl")


               Model  Test MSE   Test R2
0  Linear Regression  1.384431 -0.003271
1      Random Forest  1.384965 -0.003658

✅ Best model: Linear Regression
✅ Best model saved as model.pkl


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
