In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Step 1: Load the dataset from the CSV path below into a DataFrame
csv_path = '/content/drive/MyDrive/Colab Notebooks/Final_data_save.csv'
data = pd.read_csv(csv_path)

R-squared: 0.6233250662880188
Mean Absolute Error: 0.355786489874944
Root Mean Squared Error: 0.5204544844645597


In [None]:
# Define a function to remove outliers using the IQR method
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows that are IQR outliers in any of the given columns.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` for its column, with the quartiles computed
    per column over ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str
        Numeric column(s) to screen. A single column name is accepted.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows with no outlier in ``columns``;
        the original index labels are preserved.

    Notes
    -----
    Rows holding NaN in a screened column are kept: comparisons against
    NaN evaluate to False, so such values are never flagged.
    """
    cols = [columns] if isinstance(columns, str) else list(columns)
    if not cols:
        # Nothing to screen: every row is trivially kept.
        return df.copy()

    subset = df[cols]
    Q1 = subset.quantile(0.25)
    Q3 = subset.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - factor * IQR
    upper_fence = Q3 + factor * IQR

    # Frame-vs-Series comparison aligns the fences with the column labels.
    is_outlier = ((subset < lower_fence) | (subset > upper_fence)).any(axis=1)

    # Return a real copy so callers can add/overwrite columns on the result
    # without SettingWithCopyWarning or accidental write-through.
    return df.loc[~is_outlier].copy()

# Columns screened for IQR outliers: 'Years of Experience', 'YearsCoding', 'CompanySize', and 'Salary'
columns_to_check = [
    'Years of Experience',
    'YearsCoding',
    'CompanySize',
    'Salary',
]
data_cleaned = remove_outliers_iqr(data, columns_to_check)

# Record the (rows, columns) shape before and after the outlier filter
original_shape, cleaned_shape = data.shape, data_cleaned.shape

# Last expression of the cell: shows both shapes side by side
(original_shape, cleaned_shape)


((36826, 7), (27846, 7))

In [None]:
# Preview the first rows of the outlier-filtered frame (head() shows 5 rows by default)
data_cleaned.head()

Unnamed: 0,Country,Education Level,Primary Job Title,Years of Experience,YearsCoding,CompanySize,Salary
2,138,1,0,4.0,7.0,15.0,10.637585
6,123,4,0,1.0,7.0,15.0,10.776975
7,6,1,12,13.0,16.0,3000.0,11.471781
9,140,8,0,19.0,19.0,15.0,11.225257
10,53,4,15,4.0,4.0,300.0,9.301916


In [None]:
# Continue the pipeline on an independent copy of the outlier-filtered frame.
# A plain `data = data_cleaned` only aliases the same object, so any later
# in-place edit of `data` would silently rewrite `data_cleaned` as well.
# NOTE: this rebinds `data`; the frame loaded from the CSV is no longer reachable under that name.
data = data_cleaned.copy()

In [None]:
# Step 2: Handle missing values if any
# Rebuild `data` from `data_cleaned` without mutating anything in place, so
# re-running this cell always starts from the same input instead of applying
# the transforms below to values that were already transformed.
data = data_cleaned.dropna(subset=['Salary']).copy()

# Step 4: Apply log transformation to normalize the salary distribution
# NOTE(review): the recorded preview of data_cleaned shows Salary around 9-11,
# which looks already log-scaled — confirm the CSV stores raw salaries before
# keeping this step, otherwise the target is logged twice.
data['Salary'] = np.log1p(data['Salary'])

# Step 5: Preprocess categorical features
# One fitted encoder is kept per column so the mapping can be reused or inverted later.
label_encoders = {}
for column in ['Country', 'Education Level', 'Primary Job Title']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Step 6: Define features (X) and target (y)
# 'YearsCoding' is present in the frame but is not part of the feature list.
features = ['Country', 'Education Level', 'Primary Job Title', 'Years of Experience', 'CompanySize']
X = data[features]
y = data['Salary']


In [None]:
# Preview `data` after the preprocessing cell to inspect the transformed columns
data.head()

Unnamed: 0,Country,Education Level,Primary Job Title,Years of Experience,YearsCoding,CompanySize,Salary


In [None]:
# Sanity check: number of rows in the feature matrix X.
# A value of 0 means no rows reached this point — TODO investigate the upstream cells / execution order.
print(len(X))

0


In [None]:

# Step 7: Split the data into training and testing sets
# Fail early with an actionable message when upstream cells produced no rows.
if len(X) == 0:
    raise ValueError(
        "X is empty - re-run the loading and preprocessing cells in order "
        "from a fresh kernel before training"
    )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Step 8: Standardize the features
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 9: Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Step 10: Make predictions
y_pred = rf_model.predict(X_test)

# Step 11: Evaluate the model
# These metrics are computed on the log1p-transformed target, i.e. in log units.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Step 12: Reverse the log transformation to get back to original salary scale
y_test = np.expm1(y_test)
y_pred = np.expm1(y_pred)

# Error metrics on the back-transformed values, so the reversal above is
# actually reflected in the report instead of being computed and discarded.
mae_original = mean_absolute_error(y_test, y_pred)
rmse_original = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error (after expm1): {mae_original}")
print(f"Root Mean Squared Error (after expm1): {rmse_original}")

# Optional: Save the model for future use
# import joblib
# joblib.dump(rf_model, 'salary_prediction_model.pkl')

R-squared: 0.6027477075247913
Mean Absolute Error: 0.031484890260042225
Root Mean Squared Error: 0.04590892908211189
