<a href="https://colab.research.google.com/github/sainitin20/Personal-Portfolio-main/blob/main/Salary_Predicition_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import io

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib
from google.colab import files

# Upload the dataset
print("Please upload the 'expected_ctc.csv' file:")
uploaded = files.upload()

# Load the dataset from the bytes returned by the upload widget rather than
# from a fixed path on disk. When a file with the same name already exists in
# the working directory, Colab stores the new upload under a suffixed name
# (this notebook's own output shows "salary_prediction_model (1).pkl" for a
# later upload), so reading 'expected_ctc.csv' by path on a re-run would
# silently pick up the older copy instead of the file just uploaded.
if 'expected_ctc.csv' not in uploaded:
    raise FileNotFoundError("Please upload a file named 'expected_ctc.csv'")
data = pd.read_csv(io.BytesIO(uploaded['expected_ctc.csv']))

# Column the model is trained to predict
target = 'Expected_CTC'

# Every input column selected from the raw frame, in selection order
features = [
    'Total_Experience',
    'Total_Experience_in_field_applied',
    'Department',
    'Role',
    'Industry',
    'Education',
    'Graduation_Specialization',
    'Passing_Year_Of_Graduation',
    'Last_Appraisal_Rating',
    'No_Of_Companies_worked',
    'Number_of_Publications',
    'Certifications',
    'International_degree_any',
    'Current_CTC',
]

# Columns treated as numbers by the preprocessing step
numeric_features = [
    'Total_Experience',
    'Total_Experience_in_field_applied',
    'Passing_Year_Of_Graduation',
    'No_Of_Companies_worked',
    'Number_of_Publications',
    'Certifications',
    'International_degree_any',
    'Current_CTC',
]

# Columns treated as categories by the preprocessing step
categorical_features = [
    'Department',
    'Role',
    'Industry',
    'Education',
    'Graduation_Specialization',
    'Last_Appraisal_Rating',
]

# Missing-value strategies: column mean for numbers, modal value for categories
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Numeric branch: fill gaps, then standardise
numeric_pipeline = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps, then one-hot encode into a dense array.
# handle_unknown='ignore' means a category not seen at fit time does not raise.
categorical_pipeline = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch; 'num' output columns precede 'cat' ones
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full estimator: preprocessing followed by a seeded 100-tree random forest
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Inputs and target taken from the loaded frame
X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing + regressor on the training rows only
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Compute the three regression metrics in one pass over the metric functions
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report all metrics, one per line, rounded to two decimals
print("\n".join([
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R2): {r2:.2f}",
]))

# Per-column importances reported by the fitted random forest
feature_importance = model.named_steps['regressor'].feature_importances_

# Rebuild the column names seen by the regressor: numeric columns first,
# then the one-hot columns produced by the fitted encoder
fitted_preprocessor = model.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
encoded_names = fitted_encoder.get_feature_names_out(categorical_features)
feature_names = [*numeric_features, *encoded_names]

# Pair names with importances and rank from most to least important
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Show the ten highest-ranked columns
print("\nFeature Importance:")
print(importance_df.head(10))

# Persist the fitted pipeline (preprocessing + regressor) to the working directory
model_path = 'salary_prediction_model.pkl'
joblib.dump(model, model_path)
print("Model saved as 'salary_prediction_model.pkl'")

# Trigger a browser download of the saved file
files.download(model_path)

Please upload the 'expected_ctc.csv' file:


Saving expected_ctc.csv to expected_ctc.csv
Mean Absolute Error (MAE): 24132.57
Mean Squared Error (MSE): 1771054670.43
R-squared (R2): 1.00

Feature Importance:
                                Feature  Importance
7                           Current_CTC    0.985286
72              Last_Appraisal_Rating_C    0.003863
73              Last_Appraisal_Rating_D    0.003792
55                  Education_Doctorate    0.003615
57                         Education_PG    0.000745
2            Passing_Year_Of_Graduation    0.000395
74  Last_Appraisal_Rating_Key_Performer    0.000390
4                Number_of_Publications    0.000371
0                      Total_Experience    0.000308
71              Last_Appraisal_Rating_B    0.000191
Model saved as 'salary_prediction_model.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
# `files` is already imported in the first cell; repeating the import lets this
# cell run without that cell having been executed.
from google.colab import files
# Opens the browser file picker and blocks until the upload finishes.
# Note: as this cell's recorded output shows, when a file with the same name is
# already present Colab saves the upload under a suffixed name such as
# 'salary_prediction_model (1).pkl' instead of overwriting the existing file.
uploaded = files.upload()  # Select 'salary_prediction_model.pkl' from your laptop

Saving salary_prediction_model.pkl to salary_prediction_model (1).pkl


In [None]:
import io
import joblib

MODEL_FILENAME = 'salary_prediction_model.pkl'

# SECURITY: joblib.load unpickles its input, and unpickling can execute
# arbitrary code. Only load model files you created yourself or fully trust.

# Prefer the bytes returned by files.upload() when they are available. If a
# file named MODEL_FILENAME already exists on disk, Colab saves the new upload
# under a suffixed name (e.g. 'salary_prediction_model (1).pkl'), so loading by
# the fixed path would silently return the older on-disk model instead of the
# one that was just uploaded.
_uploaded_files = globals().get('uploaded') or {}
if MODEL_FILENAME in _uploaded_files:
    model = joblib.load(io.BytesIO(_uploaded_files[MODEL_FILENAME]))
else:
    # Nothing uploaded under this name in the current session: fall back to
    # the copy in the working directory.
    model = joblib.load(MODEL_FILENAME)

In [3]:
import pandas as pd
# A single example profile; the keys must match the feature columns the model
# was trained on.
# NOTE(review): the training pipeline in this notebook one-hot encodes with
# handle_unknown='ignore', so category values that were not present in the
# training data would not raise and would contribute nothing to the encoding.
# Confirm these example category values actually occur in the dataset.
candidate = {
    'Total_Experience': 10,
    'Total_Experience_in_field_applied': 8,
    'Department': 'Engineering',
    'Role': 'Senior Engineer',
    'Industry': 'Tech',
    'Education': 'B.Tech',
    'Graduation_Specialization': 'Computer Science',
    'Passing_Year_Of_Graduation': 2012,
    'Last_Appraisal_Rating': 'A',
    'No_Of_Companies_worked': 3,
    'Number_of_Publications': 2,
    'Certifications': 1,
    'International_degree_any': 0,
    'Current_CTC': 1500000,
}

# One-row frame whose columns follow the key order above
new_data = pd.DataFrame([candidate])

# Predict and report the expected CTC for that row
prediction = model.predict(new_data)
print(f"Predicted Expected CTC: {prediction[0]:.2f}")

Predicted Expected CTC: 1877226.96
