In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import joblib


In [2]:
# Load the persisted prediction model (a stacking regressor, going by the
# artifact's filename) from the working directory.
# joblib.load unpickles the file, which can execute arbitrary code, so only
# load artifacts that come from a trusted source.
model_path = 'Stacking_Regressor_Model.pkl'
model = joblib.load(model_path)

In [3]:
# Build a small batch of hand-written candidate profiles to predict on.
#
# The table is column-oriented: every list below holds one value per
# candidate, in candidate order (position 0 is Candidate 1, position 5 is
# Candidate 6). The key order defines the column order of the frame.
candidate_profiles = {
    # --- Experience ---
    'Total_Experience': [12, 5, 20, 3, 15, 8],
    'Total_Experience_in_field_applied': [10, 4, 18, 2, 12, 6],

    # --- Current employment ---
    'Department': ['Analytics/BI', 'Marketing', 'Banking', 'Engineering', 'HR', 'Sales'],
    'Role': ['Data scientist', 'Sales Manager', 'CEO', 'Consultant', 'Consultant', 'Team Lead'],
    'Industry': ['IT', 'FMCG', 'BFSI', 'Analytics', 'Training', 'Retail'],
    'Organization': ['A', 'E', 'M', 'P', 'J', 'K'],
    'Designation': ['Sr.Manager', 'Marketing Manager', 'Director', 'Assistant Manager', 'HR', 'Product Manager'],

    # --- Education (graduation, post-graduation, doctorate) ---
    'Education': ['Doctorate', 'PG', 'Grad', 'PG', 'PG', 'Under Grad'],
    'Graduation_Specialization': ['Statistics', 'Economics', 'Engineering', 'Chemistry', 'Psychology', 'Arts'],
    'University_Grad': ['Delhi', 'Mumbai', 'Kolkata', 'Bangalore', 'Pune', 'Jaipur'],
    'Passing_Year_Of_Graduation': [2006, 2015, 1998, 2017, 2003, 2010],
    'PG_Specialization': ['Statistics', 'Economics', 'Engineering', 'Chemistry', 'Psychology', 'Arts'],
    'University_PG': ['Mumbai', 'Mumbai', 'Mumbai', 'Bangalore', 'Pune', 'Jaipur'],
    'Passing_Year_Of_PG': [2008, 2017, 2000, 2019, 2005, 2012],
    # Only the first candidate has doctorate details; the others use the
    # string 'NA' for the text fields and NaN for the passing year.
    'PHD_Specialization': ['Statistics', 'NA', 'NA', 'NA', 'NA', 'NA'],
    'University_PHD': ['Bangalore', 'NA', 'NA', 'NA', 'NA', 'NA'],
    'Passing_Year_Of_PHD': [2012, np.nan, np.nan, np.nan, np.nan, np.nan],

    # --- Location and compensation ---
    # 'Curent_Location' is spelled this way on purpose: it is a column key
    # and is kept exactly as written.
    'Curent_Location': ['Bangalore', 'Mumbai', 'Delhi', 'Pune', 'Ahmedabad', 'Kolkata'],
    'Preferred_location': ['Bangalore', 'Pune', 'Mumbai', 'Bangalore', 'Pune', 'Mumbai'],
    'Current_CTC': [1800000, 800000, 3500000, 500000, 2000000, 1200000],
    'Inhand_Offer': ['Y', 'N', 'Y', 'N', 'Y', 'N'],

    # --- Track record ---
    'Last_Appraisal_Rating': ['Key_Performer', 'B', 'Key_Performer', 'A', 'C', 'B'],
    'No_Of_Companies_worked': [3, 2, 5, 1, 4, 3],
    'Number_of_Publications': [5, 1, 8, 2, 3, 1],
    'Certifications': [2, 0, 3, 1, 1, 0],
    'International_degree_any': [0, 0, 1, 0, 0, 0],
}
new_data = pd.DataFrame(candidate_profiles)

# One human-readable line per candidate, in the same order as the rows.
profile_summaries = (
    "Candidate 1: Senior Data Scientist with PhD (12 yrs exp)",
    "Candidate 2: Marketing Manager in FMCG (5 yrs exp)",
    "Candidate 3: Banking CEO with extensive exp (20 yrs)",
    "Candidate 4: Fresh Analytics Consultant (3 yrs exp)",
    "Candidate 5: HR Consultant in Training (15 yrs exp)",
    "Candidate 6: Sales Team Lead in Retail (8 yrs exp)",
)

print("Sample Data Points for Prediction:")
print()
for summary_line in profile_summaries:
    print(summary_line)
print()
print("=" * 60)
# Transposed so each candidate is a column and each feature a row, which is
# easier to read for a wide, short frame.
print(new_data.T)


Sample Data Points for Prediction:

Candidate 1: Senior Data Scientist with PhD (12 yrs exp)
Candidate 2: Marketing Manager in FMCG (5 yrs exp)
Candidate 3: Banking CEO with extensive exp (20 yrs)
Candidate 4: Fresh Analytics Consultant (3 yrs exp)
Candidate 5: HR Consultant in Training (15 yrs exp)
Candidate 6: Sales Team Lead in Retail (8 yrs exp)

                                                0                  1  \
Total_Experience                               12                  5   
Total_Experience_in_field_applied              10                  4   
Department                           Analytics/BI          Marketing   
Role                               Data scientist      Sales Manager   
Industry                                       IT               FMCG   
Organization                                    A                  E   
Designation                            Sr.Manager  Marketing Manager   
Education                               Doctorate                 PG   

In [4]:
# Replace every object-dtype (categorical) column of new_data with integer
# codes produced by the encoder loaded from 'labelencoder1.pkl'.
#
# NOTE(review): fit_transform RE-FITS the loaded encoder on each column of
# these few rows, so whatever it learned before being pickled is discarded and
# the codes are derived from this sample alone. Presumably the model was
# trained on codes fitted on the full training set, in which case these codes
# will not line up with them — TODO confirm against the training notebook.
# The usual remedy is to persist one fitted encoder per column at training
# time and call transform() here; that cannot be done from this notebook alone.
#
# The columns are overwritten in place; on a re-run no object-dtype columns
# remain, so the loop has nothing to do.
categorical_features = list(new_data.select_dtypes(include=['object']).columns)
encoder = joblib.load('labelencoder1.pkl')
for feature_name in categorical_features:
    encoded_column = encoder.fit_transform(new_data[feature_name])
    new_data[feature_name] = encoded_column


In [5]:
# Scale every column of new_data, one column at a time, with the scaler
# loaded from 'scaler2.pkl', then show the result.
#
# NOTE(review): fit_transform RE-FITS the scaler on each column of these few
# rows, so every value is scaled relative to this sample only (the printed
# Department column, for instance, is just the z-scores of the codes 0..5).
# Presumably the model expects features scaled with training-set statistics —
# TODO confirm against the training notebook. A proper fix needs a scaler (or
# one scaler per column) fitted at training time and transform() here; the
# single pickled object available in this notebook does not appear to allow that.
scaler = joblib.load('scaler2.pkl')

from sklearn import set_config

# Select scikit-learn's default output container for transformers.
# NOTE(review): this setting controls the output container type; it does not
# look like it influences feature-name validation — confirm whether the call
# is still needed.
set_config(transform_output="default")

for column_name in new_data.columns:
    # Double brackets keep the selection two-dimensional (a one-column frame),
    # which is the input shape the scaler works on.
    scaled_values = scaler.fit_transform(new_data[[column_name]])
    # Collapse the (n_rows, 1) result back to one dimension for assignment.
    new_data[column_name] = scaled_values.ravel()

print("Data after scaling:")
print(new_data)

Data after scaling:
   Total_Experience  Total_Experience_in_field_applied  Department      Role  \
0          0.256307                           0.248069    -1.46385  0.124035   
1         -0.939793                          -0.868243     0.87831  0.868243   
2          1.623280                           1.736486    -0.87831 -1.364382   
3         -1.281536                          -1.240347    -0.29277 -0.620174   
4          0.768922                           0.620174     0.29277 -0.620174   
5         -0.427179                          -0.496139     1.46385  1.612452   

   Industry  Organization  Designation  Education  Graduation_Specialization  \
0   0.29277      -1.46385      1.46385  -1.767767                    1.46385   
1  -0.29277      -0.87831      0.29277   0.353553                   -0.29277   
2  -0.87831       0.87831     -0.87831  -0.707107                    0.29277   
3  -1.46385       1.46385     -1.46385   0.353553                   -0.87831   
4   1.46385      -0

In [6]:
# Predict the expected CTC for every prepared candidate row with the loaded
# model and print one labelled, currency-formatted line per candidate.
predictions = model.predict(new_data)

# Display labels, one per row of new_data and in the same order as the rows.
candidates = [
    "Candidate 1: Senior Data Scientist with PhD (12 yrs exp)",
    "Candidate 2: Marketing Manager in FMCG (5 yrs exp)",
    "Candidate 3: Banking CEO with extensive exp (20 yrs)",
    "Candidate 4: Fresh Analytics Consultant (3 yrs exp)",
    "Candidate 5: HR Consultant in Training (15 yrs exp)",
    "Candidate 6: Sales Team Lead in Retail (8 yrs exp)"
]

# zip() stops at the shorter input, so a label list that is out of step with
# the data would silently drop predictions. Fail loudly instead.
if len(candidates) != len(predictions):
    raise ValueError(
        f"Got {len(predictions)} predictions but {len(candidates)} candidate labels; "
        "update the labels to match the rows of new_data."
    )

print("\n" + "="*70)
print("PREDICTED EXPECTED CTC FOR ALL CANDIDATES")
print("="*70)

for candidate, pred in zip(candidates, predictions):
    print(f"{candidate}")
    # Thousands separators and two decimals, prefixed with the rupee sign.
    print(f"   Predicted CTC: ₹{pred:,.2f}")
    print("-" * 70)

print("="*70)


PREDICTED EXPECTED CTC FOR ALL CANDIDATES
Candidate 1: Senior Data Scientist with PhD (12 yrs exp)
   Predicted CTC: ₹2,573,885.59
----------------------------------------------------------------------
Candidate 2: Marketing Manager in FMCG (5 yrs exp)
   Predicted CTC: ₹1,267,822.04
----------------------------------------------------------------------
Candidate 3: Banking CEO with extensive exp (20 yrs)
   Predicted CTC: ₹4,689,854.38
----------------------------------------------------------------------
Candidate 4: Fresh Analytics Consultant (3 yrs exp)
   Predicted CTC: ₹900,954.02
----------------------------------------------------------------------
Candidate 5: HR Consultant in Training (15 yrs exp)
   Predicted CTC: ₹2,513,753.64
----------------------------------------------------------------------
Candidate 6: Sales Team Lead in Retail (8 yrs exp)
   Predicted CTC: ₹1,713,583.40
----------------------------------------------------------------------
