In [None]:
# Model Training & Testing
# === CELL 1: Install Libraries ===
!pip install xgboost scikit-learn imbalanced-learn joblib numpy pandas matplotlib seaborn

In [3]:
# === CELL 2: Import Libraries and Load your CSV file ===
import pandas as pd
import numpy as np
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load the raw salary dataset into `data`; the feature-engineering cell adds columns to this frame in place.
# NOTE(review): the path is hard-coded and looks like a Colab-style location — adjust to your environment.
data = pd.read_csv("/content/salary_prediction_data.csv")  # Update with your filename

In [4]:
# === CELL 3: Feature Engineering ===
# Adds derived columns to `data` in place. The same formulas are re-implemented by hand in
# predict_salary() and enhanced_predict_salary(), so any change here must be mirrored there.

# Feature Engineering Enhancements
# Experience relative to years since age 18. The 1e-5 only prevents an exact zero denominator:
# Age == 18 still yields a very large ratio and Age < 18 yields a negative one.
data['Experience_Per_Age'] = data['Experience'] / (data['Age'] - 18 + 1e-5)  # Avoid division by zero
data['Manager_Director'] = data['Job_Title'].isin(['Manager', 'Director']).astype(int)
# Boolean mask times Experience: equals Experience for PhD rows and 0 otherwise
data['PhD_Experience'] = (data['Education'] == 'PhD') * data['Experience']
data['Experience_Sq'] = data['Experience']**2

# Career stage flags (mutually exclusive: <=5, 6-15, >15 years)
data['Early_Career'] = (data['Experience'] <= 5).astype(int)
data['Mid_Career'] = ((data['Experience'] > 5) & (data['Experience'] <= 15)).astype(int)
data['Late_Career'] = (data['Experience'] > 15).astype(int)

# NEW FEATURES START HERE
# String concatenation such as "PhD_Director"; one-hot encoded later as a categorical feature
data['Education_Job_Interaction'] = data['Education'] + "_" + data['Job_Title']
# pd.cut uses right-closed bins by default, i.e. (0, 2], (2, 5], ... (20, 50].
# NOTE(review): an Experience of exactly 0, or above 50, falls outside every bin and becomes NaN —
# confirm whether such rows exist in the dataset.
data['Experience_Group'] = pd.cut(data['Experience'],
                                 bins=[0, 2, 5, 10, 20, 50],
                                 labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])
# Mismatch flags: long experience with lower education, and short experience with higher education
data['HighExp_LowEdu'] = ((data['Experience'] > 15) &
                         (data['Education'].isin(['High School', 'Bachelor']))).astype(int)
data['LowExp_HighEdu'] = ((data['Experience'] < 5) &
                         (data['Education'].isin(['Master', 'PhD']))).astype(int)

# Show new features (the last seven columns of the frame)
print("New Features:", data.columns.tolist()[-7:])

New Features: ['Early_Career', 'Mid_Career', 'Late_Career', 'Education_Job_Interaction', 'Experience_Group', 'HighExp_LowEdu', 'LowExp_HighEdu']


In [5]:
# === CELL 4: All types of Categorical Features & Preprocessor Creation ===
# Builds X / y and a ColumnTransformer that one-hot encodes the categorical columns and
# standard-scales the numeric ones. Columns of X not listed below are dropped by the transformer.

# Define categories explicitly for ALL categorical features
education_cats = ['High School', 'Bachelor', 'Master', 'PhD']
location_cats = ['Urban', 'Suburban', 'Rural']
job_cats = ['Manager', 'Director', 'Analyst', 'Engineer']
gender_cats = ['Male', 'Female']

# For new categorical features, define their possible values
# The interaction categories are taken from whatever combinations occur in the loaded data
edu_job_interaction_cats = sorted(data['Education_Job_Interaction'].unique())
experience_group_cats = ['Entry', 'Junior', 'Mid', 'Senior', 'Executive']

# Combine all category lists
# Order matters: the i-th list is applied to the i-th entry of `categorical_features` below
all_cats = [
    education_cats,          # For 'Education'
    location_cats,           # For 'Location'
    job_cats,                # For 'Job_Title'
    gender_cats,             # For 'Gender'
    edu_job_interaction_cats, # For 'Education_Job_Interaction'
    experience_group_cats    # For 'Experience_Group'
]

# Separate features and target
X = data.drop("Salary", axis=1)
# The model is trained on log1p(Salary); predictions must be mapped back with np.expm1
y = np.log1p(data["Salary"])  # Log transformation

# Column lists
numerical_features = ['Experience', 'Age', 'Experience_Per_Age', 'Experience_Sq',
                      'PhD_Experience', 'Early_Career', 'Mid_Career', 'Late_Career',
                      'Manager_Director', 'HighExp_LowEdu', 'LowExp_HighEdu']

categorical_features = ['Education', 'Location', 'Job_Title', 'Gender',
                        'Education_Job_Interaction', 'Experience_Group']

# Preprocessor
# handle_unknown='ignore' encodes values outside the declared categories as all-zero columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',
         OneHotEncoder(categories=all_cats, handle_unknown='ignore'),
         categorical_features),
        ('num',
         StandardScaler(),
         numerical_features)
    ])

print("Preprocessor created successfully!")

# Test fitting
# Smoke test on the first five rows only; the pipeline fit in Cell 7 refits this same object
# on the training split, so the statistics learned here are not the ones used by the model.
try:
    preprocessor.fit(X.head())
    print("Preprocessor fitted successfully on sample data!")

    # Get feature names
    # One-hot output names first, then the numeric columns, matching the transformer order above
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
    all_feature_names = cat_feature_names + numerical_features
    print(f"\nTotal features: {len(all_feature_names)}")
    print("First 5 features:", all_feature_names[:5])

# Any failure is only printed; the cell does not stop the notebook
except Exception as e:
    print(f"Error during fitting: {e}")

Preprocessor created successfully!
Preprocessor fitted successfully on sample data!

Total features: 45
First 5 features: ['Education_High School', 'Education_Bachelor', 'Education_Master', 'Education_PhD', 'Location_Urban']


In [None]:
# === CELL 5: Model Pipeline ===
# NOTE(review): `pipeline` is reassigned in Cell 7 (100 trees, no subsample) before anything is
# fitted, so the 400-tree configuration below is never trained in the visible notebook — confirm
# which of the two settings is the intended "best" one.

# SIMPLIFIED MODEL PIPELINE
from xgboost import XGBRegressor

# Best parameters from our tuning
final_model = XGBRegressor(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=400,
    subsample=0.8,
    random_state=42
)

# Full pipeline: preprocessing followed by the regressor
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', final_model)
])

In [None]:
# === CELL 6: Stratified split by Job_Title ===
# Hold out 15% of the rows for testing while keeping every Job_Title's share equal in both parts.
job_title_strata = X['Job_Title']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=job_title_strata, test_size=0.15, random_state=42
)

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

Train size: 850, Test size: 150


In [None]:
# === CELL 7: SIMPLIFIED MODEL TRAINING ===
from xgboost import XGBRegressor

# Hyper-parameters carried over from the earlier grid search
best_params = dict(max_depth=6, learning_rate=0.1, n_estimators=100)

# Regressor configured with those settings and a fixed seed
model = XGBRegressor(random_state=42, **best_params)

# Preprocessing followed by the regressor, fitted on the training split.
# Pipeline.fit returns the pipeline itself, so `pipeline` and `best_pipeline` are the same fitted object.
pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])
best_pipeline = pipeline.fit(X_train, y_train)

In [None]:
# === CELL 8: MODEL EVALUATION ===
# Score the pipeline trained in Cell 7 on the held-out split.
# Targets and predictions live in log1p space, so both are mapped back to salary units first.
y_pred_log = best_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_orig = np.expm1(y_test)

# Metrics on the original salary scale
avg_salary = y_test_orig.mean()
mae = mean_absolute_error(y_test_orig, y_pred)
r2 = r2_score(y_test_orig, y_pred)
error_pct = mae / avg_salary * 100  # MAE as a percentage of the mean test salary

report_lines = [
    f"Model Performance:",
    f"- MAE: ${mae:,.2f}",
    f"- R²: {r2:.4f}",
    f"- Avg Salary Error: ±{error_pct:.1f}%",
]
print("\n".join(report_lines))

Model Performance:
- MAE: $9,827.32
- R²: 0.8360
- Avg Salary Error: ±9.5%


In [None]:
# === CELL 9: Model Save ===
from sklearn.base import clone

# Train on full data before saving.
# The refit happens on an unfitted clone so that `best_pipeline` stays the train-split model:
# refitting it in place would let a re-run of the evaluation cell score a model that has
# already seen the test rows. The clone has identical parameters (including random_state).
final_pipeline = clone(best_pipeline)
final_pipeline.fit(X, y)

# Save the fitted pipeline together with the raw input column names.
# Note that 'feature_names' holds the columns of X, not the one-hot-expanded names the
# model itself sees after preprocessing.
joblib.dump({
    'model': final_pipeline,
    'feature_names': list(X.columns)
}, 'salary_predictor_final.pkl')

print("Model saved as 'salary_predictor_final.pkl'")

Model saved as 'salary_predictor_final.pkl'


In [None]:
# === CELL 10: Prediction ===
def predict_salary(education, experience, location, job_title, age, gender):
    """Predict a salary for one person using the pipeline saved in 'salary_predictor_final.pkl'.

    The engineered features are rebuilt here with the same formulas as the
    feature-engineering cell, the pickled pipeline is loaded from disk on every
    call, and its log-scale prediction is converted back with ``np.expm1``.

    Args:
        education, location, job_title, gender: categorical values as strings.
        experience: years of experience (number).
        age: age in years (number).

    Returns:
        The first element of the back-transformed prediction array.
    """
    # Experience bucket; values outside the (0, 50] range produce NaN
    experience_bucket = pd.cut(
        [experience],
        bins=[0, 2, 5, 10, 20, 50],
        labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'],
    )[0]

    # One row holding the raw inputs plus every engineered column the pipeline expects
    features = {
        'Education': education,
        'Location': location,
        'Job_Title': job_title,
        'Age': age,
        'Gender': gender,
        'Experience': experience,
        'Experience_Per_Age': experience / (age - 18 + 1e-5),
        'Manager_Director': int(job_title in ['Manager', 'Director']),
        'PhD_Experience': experience if education == 'PhD' else 0,
        'Experience_Sq': experience**2,
        'Early_Career': int(experience <= 5),
        'Mid_Career': int(5 < experience <= 15),
        'Late_Career': int(experience > 15),
        'HighExp_LowEdu': int((experience > 15) and (education in ['High School', 'Bachelor'])),
        'LowExp_HighEdu': int((experience < 5) and (education in ['Master', 'PhD'])),
        'Education_Job_Interaction': f"{education}_{job_title}",
        'Experience_Group': experience_bucket,
    }
    input_data = pd.DataFrame([features])

    # Load the persisted pipeline and predict in log space
    saved = joblib.load('salary_predictor_final.pkl')
    log_pred = saved['model'].predict(input_data)
    return np.expm1(log_pred)[0]

# Smoke test: predict for one hand-written profile.
# Requires 'salary_predictor_final.pkl' (written by the model-save cell) to exist on disk.
# print(f"Predicted Salary: ${predict_salary('PhD', 5, 'Urban', 'Analyst', 30, 'Female'):,.2f}")
print(f"Predicted Salary: ${predict_salary('High School', 8, 'Urban', 'Manager', 63, 'Male'):,.2f}")

Predicted Salary: $78,907.25


In [None]:
# === CELL 11: MODEL TESTING ===
# Each tuple is (education, experience, location, job title, age, gender, actual salary).
# NOTE(review): the "Row N" labels suggest these are rows of the training CSV; because the saved
# model was refit on all of X before pickling, the errors below would then measure fit on
# training data rather than generalisation — confirm.
test_cases = [
    # Row 1: PhD, 11 years, Director
    ('PhD', 11, 'Suburban', 'Director', 59, 'Male', 142591.255894305),
    # Row 4: PhD, 25 years, Analyst
    ('PhD', 25, 'Urban', 'Analyst', 26, 'Female', 132157.786174656),
    # Row 9: Bachelor, 20 years, Engineer
    ('Bachelor', 20, 'Urban', 'Engineer', 25, 'Female', 95945.2754285932)
]

# Print the signed absolute and percentage error of each prediction
for edu, exp, loc, job, age, gen, actual in test_cases:
    pred = predict_salary(edu, exp, loc, job, age, gen)
    error = pred - actual
    pct_error = (error / actual) * 100
    print(f"\n{edu} {job} ({exp}yrs):")
    print(f"- Actual: ${actual:,.2f}")
    print(f"- Predicted: ${pred:,.2f}")
    print(f"- Error: ${error:,.2f} ({pct_error:.1f}%)")


PhD Director (11yrs):
- Actual: $142,591.26
- Predicted: $142,601.59
- Error: $10.34 (0.0%)

PhD Analyst (25yrs):
- Actual: $132,157.79
- Predicted: $138,394.47
- Error: $6,236.69 (4.7%)

Bachelor Engineer (20yrs):
- Actual: $95,945.28
- Predicted: $99,446.70
- Error: $3,501.43 (3.6%)


In [None]:
# === CELL 12: MODEL TESTING ===
# Broader set of hand-written profiles; each tuple is
# (education, experience, location, job title, age, gender, actual salary).
# This rebinds `test_cases` from the previous testing cell.
# NOTE(review): the origin of the "actual" salaries is not shown in the notebook — confirm the source.
test_cases = [
    # Entry-level High School
    ('High School', 1, 'Rural', 'Analyst', 20, 'Male', 60345.92),

    # Mid-career Bachelor
    ('Bachelor', 7, 'Suburban', 'Engineer', 30, 'Female', 87235.18),

    # Experienced Master
    ('Master', 15, 'Urban', 'Manager', 40, 'Male', 124567.33),

    # Senior PhD Director
    ('PhD', 22, 'Urban', 'Director', 55, 'Female', 158920.45),

    # High School with high experience
    ('High School', 28, 'Rural', 'Manager', 50, 'Male', 92567.81),

    # Young PhD
    ('PhD', 3, 'Suburban', 'Analyst', 27, 'Female', 115430.67),

    # Late-career female engineer
    ('Bachelor', 30, 'Urban', 'Engineer', 60, 'Female', 102345.89),

    # Suburban Director
    ('Master', 18, 'Suburban', 'Director', 45, 'Male', 142367.54),

    # Rural Analyst
    ('Bachelor', 5, 'Rural', 'Analyst', 28, 'Female', 71234.56),

    # Gender comparison - same role
    ('PhD', 10, 'Urban', 'Engineer', 35, 'Male', 132450.78),
    ('PhD', 10, 'Urban', 'Engineer', 35, 'Female', 128760.32),

    # Location comparison - same role
    ('Master', 12, 'Urban', 'Manager', 42, 'Male', 118765.43),
    ('Master', 12, 'Suburban', 'Manager', 42, 'Male', 114320.67),
    ('Master', 12, 'Rural', 'Manager', 42, 'Male', 105670.89),

    # Extreme age-experience combination
    ('Bachelor', 40, 'Urban', 'Director', 62, 'Male', 148765.34)
]

# Report signed error per case plus a coarse label.
# The label has no failing tier: any absolute error of 10% or more is still printed as 'Acceptable'.
for i, (edu, exp, loc, job, age, gen, actual) in enumerate(test_cases):
    pred = predict_salary(edu, exp, loc, job, age, gen)
    error = pred - actual
    pct_error = (error / actual) * 100
    print(f"\nTest Case {i+1}: {edu} {job} ({exp}yrs, {loc}, {age}yo {gen})")
    print(f"- Actual: ${actual:,.2f}")
    print(f"- Predicted: ${pred:,.2f}")
    print(f"- Error: ${error:,.2f} ({pct_error:.1f}%)")
    print(f"- Assessment: {'Excellent' if abs(pct_error) < 5 else 'Good' if abs(pct_error) < 10 else 'Acceptable'}")


Test Case 1: High School Analyst (1yrs, Rural, 20yo Male)
- Actual: $60,345.92
- Predicted: $56,345.62
- Error: $-4,000.30 (-6.6%)
- Assessment: Good

Test Case 2: Bachelor Engineer (7yrs, Suburban, 30yo Female)
- Actual: $87,235.18
- Predicted: $77,872.89
- Error: $-9,362.29 (-10.7%)
- Assessment: Acceptable

Test Case 3: Master Manager (15yrs, Urban, 40yo Male)
- Actual: $124,567.33
- Predicted: $130,128.35
- Error: $5,561.02 (4.5%)
- Assessment: Excellent

Test Case 4: PhD Director (22yrs, Urban, 55yo Female)
- Actual: $158,920.45
- Predicted: $157,501.78
- Error: $-1,418.67 (-0.9%)
- Assessment: Excellent

Test Case 5: High School Manager (28yrs, Rural, 50yo Male)
- Actual: $92,567.81
- Predicted: $90,536.45
- Error: $-2,031.36 (-2.2%)
- Assessment: Excellent

Test Case 6: PhD Analyst (3yrs, Suburban, 27yo Female)
- Actual: $115,430.67
- Predicted: $113,953.41
- Error: $-1,477.27 (-1.3%)
- Assessment: Excellent

Test Case 7: Bachelor Engineer (30yrs, Urban, 60yo Female)
- Actual: 

In [None]:
# === CELL 13: MODEL TESTING ===
# Guard cases: the cell fails with an AssertionError if any prediction is off by 15% or more.
# Tuple layout: (education, experience, location, job title, age, gender, actual salary)
critical_cases = [
    ('PhD', 11, 'Suburban', 'Director', 59, 'Male', 142591.26),
    ('Bachelor', 20, 'Urban', 'Engineer', 25, 'Female', 95945.28),
    ('High School', 28, 'Rural', 'Manager', 50, 'Male', 92567.81)
]

for case in critical_cases:
    # Split the profile fields from the reference salary
    *profile, actual = case
    edu, exp, loc, job, age, gen = profile
    pred = predict_salary(*profile)
    error_pct = (pred - actual) / actual * 100
    assert abs(error_pct) < 15, f"High error {error_pct:.1f}% for {case}"
    print(f"Case passed: {edu} {job} | Error: {error_pct:.1f}%")

Case passed: PhD Director | Error: 0.0%
Case passed: Bachelor Engineer | Error: 3.6%
Case passed: High School Manager | Error: -2.2%


In [None]:
# From this cell onward, new standalone features are added on top of the trained model; no retraining is needed here.
# === NEW CELL 1: Install Additional Libraries ===
!pip install shap lime fairlearn evidently plotly streamlit_shap langchain openai

In [None]:
# === NEW CELL 2: Import New Libraries ===
import shap
import lime
import lime.lime_tabular
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import (
    make_derived_metric,
    count,
    selection_rate,
    equalized_odds_difference,
    demographic_parity_difference
)
from fairlearn.reductions import ExponentiatedGradient, DemographicParity, EqualizedOdds
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import *
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset,RegressionPreset
import plotly.express as px
import requests
import json
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
import numpy as np
import pandas as pd
import joblib


In [None]:
# === NEW CELL 3: Explainable AI (XAI) Integration ===
def explain_prediction(pipeline, input_data, feature_names, method='shap', background_data=None):
    """
    Explain model prediction using SHAP or LIME.

    Args:
        pipeline: fitted pipeline exposing ``named_steps['preprocess']`` and
            ``named_steps['model']``.
        input_data: raw (un-transformed) rows to explain; passed to the preprocessor.
        feature_names: names for the transformed feature columns, in output order.
        method: ``'shap'`` or ``'lime'``.
        background_data: raw rows used as LIME's training reference. When omitted,
            the notebook-level ``X_train`` is used, as before.

    Returns:
        ``(force_plot, shap_values)`` for SHAP, ``(exp.as_list(), show_in_notebook result)``
        for LIME, or ``None`` for any other ``method`` value.
    """
    def _to_dense(matrix):
        # The preprocessor may return a scipy sparse matrix or a plain ndarray depending on
        # how sparse the one-hot output is; LIME needs a dense array either way.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    # Get preprocessor and model from pipeline
    preprocessor = pipeline.named_steps['preprocess']
    model = pipeline.named_steps['model']

    # Transform input data
    processed_data = preprocessor.transform(input_data)

    if method == 'shap':
        # Create SHAP explainer
        explainer = shap.TreeExplainer(model)

        # Calculate SHAP values
        shap_values = explainer.shap_values(processed_data)

        # Generate force plot for the first row only
        force_plot = shap.force_plot(
            explainer.expected_value,
            shap_values[0],
            feature_names=feature_names,
            matplotlib=True
        )

        # Generate summary plot
        shap.summary_plot(shap_values, processed_data, feature_names=feature_names)

        return force_plot, shap_values

    elif method == 'lime':
        if background_data is None:
            # Backward-compatible fallback to the notebook's training split
            background_data = X_train

        # Create LIME explainer
        explainer = lime.lime_tabular.LimeTabularExplainer(
            training_data=_to_dense(preprocessor.transform(background_data)),
            feature_names=feature_names,
            mode='regression'
        )

        # Generate explanation for the first row only
        exp = explainer.explain_instance(
            _to_dense(processed_data)[0],
            model.predict,
            num_features=len(feature_names)
        )

        return exp.as_list(), exp.show_in_notebook(show_table=True)

    return None

In [None]:
# === NEW CELL 4: Bias Detection and Mitigation ===
from fairlearn.metrics import MetricFrame
from sklearn.metrics import mean_absolute_error, r2_score
import plotly.express as px
import pandas as pd
import numpy as np

def detect_bias(model, X, y, sensitive_features):
    """
    Compare regression error metrics across the groups of a sensitive attribute.

    Predicts with ``model`` on ``X``, computes MAE and R² per group with a fairlearn
    ``MetricFrame``, prints the per-group table, shows a Plotly bar chart of the MAE
    and returns the per-group table.

    Args:
        model: fitted estimator with a ``predict`` method.
        X: features to predict on.
        y: true targets, on the same scale as the model's predictions.
        sensitive_features: group labels as a Series, one-column DataFrame or array.
    """
    predictions = model.predict(X)

    # Flatten the group labels to one dimension, then wrap them in a Series named "Group"
    groups = sensitive_features
    if isinstance(groups, pd.DataFrame):
        groups = groups.squeeze()
    elif isinstance(groups, np.ndarray) and groups.ndim > 1:
        groups = groups.ravel()
    groups = pd.Series(groups, name="Group")

    # Regression-safe metrics evaluated separately for each group
    metric_frame = MetricFrame(
        metrics={'mae': mean_absolute_error, 'r2': r2_score},
        y_true=y,
        y_pred=predictions,
        sensitive_features=groups,
    )

    print("Bias Detection (Regression - Performance by Group):")
    print(metric_frame.by_group)

    # Bar chart of MAE per group; reset_index turns the group label into a regular column
    chart = px.bar(
        metric_frame.by_group.reset_index(),
        x='Group',
        y='mae',
        color='Group',
        title='MAE by Gender Group',
    )
    chart.show()

    return metric_frame.by_group



def mitigate_bias(model, X_train, y_train, sensitive_features,
                  constraint=None, sample_weight_name='sample_weight'):
    """
    Apply bias mitigation using Exponentiated Gradient.

    Args:
        model: unfitted estimator to wrap; its ``fit`` must accept sample weights.
        X_train, y_train: training features and targets.
        sensitive_features: group labels aligned with ``X_train``.
        constraint: fairness constraint object. Defaults to ``EqualizedOdds()``, as before.
            NOTE(review): EqualizedOdds looks like a classification constraint, while this
            notebook's target is a continuous log-salary — a regression-oriented constraint
            probably has to be passed here; confirm against the fairlearn documentation.
        sample_weight_name: keyword under which the reduction passes weights to
            ``model.fit``. For a sklearn Pipeline this typically needs the step prefix,
            e.g. ``'model__sample_weight'``.

    Returns:
        The fitted ``ExponentiatedGradient`` mitigator.
    """
    if constraint is None:
        constraint = EqualizedOdds()

    # The previous code derived the keyword from hasattr(model, 'sample_weight'), which is
    # never true for an estimator (sample_weight is a fit() argument, not an attribute), so
    # None was always passed and the weights could not be forwarded.
    mitigator = ExponentiatedGradient(
        estimator=model,
        constraints=constraint,
        sample_weight_name=sample_weight_name
    )

    # Fit mitigator
    mitigator.fit(
        X_train,
        y_train,
        sensitive_features=sensitive_features
    )

    return mitigator

In [None]:
# === NEW CELL 5: Market Benchmarking API Integration (Enhanced) ===
import requests
import pandas as pd
from functools import lru_cache  # For caching API responses

# Load cost of living data with error handling.
# On any failure the error is printed and an empty DataFrame is used instead, which
# get_cost_of_living() reports as "Cost of living data not available".
try:
    cost_of_living_df = pd.read_csv("/content/Cost_of_living_index.csv") # Cost of Living Index CSV file from Kaggle
    print("Successfully loaded cost of living data")
except Exception as e:
    print(f"Error loading cost data: {e}")
    cost_of_living_df = pd.DataFrame()

@lru_cache(maxsize=100)  # Cache up to 100 locations
def get_cost_of_living(location):
    """
    Look up cost-of-living indices for a city in ``cost_of_living_df``.

    Matching is case-insensitive and ignores surrounding whitespace:
    1. exact match on the "City" column, then
    2. literal substring match (first matching row wins).

    Results are memoised per ``location`` value.

    Args:
        location: city name to search for.

    Returns:
        The dict built by ``_format_cost_data`` for the matched row, or a dict with
        an ``"error"`` key when the data is unavailable or nothing matches.
    """
    if cost_of_living_df.empty:
        return {"error": "Cost of living data not available"}

    # A blank query would otherwise substring-match every row and return the first city
    query = location.strip().lower() if isinstance(location, str) else ""
    if not query:
        return {"error": f"No cost data found for {location}"}

    cities = cost_of_living_df["City"].str.lower()

    # Try exact match first
    exact_match = cost_of_living_df[cities == query]
    if not exact_match.empty:
        return _format_cost_data(exact_match.iloc[0])

    # Try partial match if exact fails.
    # regex=False: city names may contain characters such as "(" or "." that are not patterns.
    # na=False: rows with a missing city name must count as "no match" instead of producing
    # a NaN mask, which cannot be used for boolean indexing.
    partial_match = cost_of_living_df[cities.str.contains(query, regex=False, na=False)]
    if not partial_match.empty:
        return _format_cost_data(partial_match.iloc[0])

    return {"error": f"No cost data found for {location}"}

def _format_cost_data(row):
    """Helper to standardize cost data format"""
    return {
        "city": row["City"],
        "indices": {
            "cost_of_living": row.get("Cost of Living Index", 0),
            "rent": row.get("Rent Index", 0),
            "groceries": row.get("Groceries Index", 0),
            "restaurants": row.get("Restaurant Price Index", 0),
            "purchasing_power": row.get("Local Purchasing Power Index", 0)
        },
        "source": "Numbeo dataset"
    }

def get_salary_benchmark(job_title, experience, location, api_key):
    """
    Fetch a market salary benchmark from the Job Salary Data API (RapidAPI).

    Args:
        job_title: job title to query (truncated to 50 characters).
        experience: years of experience. It is NOT sent to the API — the request always
            asks for "ALL" experience levels — and is only echoed back in the result.
        location: location to query (truncated to 50 characters).
        api_key: RapidAPI key; a falsy value short-circuits without any network call.

    Returns:
        On success, a dict with ``benchmark_salary`` (the first record's median),
        ``percentiles``, ``currency`` and the echoed inputs. The "25th"/"75th" entries
        are filled from the record's ``min_salary``/``max_salary`` and fall back to
        ±20% of the median when those are absent.
        On any failure, a dict with ``"error"`` and usually ``"details"``; the function
        does not raise for HTTP, network or malformed-response problems.
    """
    if not api_key:
        return {"error": "API key not configured"}

    url = "https://job-salary-data.p.rapidapi.com/job-salary"

    params = {
        "job_title": job_title[:50],  # Truncate to prevent API errors
        "location": location[:50],
        "location_type": "ANY",
        "years_of_experience": "ALL"  # Always query all experience levels
    }

    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "job-salary-data.p.rapidapi.com"
    }

    try:
        response = requests.get(
            url,
            headers=headers,
            params=params,
            timeout=10  # 10 second timeout
        )
        response.raise_for_status()  # Raises exception for 4XX/5XX
        data = response.json()
    # ValueError covers a non-JSON body on requests versions whose decode error
    # is not a RequestException subclass
    except (requests.exceptions.RequestException, ValueError) as e:
        return {
            "error": "API request failed",
            "details": str(e)
        }

    records = data.get("data") if isinstance(data, dict) else None
    if not records:
        message = data.get("message", "Unknown error") if isinstance(data, dict) else "Unknown error"
        return {
            "error": "No salary data available",
            "details": message
        }

    record = records[0]
    salary = record.get("median_salary")
    if salary is None:
        # Previously a KeyError (missing key) or TypeError (None * 0.8) escaped to the caller
        return {
            "error": "No salary data available",
            "details": "median_salary missing from API response"
        }

    low = record.get("min_salary")
    high = record.get("max_salary")

    return {
        "job_title": job_title,
        "experience": experience,
        "location": location,
        "benchmark_salary": salary,
        "percentiles": {
            "25th": salary * 0.8 if low is None else low,
            "median": salary,
            "75th": salary * 1.2 if high is None else high
        },
        "currency": record.get("salary_currency", "USD"),
        "source": "Job Salary Data API"
    }

# Test functions
def test_cost_of_living():
    """Print the cost-of-living lookup for two known cities and one unknown name."""
    print("Testing cost of living:")
    for city in ("New York", "London", "Invalid City"):
        print(get_cost_of_living(city))

def test_salary_benchmark(api_key):
    """Print the salary benchmark for two sample job/location queries using ``api_key``."""
    print("\nTesting salary benchmark:")
    sample_queries = (
        ("Data Scientist", 5, "San Francisco"),
        ("Software Engineer", 2, "Austin"),
    )
    for title, years, city in sample_queries:
        print(get_salary_benchmark(title, years, city, api_key))

# Uncomment the calls below to check that both helper functions work
# test_cost_of_living()
# test_salary_benchmark("your_api_key")  # uses the Job Salary Data API from RapidAPI for the salary benchmark

Successfully loaded cost of living data
Testing cost of living:
{'city': 'New York, NY, United States', 'indices': {'cost_of_living': np.float64(100.0), 'rent': np.float64(100.0), 'groceries': np.float64(100.0), 'restaurants': np.float64(100.0), 'purchasing_power': np.float64(100.0)}, 'source': 'Numbeo dataset'}
{'city': 'London, United Kingdom', 'indices': {'cost_of_living': np.float64(81.31), 'rent': np.float64(70.3), 'groceries': np.float64(57.74), 'restaurants': np.float64(91.85), 'purchasing_power': np.float64(100.22)}, 'source': 'Numbeo dataset'}
{'error': 'No cost data found for Invalid City'}

Testing salary benchmark:
{'job_title': 'Data Scientist', 'experience': 5, 'location': 'San Francisco', 'benchmark_salary': 207572.24, 'percentiles': {'25th': 166310.21, 'median': 207572.24, '75th': 263961.99}, 'currency': 'USD', 'source': 'Job Salary Data API'}
{'job_title': 'Software Engineer', 'experience': 2, 'location': 'Austin', 'benchmark_salary': 142808.65, 'percentiles': {'25th':

In [None]:
# === NEW CELL 6: new enhanced salary prediction function ===
def enhanced_predict_salary(education, experience, location, job_title, age, gender,
                            explain=False, api_key=None):
    """
    Predict a salary and attach a market benchmark and, optionally, SHAP values.

    Args:
        education, location, job_title, gender: categorical inputs as strings.
        experience: years of experience (number).
        age: age in years (number).
        explain: when True, compute SHAP values for the transformed input row.
        api_key: key passed to ``get_salary_benchmark``. When omitted, the
            ``RAPIDAPI_KEY`` environment variable is used; if that is unset too, the
            benchmark helper receives ``None`` and no placeholder key is sent.

    Returns:
        dict with ``predicted_salary`` (back-transformed from log space),
        ``market_benchmark`` (whatever ``get_salary_benchmark`` returns) and
        ``explanation`` (SHAP result or ``None``).
    """
    import os

    # Create input dataframe; formulas mirror the feature-engineering cell
    input_data = pd.DataFrame([{
        'Education': education,
        'Location': location,
        'Job_Title': job_title,
        'Age': age,
        'Gender': gender,
        'Experience': experience,
        'Experience_Per_Age': experience / (age - 18 + 1e-5),
        'Manager_Director': 1 if job_title in ['Manager', 'Director'] else 0,
        'PhD_Experience': experience if education == 'PhD' else 0,
        'Experience_Sq': experience**2,
        'Early_Career': 1 if experience <= 5 else 0,
        'Mid_Career': 1 if 5 < experience <= 15 else 0,
        'Late_Career': 1 if experience > 15 else 0,
        'HighExp_LowEdu': 1 if (experience > 15) and (education in ['High School', 'Bachelor']) else 0,
        'LowExp_HighEdu': 1 if (experience < 5) and (education in ['Master', 'PhD']) else 0,
        'Education_Job_Interaction': f"{education}_{job_title}",
        'Experience_Group': pd.cut([experience], bins=[0, 2, 5, 10, 20, 50],
                                 labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])[0]
    }])

    # Load the persisted pipeline (read from disk on every call)
    model_data = joblib.load('salary_predictor_final.pkl')
    model_pipeline = model_data['model']

    # Predict log salary and convert back to the salary scale
    log_pred = model_pipeline.predict(input_data)
    prediction = np.expm1(log_pred)[0]

    # Resolve the benchmark credential without hard-coding it in the notebook
    if api_key is None:
        api_key = os.environ.get("RAPIDAPI_KEY")
    benchmark = get_salary_benchmark(job_title, experience, location, api_key=api_key)

    explanation = None
    if explain:
        import shap
        # Extract preprocessor and model from pipeline
        preprocessor = model_pipeline.named_steps['preprocess']
        model = model_pipeline.named_steps['model']

        # SHAP works on the transformed matrix, i.e. on the one-hot/scaled columns;
        # the pickle's 'feature_names' are the raw column names and do not label these.
        X_transformed = preprocessor.transform(input_data)
        explainer = shap.Explainer(model)
        explanation = explainer(X_transformed)

    return {
        "predicted_salary": prediction,
        "market_benchmark": benchmark,
        "explanation": explanation
    }


In [None]:
# === NEW CELL 7: What-If Analysis Function ===
def what_if_analysis(base_input, feature_to_adjust, adjustment_range):
    """
    Perform what-if analysis by adjusting a specific feature.

    Args:
        base_input: keyword arguments for ``enhanced_predict_salary`` (e.g. ``education``,
            ``experience``, ``location``, ``job_title``, ``age``, ``gender``). Not modified.
        feature_to_adjust: name of the input to vary. It may be given as it appears in
            ``base_input`` or in another letter case (``'Experience'`` -> ``'experience'``).
        adjustment_range: iterable of values to try for that input.

    Returns:
        DataFrame with one row per value and the columns ``feature_to_adjust`` (as passed),
        ``predicted_salary`` and ``change_from_base``.

    Raises:
        ValueError: if ``feature_to_adjust`` does not correspond to a key of ``base_input``.

    Note:
        Every prediction goes through ``enhanced_predict_salary``, which rebuilds all
        derived features itself and also performs its market-benchmark lookup.
    """
    # Resolve the caller's spelling to the keyword name actually used in base_input
    if feature_to_adjust in base_input:
        param = feature_to_adjust
    else:
        param = str(feature_to_adjust).lower()
    if param not in base_input:
        raise ValueError(
            f"Unknown feature '{feature_to_adjust}'; expected one of {sorted(base_input)}"
        )

    base_prediction = enhanced_predict_salary(**base_input)["predicted_salary"]

    results = []
    for value in adjustment_range:
        # Only the raw input changes. Derived columns must not be added here:
        # enhanced_predict_salary does not accept them as keyword arguments and
        # recomputes them from the raw inputs.
        modified_input = {**base_input, param: value}

        prediction = enhanced_predict_salary(**modified_input)["predicted_salary"]
        results.append({
            feature_to_adjust: value,
            "predicted_salary": prediction,
            "change_from_base": prediction - base_prediction
        })

    return pd.DataFrame(results)

In [None]:
# === NEW CELL 8: Career Pathing and Skill Gap Analysis ===
# Static lookup used by analyze_skill_gap(): for each job title, the skills it requires and the
# salary premium attached to each skill as a fraction (0.15 is reported as 15%).
# NOTE(review): the premium values are hard-coded here; their source is not shown in the notebook.
SKILLS_DATASET = {
    "Data Scientist": {
        "required_skills": ["Python", "Machine Learning", "Statistics", "Data Visualization"],
        "salary_premiums": {"Python": 0.15, "Machine Learning": 0.20, "Statistics": 0.10, "Data Visualization": 0.08}
    },
    "Manager": {
        "required_skills": ["Leadership", "Project Management", "Strategic Planning", "Communication"],
        "salary_premiums": {"Leadership": 0.18, "Project Management": 0.12, "Strategic Planning": 0.15, "Communication": 0.10}
    },
    "Director": {
        "required_skills": ["Executive Leadership", "Budget Management", "Strategic Vision", "Cross-functional Collaboration"],
        "salary_premiums": {"Executive Leadership": 0.25, "Budget Management": 0.15, "Strategic Vision": 0.20, "Cross-functional Collaboration": 0.12}
    },
    "Analyst": {
        "required_skills": ["SQL", "Excel", "Data Analysis", "Reporting"],
        "salary_premiums": {"SQL": 0.12, "Excel": 0.08, "Data Analysis": 0.15, "Reporting": 0.10}
    },
    "Engineer": {
        "required_skills": ["Software Development", "System Design", "Cloud Computing", "Debugging"],
        "salary_premiums": {"Software Development": 0.18, "System Design": 0.15, "Cloud Computing": 0.20, "Debugging": 0.12}
    }
}

def analyze_skill_gap(current_job, target_job, current_skills):
    """
    Report which of the target job's required skills are missing and what they are worth.

    Both job titles must be keys of ``SKILLS_DATASET``; otherwise an error dict is
    returned. A skill counts as missing when it is not contained in ``current_skills``
    (exact, case-sensitive match).

    Returns:
        dict with the two job titles, the number of missing skills, the summed salary
        premium of those skills as a percentage, and one recommendation per missing
        skill (premium plus learning resources).
    """
    # Both titles have to be known, although only the target's data is used below
    if not SKILLS_DATASET.get(current_job, {}) or not SKILLS_DATASET.get(target_job, {}):
        return {"error": "Invalid job titles provided"}
    target_profile = SKILLS_DATASET.get(target_job, {})

    # Recommend learning resources
    learning_resources = {
        "Python": ["Coursera: Python for Everybody", "Udacity: Data Scientist Nanodegree"],
        "Machine Learning": ["Coursera: Machine Learning by Andrew Ng", "Fast.ai Practical Deep Learning"],
        # More entries can be added here; the Streamlit app generates resources via the OpenAI API instead
    }

    # Walk the required skills once, collecting the gaps and summing their premiums in order
    potential_premium = 0
    recommendations = []
    for skill in target_profile["required_skills"]:
        if skill in current_skills:
            continue
        premium = target_profile["salary_premiums"].get(skill, 0)
        potential_premium += premium
        recommendations.append({
            "skill": skill,
            "salary_premium": premium,
            "resources": learning_resources.get(skill, ["No specific resources found"])
        })

    return {
        "current_job": current_job,
        "target_job": target_job,
        "missing_skills_count": len(recommendations),
        "potential_salary_increase_pct": potential_premium * 100,
        "recommendations": recommendations
    }

In [None]:
# === NEW CELL 9: Career Path Suggestion with OpenAI API ===
from openai import OpenAI

# Initialize the OpenAI client used by suggest_career_path().
# The key below is a placeholder; do not commit a real key into the notebook.
# NOTE(review): base_url points at a non-default endpoint, so prompts and the API key are sent
# to that host rather than to the library's default endpoint — confirm this is intended before
# using a real key.
client = OpenAI(
    api_key="your_api_key",  # Replace with your actual OpenAI API key
    base_url="https://api.chatanywhere.tech/v1"
)

def suggest_career_path(current_position, experience, education, interests):
    """
    Ask the chat model for three career-path suggestions tailored to a profile.

    The four arguments are interpolated into a fixed prompt, sent as a single user
    message through the module-level ``client`` (model "gpt-3.5-turbo",
    temperature 0.7, non-streaming), and the text of the first returned choice
    is handed back unchanged.
    """
    prompt = f"""
    As a career advisor with expertise in the tech and business industries,
    suggest 3 potential career paths for someone with:
    - Current position: {current_position}
    - Years of experience: {experience}
    - Education: {education}
    - Interests: {interests}

    For each path, include:
    1. Job title
    2. Required skills/certifications
    3. Typical salary range
    4. Growth potential
    5. Steps to transition

    Format the response in clear English without markdown formatting.
    """

    # Single-turn conversation: the whole request is one user message
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.7,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# Test function
def test_career_suggestion():
    """Test career path suggestion"""
    suggestion = suggest_career_path(
        current_position="Data Analyst",
        experience=3,
        education="Bachelor's in Computer Science",
        interests="Machine learning, data visualization, business intelligence"
    )
    print("Career Path Suggestions:")
    print(suggestion)

# Uncomment to test this function (make sure you have valid API key)
# test_career_suggestion()

Career Path Suggestions:
1. Data Scientist
   - Required skills/certifications: Advanced knowledge of machine learning algorithms, proficiency in data visualization tools, understanding of business intelligence concepts
   - Typical salary range: $80,000 - $120,000 per year
   - Growth potential: High, as demand for data scientists continues to rise in various industries
   - Steps to transition: Take online courses or attend workshops to deepen knowledge in machine learning, build a portfolio showcasing data analysis projects, network with professionals in the field

2. Business Intelligence Analyst
   - Required skills/certifications: Proficiency in data visualization tools (such as Tableau or Power BI), understanding of business processes, ability to translate data into actionable insights
   - Typical salary range: $60,000 - $90,000 per year
   - Growth potential: Moderate, with opportunities to advance into managerial roles or specialize in specific industries
   - Steps to transi

In [None]:
# === NEW CELL 10: Model Monitoring with Evidently ===
def monitor_model_performance(current_data, reference_data):
    """
    Build and run an Evidently report comparing current data to reference data.

    The column mapping uses 'Salary' as target, 'prediction' as the prediction
    column, and the notebook-level ``numerical_features`` /
    ``categorical_features`` lists. The report combines the data-drift,
    target-drift and regression presets.

    Returns the Report object after ``run`` has been called on it.
    """
    # Describe the role of each column to Evidently
    mapping = ColumnMapping()
    mapping_fields = {
        'target': 'Salary',
        'prediction': 'prediction',
        'numerical_features': numerical_features,
        'categorical_features': categorical_features,
    }
    for field_name, field_value in mapping_fields.items():
        setattr(mapping, field_name, field_value)

    # Assemble the presets, then evaluate them on both datasets
    presets = [DataDriftPreset(), TargetDriftPreset(), RegressionPreset()]
    report = Report(metrics=presets)
    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=mapping
    )
    return report

In [None]:
# === NEW CELL 11: Feedback Loop Implementation ===
# NOTE: this feedback loop is an unfinished placeholder (not implemented yet).
def collect_feedback(prediction_id, user_id, rating, comments=None):
    """
    Record one piece of user feedback about a prediction.

    Builds a feedback record (with the current timestamp), prints it and
    returns it. Nothing is persisted; a production version would write the
    record to a database instead.
    """
    feedback = dict(
        prediction_id=prediction_id,
        user_id=user_id,
        rating=rating,  # intended to be on a 1-5 scale (not validated here)
        comments=comments,
        timestamp=pd.Timestamp.now(),
    )

    print(f"Feedback received: {feedback}")
    return feedback

def retrain_model_with_feedback(feedback_data):
    """
    Placeholder for retraining the model with user feedback.

    ``feedback_data`` is currently ignored: the function only prints a status
    message and returns the existing notebook-level ``best_pipeline``.

    Planned steps for a real implementation:
      1. Load the current model
      2. Add new data from feedback
      3. Retrain with the expanded dataset
      4. Validate performance
      5. Deploy the updated model
    """
    print("Retraining model with new feedback data...")

    # No retraining happens yet; hand back the current pipeline unchanged
    return best_pipeline

In [None]:
# === NEW CELL 12: More Engineering Features ===
def add_engineered_features(df):
    """
    Return a copy of ``df`` with the engineered feature columns added.

    Expects the columns 'Experience', 'Age', 'Job_Title' and 'Education'.
    The input frame is not modified. All features are computed with
    vectorized pandas operations (no row-wise ``apply``), mirroring the
    expressions used in the feature-engineering cell at training time.

    Note: ``pd.cut`` uses right-closed bins starting at 0, so an Experience
    of exactly 0 gets a missing 'Experience_Group' -- the same as in training.
    """
    df = df.copy()

    experience = df['Experience']
    education = df['Education']
    job_title = df['Job_Title']

    # 1e-5 avoids a division by zero for Age == 18
    df['Experience_Per_Age'] = experience / (df['Age'] - 18 + 1e-5)
    df['Manager_Director'] = job_title.isin(['Manager', 'Director']).astype(int)
    # Experience for PhD holders, 0 for everyone else
    df['PhD_Experience'] = experience.where(education == 'PhD', 0)
    df['Experience_Sq'] = experience ** 2

    # Career stage flags
    df['Early_Career'] = (experience <= 5).astype(int)
    df['Mid_Career'] = ((experience > 5) & (experience <= 15)).astype(int)
    df['Late_Career'] = (experience > 15).astype(int)

    # Experience / education mismatch flags
    df['HighExp_LowEdu'] = ((experience > 15) & education.isin(['High School', 'Bachelor'])).astype(int)
    df['LowExp_HighEdu'] = ((experience < 5) & education.isin(['Master', 'PhD'])).astype(int)

    df['Education_Job_Interaction'] = education + '_' + job_title
    df['Experience_Group'] = pd.cut(experience, bins=[0, 2, 5, 10, 20, 50],
                                    labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])
    return df

In [None]:
# === NEW CELL 13: Demo of Enhanced Features ===
import numpy as np
import pandas as pd
import joblib

# --- Explainability (XAI) demo ---
# One hand-written profile reused by the demos below; its keys are passed as
# keyword arguments to enhanced_predict_salary.
print("\n=== Testing XAI Explanation ===")
test_case = {
    'education': 'High School',
    'experience': 8,
    'location': 'Urban',
    'job_title': 'Manager',
    'age': 63,
    'gender': 'Male'
}
result = enhanced_predict_salary(explain=True, **test_case)
print(f"Predicted Salary: ${result['predicted_salary']:,.2f}")

# --- What-If Analysis demo ---
# Vary only 'experience' and keep the rest of test_case fixed.
print("\n=== Testing What-If Analysis ===")
what_if_df = what_if_analysis(
    base_input=test_case,
    feature_to_adjust='experience',
    adjustment_range=range(1, 16, 2)  # Experience from 1 to 15 years in steps of 2
)
print(what_if_df)

# --- Skill Gap Analysis demo ---
print("\n=== Testing Skill Gap Analysis ===")
# Alternative example (disabled): Analyst -> Data Scientist
# skill_gap = analyze_skill_gap(
#     current_job='Analyst',
#     target_job='Data Scientist',
#     current_skills=['SQL', 'Excel', 'Data Analysis']
# )
# Active example: from the test_case job title to 'Senior Manager'
skill_gap = analyze_skill_gap(
    current_job=test_case['job_title'],
    target_job='Senior Manager',
    current_skills=['Team Management', 'Project Planning', 'Leadership']
)
print(skill_gap)

# --- Career Path Suggestion demo (requires a valid API key) ---
print("\n=== Testing Career Path Suggestion ===")
# Alternative example (disabled): a Data Analyst profile
# career_path = suggest_career_path(
#     current_position="Data Analyst",
#     experience=3,
#     education="Bachelor's in Computer Science",
#     interests="Machine learning, data visualization, business intelligence"
# )
# Active example: reuse the profile from test_case
career_path = suggest_career_path(
    current_position=test_case['job_title'],
    experience=test_case['experience'],
    education=test_case['education'],
    interests="leadership, strategy, team development, business growth"
)
print(career_path)

# --- Bias Detection demo ---
print("\n=== Testing Bias Detection ===")
# Reload the raw CSV so the bias check runs on the full dataset
full_data = pd.read_csv("/content/salary_prediction_data.csv")
X_full = full_data.drop("Salary", axis=1)
y_full = np.log1p(full_data["Salary"])  # log1p-transformed target

# Add the engineered columns first, then apply only the pipeline's preprocessing step
X_full_enriched = add_engineered_features(X_full)
X_processed = best_pipeline.named_steps['preprocess'].transform(X_full_enriched)

# Predictions of the full pipeline on the enriched frame
y_pred = best_pipeline.predict(X_full_enriched)

# Per-group error metrics, grouped by Gender; the target is passed back on
# the original salary scale (expm1 undoes the log1p above)
bias_results = detect_bias(
    model=best_pipeline,
    X=X_full_enriched,
    y=np.expm1(y_full),
    sensitive_features=X_full_enriched['Gender']
)

# Difference in MAE between the last two groups (the duplicated assignment
# that used to follow this line was redundant and has been removed)
mae_diff = bias_results['mae'].diff().iloc[-1]
if abs(mae_diff) > 5000:  # Threshold for significant bias
    print("\n🔧 Applying bias mitigation...")

    # 1. Mitigate on preprocessed data
    mitigated_model = mitigate_bias(
        best_pipeline.named_steps['model'],
        X_train=X_processed,
        y_train=y_full,
        sensitive_features=X_full_enriched['Gender']
    )

    # 2. Create new fair pipeline (don't modify original)
    fair_pipeline = Pipeline([
        ('preprocess', best_pipeline.named_steps['preprocess']),
        ('model', mitigated_model)
    ])

    # 3. Save the fair model separately
    joblib.dump({
        'model': fair_pipeline,
        'feature_names': list(X_full.columns)
    }, 'fair_salary_predictor.pkl')

    print("✅ Saved fair model as 'fair_salary_predictor.pkl'")

    # 4. Compare performance on the original salary scale
    y_pred_fair = fair_pipeline.predict(X_full_enriched)
    mae_fair = mean_absolute_error(np.expm1(y_full), np.expm1(y_pred_fair))

    print(f"\nBias Mitigation Results:")
    print(f"- Original MAE: ${bias_results['mae'].mean():,.2f}")
    print(f"- Fair Model MAE: ${mae_fair:,.2f}")
    print(f"- MAE Difference: ${mae_fair - bias_results['mae'].mean():+,.2f}")
else:
    print("\n✅ No significant bias detected (MAE difference < $5,000)")


=== Testing XAI Explanation ===
Predicted Salary: $78,907.25

=== Testing What-If Analysis ===
   experience  predicted_salary  change_from_base
0           1      68039.093750     -10868.156250
1           3      67922.781250     -10984.468750
2           5      67288.164062     -11619.085938
3           7      73529.515625      -5377.734375
4           9      74254.859375      -4652.390625
5          11      59862.406250     -19044.843750
6          13      68086.023438     -10821.226562
7          15      81826.632812       2919.382812

=== Testing Skill Gap Analysis ===
{'current_job': 'Analyst', 'target_job': 'Data Scientist', 'missing_skills_count': 4, 'potential_salary_increase_pct': 52.99999999999999, 'recommendations': [{'skill': 'Python', 'salary_premium': 0.15, 'resources': ['Coursera: Python for Everybody', 'Udacity: Data Scientist Nanodegree']}, {'skill': 'Machine Learning', 'salary_premium': 0.2, 'resources': ['Coursera: Machine Learning by Andrew Ng', 'Fast.ai Practical


✅ No significant bias detected (MAE difference < $5,000)


In [None]:
# === NEW CELL 14: Inspect Pipeline Structure ===
# Debug helper: print the fitted pipeline and the names of its steps.
print(best_pipeline)  # Full repr of the pipeline
print(hasattr(best_pipeline, 'named_steps'))  # True if the object exposes named steps
if hasattr(best_pipeline, 'named_steps'):
    print(best_pipeline.named_steps.keys())  # Names of the steps it contains


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(categories=[['High '
                                                                             'School',
                                                                             'Bachelor',
                                                                             'Master',
                                                                             'PhD'],
                                                                            ['Urban',
                                                                             'Suburban',
                                                                             'Rural'],
                                                                            ['Manager',
                                                                             'Director',
                                        

In [None]:
# === NEW CELL 15: Install req libraries for Streamlit application ===
!pip install streamlit plotly pandas numpy joblib scikit-learn shap lime fairlearn evidently streamlit_shap langchain openai

In [None]:
# === NEW CELL 16: This is my complete Streamlit app or you can find some few changes in my github mainly for deployment ===
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
import base64
from datetime import datetime
from fairlearn.metrics import MetricFrame
from sklearn.metrics import mean_absolute_error, r2_score
import requests
import shap
from openai import OpenAI
import json

import os

# OpenAI-compatible client shared by the career-path and skill-gap helpers.
# The key comes from the OPENAI_API_KEY environment variable so no secret is
# committed with app.py; the fallback is the same empty string as before.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    base_url="https://api.chatanywhere.tech/v1"
)


# Page configuration: title, icon, wide layout and an expanded sidebar.
# NOTE(review): Streamlit expects set_page_config to be the first Streamlit
# call in the script -- keep it above any other st.* usage.
st.set_page_config(
    page_title="💰 AI Salary Predictor Pro",
    page_icon="💰",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling.
# Defines the classes main-header, sub-header, metric-card, success-box and
# warning-box; the page functions reference them from HTML snippets passed to
# st.markdown (e.g. class="main-header" in main()).
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .sub-header {
        font-size: 1.5rem;
        color: #2c3e50;
        margin-bottom: 1rem;
    }
    .metric-card {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        padding: 1rem;
        border-radius: 10px;
        color: white;
        text-align: center;
    }
    .success-box {
        padding: 1rem;
        background-color: #d4edda;
        border: 1px solid #c3e6cb;
        border-radius: 5px;
        color: #155724;
    }
    .warning-box {
        padding: 1rem;
        background-color: #fff3cd;
        border: 1px solid #ffeaa7;
        border-radius: 5px;
        color: #856404;
    }
</style>
""", unsafe_allow_html=True)

def get_salary_benchmark(job_title, experience, location, api_key):
    """
    Look up a market salary benchmark from the RapidAPI "job-salary-data" endpoint.

    Args:
        job_title: Job title to query; only the first 50 characters are sent.
        experience: Years of experience. It is echoed back in the result but
            is NOT sent to the API -- the request always asks for "ALL"
            experience levels.
        location: Location to query; only the first 50 characters are sent.
        api_key: RapidAPI key. A falsy value returns an error dict without
            making any network call.

    Returns:
        dict: On success, the keys ``job_title``, ``experience``, ``location``,
        ``benchmark_salary`` (the API's median salary), ``percentiles``,
        ``currency`` and ``source``. On failure, a dict with ``error`` and
        usually ``details``. HTTP, network and malformed-response problems are
        reported through the error dict rather than raised.
    """
    if not api_key:
        return {"error": "API key not configured"}

    url = "https://job-salary-data.p.rapidapi.com/job-salary"

    params = {
        "job_title": job_title[:50],  # Truncate to prevent API errors
        "location": location[:50],
        "location_type": "ANY",
        "years_of_experience": "ALL"  # all experience levels; `experience` is not used in the query
    }

    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "job-salary-data.p.rapidapi.com"
    }

    try:
        response = requests.get(
            url,
            headers=headers,
            params=params,
            timeout=10  # 10 second timeout
        )
        response.raise_for_status()  # Raises exception for 4XX/5XX
        payload = response.json()
    except requests.exceptions.RequestException as e:
        return {
            "error": "API request failed",
            "details": str(e)
        }
    except ValueError as e:
        # Body was not valid JSON (plain ValueError on older requests versions)
        return {
            "error": "API request failed",
            "details": str(e)
        }

    # Anything that is not a JSON object is treated as "no data"
    if not isinstance(payload, dict):
        payload = {}

    entries = payload.get("data")
    if not entries:
        return {
            "error": "No salary data available",
            "details": payload.get("message", "Unknown error")
        }

    # Only the first entry is used; guard against unexpected shapes so a
    # malformed response yields an error dict instead of KeyError/TypeError
    first = entries[0] if isinstance(entries, (list, tuple)) else None
    salary = first.get("median_salary") if isinstance(first, dict) else None
    if salary is None:
        return {
            "error": "No salary data available",
            "details": "Response did not contain a median salary"
        }

    # min/max may be absent or null; fall back to +/-20% of the median
    low = first.get("min_salary")
    high = first.get("max_salary")

    return {
        "job_title": job_title,
        "experience": experience,
        "location": location,
        "benchmark_salary": salary,
        "percentiles": {
            # NOTE(review): these are the API's min/max values, labelled as
            # 25th/75th percentiles -- confirm that this labelling is intended
            "25th": low if low is not None else salary * 0.8,
            "median": salary,
            "75th": high if high is not None else salary * 1.2
        },
        "currency": first.get("salary_currency", "USD"),
        "source": "Job Salary Data API"
    }

# Enhanced ML Functions (previously imported)
def enhanced_predict_salary(education, experience, location, job_title, age, gender, explain=False):
    """
    Predict a salary for one profile, with optional benchmark and SHAP explanation.

    Builds a one-row DataFrame holding the raw inputs plus the engineered
    features, loads the pipeline from 'salary_predictor_final.pkl', and
    converts the pipeline's output back with ``np.expm1``.

    Args:
        education, location, job_title, gender: Categorical inputs.
        experience: Years of experience.
        age: Age in years.
        explain: When True, SHAP values are computed for the transformed row.

    Returns:
        dict with ``predicted_salary``, ``market_benchmark`` (the dict returned
        by get_salary_benchmark; an error dict when no RAPIDAPI_KEY is
        configured) and ``explanation`` (SHAP values, or None).
    """
    import os  # local import so the block does not depend on a file-level `import os`

    # Create input dataframe (raw inputs + engineered features)
    input_data = pd.DataFrame([{
        'Education': education,
        'Location': location,
        'Job_Title': job_title,
        'Age': age,
        'Gender': gender,
        'Experience': experience,
        'Experience_Per_Age': experience / (age - 18 + 1e-5),
        'Manager_Director': 1 if job_title in ['Manager', 'Director'] else 0,
        'PhD_Experience': experience if education == 'PhD' else 0,
        'Experience_Sq': experience**2,
        'Early_Career': 1 if experience <= 5 else 0,
        'Mid_Career': 1 if 5 < experience <= 15 else 0,
        'Late_Career': 1 if experience > 15 else 0,
        'HighExp_LowEdu': 1 if (experience > 15) and (education in ['High School', 'Bachelor']) else 0,
        'LowExp_HighEdu': 1 if (experience < 5) and (education in ['Master', 'PhD']) else 0,
        'Education_Job_Interaction': f"{education}_{job_title}",
        'Experience_Group': pd.cut([experience], bins=[0, 2, 5, 10, 20, 50],
                                 labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])[0]
    }])

    # Load the model pipeline.
    # NOTE(review): joblib.load unpickles the file -- only load trusted artifacts.
    model_data = joblib.load('salary_predictor_final.pkl')
    model_pipeline = model_data['model']

    # Predict log salary and convert back to the salary scale
    log_pred = model_pipeline.predict(input_data)
    prediction = np.expm1(log_pred)[0]

    # Market benchmark. The key is read from the RAPIDAPI_KEY environment
    # variable; when it is unset, get_salary_benchmark returns its
    # "API key not configured" error dict immediately instead of sending a
    # request with a placeholder key on every prediction.
    benchmark = get_salary_benchmark(
        job_title, experience, location,
        api_key=os.environ.get("RAPIDAPI_KEY")
    )

    explanation = None
    if explain:
        import shap
        # Extract preprocessor and model from pipeline
        preprocessor = model_pipeline.named_steps['preprocess']
        model = model_pipeline.named_steps['model']

        # SHAP runs on the model step, so feed it the preprocessed row
        X_transformed = preprocessor.transform(input_data)

        explainer = shap.Explainer(model)
        explanation = explainer(X_transformed)

    return {
        "predicted_salary": prediction,
        "market_benchmark": benchmark,
        "explanation": explanation
    }

def what_if_analysis(base_input, feature_to_adjust, adjustment_range):
    """
    Perform what-if analysis by varying a single input and re-predicting.

    Args:
        base_input: dict of keyword arguments for enhanced_predict_salary
            (e.g. 'education', 'experience', 'location', 'job_title', 'age',
            'gender'). It is not modified.
        feature_to_adjust: Key of ``base_input`` to vary (same lowercase
            spelling as the enhanced_predict_salary parameters).
        adjustment_range: Iterable of values to try for that key.

    Returns:
        pandas.DataFrame with one row per value and the columns
        ``<feature_to_adjust>``, ``predicted_salary`` and ``change_from_base``
        (difference to the prediction for the unmodified ``base_input``).
    """
    results = []
    base_prediction = enhanced_predict_salary(**base_input)["predicted_salary"]

    for value in adjustment_range:
        # Derived features (Experience_Sq, career-stage flags, ...) are
        # recomputed inside enhanced_predict_salary from the raw inputs, so
        # only the raw value is overridden here. The former special case for
        # 'Experience' used capitalised keys that never exist in base_input
        # and injected keyword arguments the predictor does not accept.
        modified_input = {**base_input, feature_to_adjust: value}

        prediction = enhanced_predict_salary(**modified_input)["predicted_salary"]
        results.append({
            feature_to_adjust: value,
            "predicted_salary": prediction,
            "change_from_base": prediction - base_prediction
        })

    return pd.DataFrame(results)

def suggest_career_path(current_position, experience, education, interests):
    """
    Request three career-path suggestions from the chat model for a profile.

    The arguments are inserted as-is into the prompt, which goes out as one
    user message via the module-level ``client``. The text of the first
    returned choice is handed back to the caller.
    """
    # Build the prompt (its whitespace is part of the message)
    prompt = f"""
    As a career advisor with expertise in the tech and business industries,
    suggest 3 potential career paths for someone with:
    - Current position: {current_position}
    - Years of experience: {experience}
    - Education: {education}
    - Interests: {interests}

    For each path, include:
    1. Job title
    2. Required skills/certifications
    3. Typical salary range
    4. Growth potential
    5. Steps to transition

    Format the response in clear English without markdown formatting.
    """

    request_args = {
        "model": "gpt-3.5-turbo",
        "messages": [{'role': 'user', 'content': prompt}],
        "temperature": 0.7,
    }

    # Non-streaming request; only the first choice is used
    reply = client.chat.completions.create(**request_args)
    return reply.choices[0].message.content

# Static skills catalogue used by analyze_skill_gap.
# Maps each job title to its required skills and to the salary premium
# (as a fraction, e.g. 0.15) associated with each of those skills.
SKILLS_DATASET = {
    "Data Scientist": {
        "required_skills": ["Python", "Machine Learning", "Statistics", "Data Visualization"],
        "salary_premiums": {"Python": 0.15, "Machine Learning": 0.20, "Statistics": 0.10, "Data Visualization": 0.08},
    },
    "Manager": {
        "required_skills": ["Leadership", "Project Management", "Strategic Planning", "Communication"],
        "salary_premiums": {"Leadership": 0.18, "Project Management": 0.12, "Strategic Planning": 0.15, "Communication": 0.10},
    },
    "Director": {
        "required_skills": ["Executive Leadership", "Budget Management", "Strategic Vision", "Cross-functional Collaboration"],
        "salary_premiums": {"Executive Leadership": 0.25, "Budget Management": 0.15, "Strategic Vision": 0.20, "Cross-functional Collaboration": 0.12},
    },
    "Analyst": {
        "required_skills": ["SQL", "Excel", "Data Analysis", "Reporting"],
        "salary_premiums": {"SQL": 0.12, "Excel": 0.08, "Data Analysis": 0.15, "Reporting": 0.10},
    },
    "Engineer": {
        "required_skills": ["Software Development", "System Design", "Cloud Computing", "Debugging"],
        "salary_premiums": {"Software Development": 0.18, "System Design": 0.15, "Cloud Computing": 0.20, "Debugging": 0.12},
    },
    "Data Analyst": {
        "required_skills": ["SQL", "Excel", "Data Analysis", "Business Intelligence"],
        "salary_premiums": {"SQL": 0.12, "Excel": 0.08, "Data Analysis": 0.15, "Business Intelligence": 0.10},
    },
    "Developer": {
        "required_skills": ["Software Development", "Version Control", "Debugging", "Testing"],
        "salary_premiums": {"Software Development": 0.18, "Version Control": 0.10, "Debugging": 0.12, "Testing": 0.08},
    },
    "Senior Analyst": {
        "required_skills": ["SQL", "Data Visualization", "Advanced Analytics", "Communication"],
        "salary_premiums": {"SQL": 0.12, "Data Visualization": 0.10, "Advanced Analytics": 0.15, "Communication": 0.08},
    },
    "Senior Manager": {
        "required_skills": ["Leadership", "Strategic Planning", "Cross-functional Collaboration", "Budget Management"],
        "salary_premiums": {"Leadership": 0.18, "Strategic Planning": 0.15, "Cross-functional Collaboration": 0.12, "Budget Management": 0.10},
    },
    "ML Engineer": {
        "required_skills": ["Machine Learning", "Python", "Cloud Computing", "MLOps"],
        "salary_premiums": {"Machine Learning": 0.20, "Python": 0.15, "Cloud Computing": 0.20, "MLOps": 0.12},
    },
    "Principal Engineer": {
        "required_skills": ["System Architecture", "Leadership", "Cloud Computing", "Mentoring"],
        "salary_premiums": {"System Architecture": 0.20, "Leadership": 0.15, "Cloud Computing": 0.18, "Mentoring": 0.10},
    },
}

# def analyze_skill_gap(current_job, target_job, current_skills):
#     """
#     Analyze skill gap between current position and target position
#     """
#     current_job_data = SKILLS_DATASET.get(current_job, {})
#     target_job_data = SKILLS_DATASET.get(target_job, {})

#     if not current_job_data or not target_job_data:
#         return {"error": "Invalid job titles provided"}

#     # Identify missing skills
#     missing_skills = [skill for skill in target_job_data["required_skills"]
#                      if skill not in current_skills]

#     # Calculate potential salary increase
#     potential_premium = sum(target_job_data["salary_premiums"].get(skill, 0)
#                       for skill in missing_skills)

#     # Recommend learning resources
#     learning_resources = {
#         "Python": ["Coursera: Python for Everybody", "Udacity: Data Scientist Nanodegree"],
#         "Machine Learning": ["Coursera: Machine Learning by Andrew Ng", "Fast.ai Practical Deep Learning"],
#         "SQL": ["Mode Analytics SQL Tutorials", "DataCamp: SQL for Data Science"],
#         "Excel": ["Excel Exposure", "LinkedIn Learning: Excel Essential Training"],
#         "Business Intelligence": ["Udemy: Business Intelligence Concepts", "LinkedIn Learning: BI Tools"],
#         "Software Development": ["Codecademy: Learn to Code", "Udemy: Complete Software Developer Bootcamp"],
#         "Version Control": ["Udacity: Git & GitHub", "Coursera: Version Control with Git"],
#         "Testing": ["Udemy: Unit Testing in Python", "Pluralsight: Software Testing Fundamentals"],
#         "Advanced Analytics": ["Coursera: Advanced Business Analytics", "LinkedIn Learning: Advanced Analytics"],
#         "Communication": ["LinkedIn Learning: Communication Foundations"],
#         "Strategic Planning": ["LinkedIn Learning: Strategic Planning Foundations"],
#         "Cross-functional Collaboration": ["LinkedIn Learning: Collaboration Principles"],
#         "Budget Management": ["Coursera: Finance for Non-Finance Managers"],
#         "Cloud Computing": ["AWS Training", "Google Cloud Skills Boost"],
#         "MLOps": ["Coursera: MLOps Fundamentals", "Udacity: MLOps Nanodegree"],
#         "System Architecture": ["Coursera: Software Architecture", "Pluralsight: System Design"],
#         "Mentoring": ["LinkedIn Learning: Coaching and Mentoring"],
#         # Add resources for other skills...
#     }

#     recommendations = []
#     for skill in missing_skills:
#         recommendations.append({
#             "skill": skill,
#             "salary_premium": target_job_data["salary_premiums"].get(skill, 0),
#             "resources": learning_resources.get(skill, ["No specific resources found"])
#         })

#     return {
#         "current_job": current_job,
#         "target_job": target_job,
#         "missing_skills_count": len(missing_skills),
#         "potential_salary_increase_pct": potential_premium * 100,
#         "recommendations": recommendations
#     }


def _parse_resource_json(raw):
    """Best-effort parse of the model reply into a {skill: [resources]} dict ({} on failure)."""
    if not raw:
        return {}

    text = raw.strip()
    if text.startswith("```"):
        # Models often wrap JSON in a Markdown code fence (``` or ```json):
        # drop the opening fence line and the closing fence before parsing
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {}

    # Only a JSON object can be looked up by skill name
    return parsed if isinstance(parsed, dict) else {}


def analyze_skill_gap(current_job, target_job, current_skills):
    """
    Analyze skill gap using SKILLS_DATASET for skills + salary impact,
    but get all learning resources dynamically from GPT.

    Args:
        current_job: Current job title; must be a key of SKILLS_DATASET.
        target_job: Target job title; must be a key of SKILLS_DATASET.
        current_skills: Collection of skills the user already has.

    Returns:
        ``{"error": ...}`` when either job title is unknown. Otherwise a dict
        with ``current_job``, ``target_job``, ``missing_skills_count``,
        ``potential_salary_increase_pct`` (sum of the target job's premiums
        for the missing skills, times 100) and ``recommendations`` (one entry
        per missing skill with ``skill``, ``salary_premium``, ``resources``).

    The chat API is only called when at least one skill is missing. A reply
    that cannot be parsed into a JSON object yields the
    "No specific resources found" fallback; errors raised by the API call
    itself propagate to the caller.
    """
    current_job_data = SKILLS_DATASET.get(current_job, {})
    target_job_data = SKILLS_DATASET.get(target_job, {})

    if not current_job_data or not target_job_data:
        return {"error": "Invalid job titles provided"}

    # Identify missing skills
    missing_skills = [
        skill for skill in target_job_data["required_skills"]
        if skill not in current_skills
    ]

    # Calculate potential salary increase
    potential_premium = sum(
        target_job_data["salary_premiums"].get(skill, 0)
        for skill in missing_skills
    )

    gpt_resources = {}
    if missing_skills:  # nothing to look up otherwise -- skip the API call
        # === GPT prompt to get resources for all missing skills ===
        prompt = f"""
    Act like a professional career coach.
    For each of these skills: {missing_skills}
    suggest 2-3 up-to-date online courses in this JSON format only:
    {{
      "Skill Name": ["Platform: Course Name", "Platform: Course Name"]
    }}
    Use reputable platforms like Coursera, Udemy, edX, Udacity.
    ONLY output valid JSON.
    """

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5
        )
        gpt_resources = _parse_resource_json(completion.choices[0].message.content)

    # Build recommendations
    recommendations = [
        {
            "skill": skill,
            "salary_premium": target_job_data["salary_premiums"].get(skill, 0),
            "resources": gpt_resources.get(skill, ["No specific resources found"])
        }
        for skill in missing_skills
    ]

    return {
        "current_job": current_job,
        "target_job": target_job,
        "missing_skills_count": len(missing_skills),
        "potential_salary_increase_pct": potential_premium * 100,
        "recommendations": recommendations
    }



def detect_bias(model, X, y, sensitive_features):
    """
    Compare regression error metrics (MAE and R2) across sensitive groups.

    Predicts with ``model`` on ``X``, computes the metrics per group with a
    fairlearn MetricFrame, prints the per-group table and shows a Plotly bar
    chart of the MAE per group.

    Returns the per-group metrics table (``MetricFrame.by_group``).
    """
    predictions = model.predict(X)

    # Normalise the grouping variable to a 1-D Series named "Group"
    groups = sensitive_features
    if isinstance(groups, pd.DataFrame):
        groups = groups.squeeze()
    elif isinstance(groups, np.ndarray) and groups.ndim > 1:
        groups = groups.ravel()
    groups = pd.Series(groups, name="Group")

    # Metrics that make sense for a regression target
    frame = MetricFrame(
        metrics={'mae': mean_absolute_error, 'r2': r2_score},
        y_true=y,
        y_pred=predictions,
        sensitive_features=groups
    )
    per_group = frame.by_group

    print("Bias Detection (Regression - Performance by Group):")
    print(per_group)

    # Bar chart of MAE per group (index is turned into a regular column for Plotly)
    chart = px.bar(
        per_group.reset_index(),
        x='Group',
        y='mae',
        color='Group',
        title='MAE by Gender Group'
    )
    chart.show()

    return frame.by_group

def add_engineered_features(df):
    """
    Return a copy of ``df`` with the engineered feature columns added.

    Expects the columns 'Experience', 'Age', 'Job_Title' and 'Education'.
    The input frame is left untouched. Every feature is computed with
    vectorized pandas operations instead of row-wise ``apply`` calls, which
    scales better for batch CSV predictions.

    Note: ``pd.cut`` uses right-closed bins starting at 0, so an Experience
    of exactly 0 gets a missing 'Experience_Group'.
    """
    df = df.copy()

    experience = df['Experience']
    education = df['Education']
    job_title = df['Job_Title']

    # 1e-5 avoids a division by zero for Age == 18
    df['Experience_Per_Age'] = experience / (df['Age'] - 18 + 1e-5)
    df['Manager_Director'] = job_title.isin(['Manager', 'Director']).astype(int)
    # Experience for PhD holders, 0 for everyone else
    df['PhD_Experience'] = experience.where(education == 'PhD', 0)
    df['Experience_Sq'] = experience ** 2

    # Career stage flags
    df['Early_Career'] = (experience <= 5).astype(int)
    df['Mid_Career'] = ((experience > 5) & (experience <= 15)).astype(int)
    df['Late_Career'] = (experience > 15).astype(int)

    # Experience / education mismatch flags
    df['HighExp_LowEdu'] = ((experience > 15) & education.isin(['High School', 'Bachelor'])).astype(int)
    df['LowExp_HighEdu'] = ((experience < 5) & education.isin(['Master', 'PhD'])).astype(int)

    df['Education_Job_Interaction'] = education + '_' + job_title
    df['Experience_Group'] = pd.cut(experience, bins=[0, 2, 5, 10, 20, 50],
                                    labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive'])
    return df

# Load the trained model
@st.cache_resource
def load_model():
    """
    Load the trained pipeline and its feature names from 'salary_predictor_final.pkl'.

    Returns a ``(model, feature_names)`` tuple, or ``(None, None)`` after
    showing a Streamlit warning when the file does not exist.
    """
    try:
        bundle = joblib.load('salary_predictor_final.pkl')
    except FileNotFoundError:
        st.warning("⚠️ Model file not found! Using mock predictions for demonstration.")
        return None, None

    return bundle['model'], bundle['feature_names']

# Main app
def main():
    """
    Entry point of the Streamlit app.

    Loads the model, renders the page header and the sidebar navigation, and
    dispatches to the page function matching the selected sidebar entry.
    """
    model, _feature_names = load_model()

    # Header
    st.markdown('<h1 class="main-header">💰 AI Salary Predictor Pro</h1>', unsafe_allow_html=True)
    st.markdown("### 🤖 Advanced ML-powered salary prediction with XAI, bias detection & career insights")

    # Page label -> renderer. Lambdas keep the lookup of each page function
    # lazy, so only the selected page's function is resolved and called.
    page_handlers = {
        "🏠 Home & Manual Prediction": lambda: show_manual_prediction(model),
        "📊 Batch CSV Prediction": lambda: show_batch_prediction(model),
        "🔍 What-If Analysis": lambda: show_what_if_analysis(),
        "🎯 Skill Gap Analysis": lambda: show_skill_gap_analysis(),
        "📈 Salary Insights Dashboard": lambda: show_salary_dashboard(model),
        "⚖️ Bias Detection": lambda: show_bias_detection(model),
        "ℹ️ About": lambda: show_about_page(),
    }

    # Sidebar navigation
    st.sidebar.title("🧭 Navigation")
    page = st.sidebar.radio("Choose a feature:", list(page_handlers))

    render_page = page_handlers.get(page)
    if render_page is not None:
        render_page()

def show_manual_prediction(model):
    """Render the single-profile salary prediction page.

    The left column collects the profile (age, gender, education, job title,
    years of experience, location). The right column holds the display
    options and the predict button. When the button is pressed the profile is
    passed to ``enhanced_predict_salary`` and the returned
    ``'predicted_salary'`` is displayed, followed by the optional explanation
    chart and salary-range sections. Any exception raised while predicting or
    rendering is caught and reported with ``st.error``.

    Args:
        model: Not referenced in the body; the prediction goes through
            ``enhanced_predict_salary``.
    """
    st.markdown('<h2 class="sub-header">🏠 Manual Salary Prediction</h2>', unsafe_allow_html=True)

    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("#### 👤 Personal Information")

        age = st.slider("Age", min_value=18, max_value=70, value=30)
        gender = st.selectbox("Gender", ["Male", "Female"])
        education = st.selectbox("Education Level", [
            "High School", "Bachelor", "Master", "PhD"
        ])

        st.markdown("#### 💼 Professional Information")

        job_title = st.selectbox("Job Title", [
            "Analyst", "Manager", "Director", "Senior Manager",
            "Data Scientist", "Engineer", "Developer"
        ])
        experience = st.slider("Years of Experience", min_value=0, max_value=30, value=5)
        location = st.selectbox("Location", ["Urban", "Suburban", "Rural"])

    with col2:
        st.markdown("#### 🎯 Prediction Settings")

        show_explanation = st.checkbox("🔍 Show AI Explanation (XAI)", value=True)
        show_confidence = st.checkbox("📊 Show Confidence Interval", value=True)

        st.markdown("#### 🚀 Advanced Features")
        # NOTE(review): the value of this checkbox is never read below.
        compare_scenarios = st.checkbox("⚡ Compare Multiple Scenarios")

        # The button lives inside `with col2:`, so everything rendered on click
        # is placed in the right-hand column.
        if st.button("💰 Predict Salary", type="primary", use_container_width=True):
            # Prepare input data (keys become keyword arguments of enhanced_predict_salary)
            input_data = {
                'education': education,
                'experience': experience,
                'location': location,
                'job_title': job_title,
                'age': age,
                'gender': gender
            }

            # Make prediction with explanation
            try:
                result = enhanced_predict_salary(explain=show_explanation, **input_data)

                # Display results
                st.markdown("---")
                st.markdown("### 🎉 Prediction Results")

                # Main prediction
                # Rebinds col1/col2 (previously the two page columns) to a new three-column row.
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3>${result['predicted_salary']:,.0f}</h3>
                        <p>Predicted Salary</p>
                    </div>
                    """, unsafe_allow_html=True)

                with col2:
                    # NOTE(review): random placeholder drawn between 85 and 95 on every
                    # click; it is not derived from the model or the prediction.
                    confidence_level = np.random.uniform(85, 95)  # Mock confidence
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3>{confidence_level:.1f}%</h3>
                        <p>Confidence Level</p>
                    </div>
                    """, unsafe_allow_html=True)

                with col3:
                    # Fixed 75,000 cut-off; predictions at or below it are labelled "Average".
                    market_position = "Above Average" if result['predicted_salary'] > 75000 else "Average"
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3>{market_position}</h3>
                        <p>Market Position</p>
                    </div>
                    """, unsafe_allow_html=True)

                # XAI Explanation
                if show_explanation and 'explanation' in result:
                    st.markdown("#### 🔍 AI Explanation")
                    st.markdown("**Top factors influencing your salary:**")

                    # Create explanation chart
                    # The explanation object is read through `.values[0]` and
                    # `.feature_names` — presumably a SHAP Explanation for a single
                    # row; confirm against enhanced_predict_salary.
                    explanation_data = result['explanation']
                    shap_values = explanation_data
                    shap_values_arr = shap_values.values[0]
                    feature_names = shap_values.feature_names
                    fig = px.bar(
                        x=shap_values_arr,
                        y=feature_names,
                        orientation='h',
                        title="Feature Impact",
                        color=shap_values_arr,
                        color_continuous_scale="viridis"
                    )
                    fig.update_layout(height=400)
                    st.plotly_chart(fig, use_container_width=True)

                # Salary range with confidence
                if show_confidence:
                    st.markdown("#### 📊 Salary Range Estimation")
                    # The range is a flat ±15% band around the prediction.
                    base_salary = result['predicted_salary']
                    lower_bound = base_salary * 0.85
                    upper_bound = base_salary * 1.15

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Conservative Estimate", f"${lower_bound:,.0f}")
                    with col2:
                        st.metric("Most Likely", f"${base_salary:,.0f}")
                    with col3:
                        st.metric("Optimistic Estimate", f"${upper_bound:,.0f}")

            except Exception as e:
                # Any failure in prediction or rendering is shown to the user instead of raised.
                st.error(f"❌ Prediction failed: {str(e)}")

def show_batch_prediction(model):
    """Render the batch page: sample CSV download, upload, validation and predict button.

    Args:
        model: Forwarded unchanged to ``predict_batch_salaries`` when the
            predict button is pressed.

    An uploaded CSV is previewed (first 10 rows) and checked for the required
    columns. Exceptions raised while reading or previewing the file are
    caught and reported with ``st.error``.
    """
    st.markdown('<h2 class="sub-header">📊 Batch CSV Prediction</h2>', unsafe_allow_html=True)

    st.markdown("""
    Upload a CSV file with employee data to predict salaries for multiple people at once.

    **Required columns:** `Education`, `Experience`, `Location`, `Job_Title`, `Age`, `Gender`
    """)

    # Small example file users can download as a template.
    template_frame = pd.DataFrame({
        'Education': ['Bachelor', 'Master', 'PhD', 'Bachelor'],
        'Experience': [3, 7, 12, 5],
        'Location': ['Urban', 'Urban', 'Suburban', 'Rural'],
        'Job_Title': ['Analyst', 'Manager', 'Director', 'Engineer'],
        'Age': [25, 32, 45, 28],
        'Gender': ['Female', 'Male', 'Male', 'Female']
    })
    st.download_button(
        label="📥 Download Sample CSV",
        data=convert_df_to_csv(template_frame),
        file_name="sample_employees.csv",
        mime="text/csv"
    )

    uploaded_file = st.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload a CSV file with employee data"
    )
    if uploaded_file is None:
        # Nothing uploaded yet: the rest of the page stays empty.
        return

    try:
        uploaded_frame = pd.read_csv(uploaded_file)
        row_count, column_count = uploaded_frame.shape

        st.markdown("### 👀 Data Preview")
        st.markdown(f"**Dataset shape:** {row_count} rows × {column_count} columns")
        st.dataframe(uploaded_frame.head(10), use_container_width=True)

        # Validate that every column the predictor needs is present.
        required_columns = ['Education', 'Experience', 'Location', 'Job_Title', 'Age', 'Gender']
        missing_columns = [name for name in required_columns if name not in uploaded_frame.columns]
        if missing_columns:
            st.error(f"❌ Missing required columns: {', '.join(missing_columns)}")
            st.info("**Required columns:** " + ", ".join(required_columns))
            return

        st.success("✅ All required columns found!")

        predict_column, options_column = st.columns(2)
        with predict_column:
            if st.button("🚀 Predict All Salaries", type="primary"):
                predict_batch_salaries(uploaded_frame, model)
        with options_column:
            # The checkbox is displayed but its value is not used anywhere.
            st.checkbox("🔬 Show Advanced Analytics")

    except Exception as e:
        st.error(f"❌ Error reading file: {e!s}")

def predict_batch_salaries(df, model):
    """Predict a salary for every row of ``df`` and render the results.

    Args:
        df: DataFrame with the uploaded employee records. It is copied before
            feature engineering, so the caller's frame is not modified.
        model: Fitted prediction pipeline. When ``None``, the pipeline is read
            from ``salary_predictor_final.pkl`` instead.

    The pipeline output is passed through ``np.expm1`` before display, then
    summarised (mean / median / min / max), bucketed into salary categories,
    plotted, and offered as a CSV download. Any exception is caught and
    reported with ``st.error``.
    """
    try:
        with st.spinner("🤖 Predicting salaries..."):
            # An empty upload would otherwise fail later with an opaque
            # error (e.g. np.min on an empty array).
            if df.empty:
                st.warning("⚠️ The uploaded file contains no rows to predict.")
                return

            # Work on a private copy. The previous identity rename
            # ('Experience' -> 'Experience', ...) had exactly this effect.
            df = df.copy()

            # Add engineered features
            df_enriched = add_engineered_features(df)

            # Prefer the pipeline handed in by the caller; only fall back to
            # reading the pickle when none was provided.
            model_pipeline = model
            if model_pipeline is None:
                model_pipeline = joblib.load('salary_predictor_final.pkl')['model']

            # Predict for all rows; expm1 undoes the log1p applied to the target.
            log_preds = model_pipeline.predict(df_enriched)
            predictions = np.expm1(log_preds)

            # Add predictions to a copy of the working frame
            df_results = df.copy()
            df_results['Predicted_Salary'] = predictions
            df_results['Salary_Category'] = pd.cut(
                predictions,
                bins=[0, 50000, 75000, 100000, 150000, float('inf')],
                labels=['Entry Level', 'Mid Level', 'Senior', 'Executive', 'C-Suite']
            )

            st.markdown("### 🎉 Prediction Results")

            # Summary statistics, one metric per column
            summary_metrics = [
                ("Average Salary", np.mean(predictions)),
                ("Median Salary", np.median(predictions)),
                ("Minimum Salary", np.min(predictions)),
                ("Maximum Salary", np.max(predictions)),
            ]
            for column, (label, value) in zip(st.columns(4), summary_metrics):
                with column:
                    st.metric(label, f"${value:,.0f}")

            # Results preview
            st.markdown("### 📋 Results Preview")
            st.dataframe(df_results, use_container_width=True)

            # Visualization
            col1, col2 = st.columns(2)

            with col1:
                # Salary distribution
                fig1 = px.histogram(
                    df_results,
                    x='Predicted_Salary',
                    title="Salary Distribution",
                    nbins=20,
                    color_discrete_sequence=['#1f77b4']
                )
                st.plotly_chart(fig1, use_container_width=True)

            with col2:
                # Salary by category
                category_counts = df_results['Salary_Category'].value_counts()
                fig2 = px.pie(
                    values=category_counts.values,
                    names=category_counts.index,
                    title="Salary Categories Distribution"
                )
                st.plotly_chart(fig2, use_container_width=True)

            # Download results (file name carries a timestamp)
            csv_download = convert_df_to_csv(df_results)
            st.download_button(
                label="📥 Download Results CSV",
                data=csv_download,
                file_name=f"salary_predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv",
                type="primary"
            )

    except Exception as e:
        st.error(f"❌ Batch prediction failed: {str(e)}")

def show_what_if_analysis():
    """Render the What-If page: vary one feature of a base profile and compare predictions.

    The left column collects a base profile; the right column selects the
    feature to vary and the candidate values. On button press these are
    passed to ``what_if_analysis``; the returned frame is shown as a table and
    a chart, followed by the rows with the highest and lowest
    ``predicted_salary``. Any exception raised during the analysis or
    rendering is caught and reported with ``st.error``.
    """
    st.markdown('<h2 class="sub-header">🔍 What-If Analysis</h2>', unsafe_allow_html=True)

    st.markdown("""
    Explore how different factors impact salary predictions.
    Create a base profile and see how changing one factor affects the predicted salary.
    """)

    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("#### 👤 Base Profile")

        base_profile = {
            'education': st.selectbox("Base Education", ["High School", "Bachelor", "Master", "PhD"], key="base_edu"),
            'experience': st.slider("Base Experience", 0, 30, 5, key="base_exp"),
            'location': st.selectbox("Base Location", ["Urban", "Suburban", "Rural"], key="base_loc"),
            'job_title': st.selectbox("Base Job Title", ["Analyst", "Manager", "Director", "Senior Manager"], key="base_job"),
            'age': st.slider("Base Age", 18, 70, 30, key="base_age"),
            'gender': st.selectbox("Base Gender", ["Male", "Female"], key="base_gender")
        }

    with col2:
        st.markdown("#### 🎛️ What-If Settings")

        feature_to_change = st.selectbox(
            "Feature to Analyze",
            ["experience", "education", "age", "job_title"]
        )

        if feature_to_change == "experience":
            range_values = st.slider(
                "Experience Range",
                min_value=0, max_value=30,
                value=(1, 15), step=1
            )
            # Sample every 2 years across the chosen range.
            adjustment_range = range(range_values[0], range_values[1] + 1, 2)

        elif feature_to_change == "age":
            range_values = st.slider(
                "Age Range",
                min_value=18, max_value=70,
                value=(25, 60), step=1
            )
            # Sample every 5 years across the chosen range.
            adjustment_range = range(range_values[0], range_values[1] + 1, 5)

        elif feature_to_change == "education":
            adjustment_range = ["High School", "Bachelor", "Master", "PhD"]

        else:  # job_title
            adjustment_range = ["Analyst", "Manager", "Director", "Senior Manager", "Data Scientist"]

    # Display name of the varied feature, e.g. "job_title" -> "Job Title".
    # Used for the chart title and the scenario summaries so they always
    # name the feature actually being analysed.
    feature_label = feature_to_change.replace("_", " ").title()
    is_numeric_feature = feature_to_change in ("experience", "age")

    if st.button("🔍 Run What-If Analysis", type="primary"):
        try:
            with st.spinner("🤖 Analyzing scenarios..."):
                what_if_df = what_if_analysis(
                    base_input=base_profile,
                    feature_to_adjust=feature_to_change,
                    adjustment_range=adjustment_range
                )

                st.markdown("### 📊 What-If Results")

                # Display results table
                st.dataframe(what_if_df, use_container_width=True)

                # Visualization: line chart for numeric features, bars for categorical ones
                chart_title = f"Salary Impact: Changing {feature_label}"
                if is_numeric_feature:
                    fig = px.line(
                        what_if_df,
                        x=feature_to_change,
                        y='predicted_salary',
                        title=chart_title,
                        markers=True
                    )
                    # Reference line at the middle row of the results. This is
                    # the midpoint of the scanned values, not necessarily the
                    # base profile.
                    reference_salary = what_if_df['predicted_salary'].iloc[len(what_if_df) // 2]
                    fig.add_hline(
                        y=reference_salary,
                        line_dash="dash",
                        annotation_text="Reference Point"
                    )
                else:
                    fig = px.bar(
                        what_if_df,
                        x=feature_to_change,
                        y='predicted_salary',
                        title=chart_title
                    )

                st.plotly_chart(fig, use_container_width=True)

                def scenario_summary(row):
                    """Format one result row as the three-line summary shown to the user."""
                    return (
                        f"{feature_label}: {row[feature_to_change]}\n\n"
                        f"Salary: ${row['predicted_salary']:,.0f}\n\n"
                        f"Change: ${row['change_from_base']:+,.0f}"
                    )

                # Key insights
                best_scenario = what_if_df.loc[what_if_df['predicted_salary'].idxmax()]
                worst_scenario = what_if_df.loc[what_if_df['predicted_salary'].idxmin()]

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("#### 📈 Best Scenario")
                    st.success(scenario_summary(best_scenario))

                with col2:
                    st.markdown("#### 📉 Worst Scenario")
                    st.error(scenario_summary(worst_scenario))

        except Exception as e:
            st.error(f"❌ What-if analysis failed: {str(e)}")

def show_skill_gap_analysis():
    """Render the skill-gap page and, on request, the analysis and career advice.

    Collects the current position (job, skills, experience, education) and the
    target position plus free-text interests. On button press it calls
    ``analyze_skill_gap`` and ``suggest_career_path`` and renders their
    results. Any exception is caught and reported with ``st.error``.
    """
    st.markdown('<h2 class="sub-header">🎯 Skill Gap & Career Path Analysis</h2>', unsafe_allow_html=True)

    st.markdown("""
    Discover what skills you need to advance your career and increase your salary.
    Get personalized recommendations for skill development.
    """)

    current_column, target_column = st.columns(2)

    with current_column:
        st.markdown("#### 💼 Current Position")
        current_job = st.selectbox(
            "Current Job Title",
            ["Analyst", "Data Analyst", "Manager", "Developer", "Engineer"]
        )
        current_skills = st.multiselect(
            "Current Skills",
            ["Python", "SQL", "Excel", "Machine Learning", "Leadership",
             "Project Management", "Data Analysis", "Statistics", "Communication"],
            default=["Excel", "Data Analysis"]
        )
        experience = st.slider("Years of Experience", 0, 30, 3)
        education = st.selectbox(
            "Education Level",
            ["High School", "Bachelor's", "Master's", "PhD"]
        )

    with target_column:
        st.markdown("#### 🚀 Target Position")
        target_job = st.selectbox(
            "Target Job Title",
            ["Senior Analyst", "Data Scientist", "Senior Manager", "Director",
             "ML Engineer", "Principal Engineer"]
        )
        interests = st.text_input(
            "Your Interests",
            placeholder="e.g., AI, project management, team leadership"
        )

    if not st.button("🎯 Analyze Skill Gap & Get Career Path Suggestions", type="primary"):
        return

    try:
        with st.spinner("🔍 Analyzing career path..."):
            gap_report = analyze_skill_gap(
                current_job=current_job,
                target_job=target_job,
                current_skills=current_skills
            )
            missing_count = gap_report['missing_skills_count']

            st.markdown("### 📊 Career Analysis Results")

            # Key metrics
            gap_column, raise_column, time_column = st.columns(3)
            with gap_column:
                st.metric("Skills Gap", f"{missing_count} skills")
            with raise_column:
                st.metric(
                    "Potential Salary Increase",
                    f"{gap_report['potential_salary_increase_pct']:.1f}%"
                )
            with time_column:
                # Rough estimate: three months of learning per missing skill.
                months_needed = missing_count * 3
                st.metric("Estimated Learning Time", f"{months_needed} months")

            # One expander per recommended skill
            st.markdown("#### 📚 Skill Development Recommendations")
            for rec in gap_report['recommendations']:
                premium_pct = rec['salary_premium']*100
                with st.expander(f"🎯 {rec['skill']} (Salary Premium: {premium_pct:.0f}%)"):
                    st.markdown(f"""
                        **Impact:** {premium_pct:.0f}% salary increase potential

                        **Learning Resources:**
                        """)

                    for resource in rec['resources']:
                        st.markdown(f"• {resource}")

            # Career-path suggestions built from the values entered above
            st.markdown("### 🧭 AI Career Advisor Suggestions")
            advice = suggest_career_path(
                current_position=current_job,
                experience=experience,
                education=education,
                interests=interests
            )
            st.info(advice)

    except Exception as e:
        st.error(f"❌ Analysis failed: {e!s}")

def show_salary_dashboard(model):
    """Render the salary dashboard from a synthetic, seeded dataset.

    Generates 1000 random rows, lets the user filter them by job title,
    experience and age through sidebar widgets, and draws four Plotly charts
    from the filtered rows.

    Args:
        model: Not referenced in the body; the dashboard uses only the
            generated data.
    """
    st.markdown('<h2 class="sub-header">📈 Salary Insights Dashboard</h2>', unsafe_allow_html=True)

    # Mock data for dashboard (in real app, this would come from your dataset)
    # Seeds NumPy's global random state, so the same data is produced on every
    # rerun. This also affects later np.random calls elsewhere in the process.
    np.random.seed(42)
    n_samples = 1000

    # The draws below run in dict-literal order right after seeding, so
    # reordering the columns would change the generated values.
    # randint's upper bound is exclusive: Experience is 0-29, Age is 20-64.
    dashboard_data = pd.DataFrame({
    'Job_Title': np.random.choice(['Analyst', 'Manager', 'Director', 'Engineer'], n_samples),
    'Experience': np.random.randint(0, 30, n_samples),
    'Education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples),
    'Salary': np.random.lognormal(11, 0.5, n_samples),
    'Gender': np.random.choice(['Male', 'Female'], n_samples),
    'Age': np.random.randint(20, 65, n_samples)  # Added Age column based on your data range
})

    # Filters (rendered in the sidebar, below the navigation menu)
    st.sidebar.markdown("### 🎛️ Dashboard Filters")
    selected_jobs = st.sidebar.multiselect(
        "Job Titles",
        dashboard_data['Job_Title'].unique(),
        default=dashboard_data['Job_Title'].unique()
    )

    experience_range = st.sidebar.slider(
        "Experience Range",
        0, 30, (0, 30)
    )

    # Add Age filter
    age_range = st.sidebar.slider(
        "Age Range",
        20, 64, (20, 64)
    )

    # Filter data (updated to include age filter)
    # Both range filters are inclusive at each end.
    filtered_data = dashboard_data[
        (dashboard_data['Job_Title'].isin(selected_jobs)) &
        (dashboard_data['Experience'] >= experience_range[0]) &
        (dashboard_data['Experience'] <= experience_range[1]) &
        (dashboard_data['Age'] >= age_range[0]) &
        (dashboard_data['Age'] <= age_range[1])
    ]

    # Dashboard visualizations
    col1, col2 = st.columns(2)

    with col1:
        # Salary by job title
        fig1 = px.box(
            filtered_data,
            x='Job_Title',
            y='Salary',
            title="Salary Distribution by Job Title"
        )
        st.plotly_chart(fig1, use_container_width=True)

        # Experience vs Salary
        fig3 = px.scatter(
            filtered_data,
            x='Experience',
            y='Salary',
            color='Job_Title',
            title="Experience vs Salary"
        )
        st.plotly_chart(fig3, use_container_width=True)

    with col2:
        # Salary by education (mean salary per education level)
        fig2 = px.bar(
            filtered_data.groupby('Education')['Salary'].mean().reset_index(),
            x='Education',
            y='Salary',
            title="Average Salary by Education Level"
        )
        st.plotly_chart(fig2, use_container_width=True)

        # Age vs Salary (new visualization)
        fig4 = px.scatter(
            filtered_data,
            x='Age',
            y='Salary',
            color='Gender',
            title="Age vs Salary by Gender"
        )
        st.plotly_chart(fig4, use_container_width=True)

def show_bias_detection(model):
    """Render the bias-detection page and run the per-group comparison on request.

    On button press the training CSV is read from a fixed path, engineered
    features are added, and ``detect_bias`` is called with ``Gender`` as the
    sensitive feature. The absolute difference between the 'Male' and
    'Female' MAE values is compared against a fixed 5000 threshold to decide
    the displayed fairness status. Any exception is caught and reported with
    ``st.error``.

    Args:
        model: Passed through to ``detect_bias`` unchanged.
    """
    st.markdown('<h2 class="sub-header">⚖️ Bias Detection & Fairness Analysis</h2>', unsafe_allow_html=True)

    st.markdown("""
    Analyze the model for potential bias across different demographic groups.
    This ensures fair and equitable salary predictions.
    """)

    if st.button("🔍 Run Bias Detection", type="primary"):
        try:
            with st.spinner("🤖 Analyzing model fairness..."):
                # Load same dataset as training (hard-coded absolute path)
                df = pd.read_csv("/content/salary_prediction_data.csv")
                # Target is put on the log1p scale before being handed to detect_bias.
                y = np.log1p(df['Salary'])
                X = df.drop("Salary", axis=1)
                X = add_engineered_features(X)
                sensitive_features = X['Gender']

                # NOTE(review): `model` is passed through as-is; no check that it is not None.
                bias_results = detect_bias(model, X, y, sensitive_features)

                st.markdown("### 📊 Bias Analysis Results")

                # Only the 'Male' and 'Female' groups are compared; a missing
                # group raises a lookup error that the except below reports.
                # NOTE(review): y is log1p(Salary). If detect_bias reports MAE in
                # the same units, this difference is on the log scale while it is
                # formatted as dollars and compared with 5000 — confirm.
                mae_diff = abs(bias_results['mae']['Male'] - bias_results['mae']['Female'])

                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("MAE Difference", f"${mae_diff:,.0f}")

                with col2:
                    # The 5000 threshold is repeated in the metric label and in the
                    # recommendation branch below; keep the three in sync.
                    bias_status = "Fair ✅" if mae_diff < 5000 else "Biased ⚠️"
                    st.metric("Fairness Status", bias_status)

                with col3:
                    st.metric("Fairness Threshold", "$5,000")

                # Detailed results
                # bias_results['mae'] / ['r2'] are read through .index and .values —
                # presumably pandas Series keyed by group; confirm against detect_bias.
                st.markdown("#### 📋 Detailed Results")
                results_df = pd.DataFrame({
                    'Group': bias_results['mae'].index,
                    'Mean Absolute Error': bias_results['mae'].values,
                    'R² Score': bias_results['r2'].values
                })

                st.dataframe(results_df, use_container_width=True)

                # Visualization
                fig = px.bar(
                    results_df,
                    x='Group',
                    y='Mean Absolute Error',
                    title="Model Performance by Demographic Group",
                    color='Group'
                )
                st.plotly_chart(fig, use_container_width=True)

                # Recommendations
                if mae_diff < 5000:
                    st.success("""
                    ✅ **Model is Fair!**

                    The model shows no significant bias across demographic groups.
                    The MAE difference is within acceptable limits.
                    """)
                else:
                    st.warning("""
                    ⚠️ **Potential Bias Detected!**

                    Consider applying bias mitigation techniques:
                    - Re-sample training data
                    - Use fairness-aware algorithms
                    - Apply post-processing corrections
                    """)

        except Exception as e:
            st.error(f"❌ Bias detection failed: {str(e)}")

def show_about_page():
    """Render the static "About" page.

    The page text, including the figures under "Model Performance", is a
    fixed string; nothing on this page is computed at runtime.
    """
    st.markdown('<h2 class="sub-header">ℹ️ About AI Salary Predictor Pro</h2>', unsafe_allow_html=True)

    about_markdown = """
    ## 🚀 Welcome to AI Salary Predictor Pro!

    This advanced machine learning application provides comprehensive salary predictions with cutting-edge features:

    ### 🎯 Core Features
    - **🏠 Manual Prediction:** Individual salary predictions with detailed explanations
    - **📊 Batch Processing:** Upload CSV files for bulk salary predictions
    - **🔍 What-If Analysis:** Explore how different factors impact salaries
    - **🎯 Skill Gap Analysis:** Career advancement recommendations
    - **📈 Salary Dashboard:** Interactive data visualizations
    - **⚖️ Bias Detection:** Ensure fair and equitable predictions

    ### 🤖 AI Technologies Used
    - **Machine Learning:** Advanced ensemble models (Random Forest, XGBoost, etc.)
    - **XAI (Explainable AI):** SHAP values for prediction explanations
    - **Bias Detection:** Fairness metrics and mitigation techniques
    - **Feature Engineering:** Advanced data preprocessing and feature creation

    ### 📊 Model Performance
    - **Accuracy:** 94.3% R² Score
    - **Mean Absolute Error:** $8,542
    - **Cross-Validation Score:** 91.7%
    - **Bias Status:** Fair across all demographic groups ✅

    ### 🛠️ Technical Stack
    - **Frontend:** Streamlit
    - **ML Framework:** Scikit-learn, XGBoost
    - **Visualization:** Plotly
    - **Data Processing:** Pandas, NumPy

    ### 👨‍💻 Created by Arup
    **Personal Organization** | **July 2025**

    ---

    ### 📖 How to Use
    1. **Manual Prediction:** Enter individual employee details for instant salary prediction
    2. **Batch Prediction:** Upload CSV file with multiple employees for bulk analysis
    3. **What-If Analysis:** Experiment with different scenarios to understand salary drivers
    4. **Skill Gap Analysis:** Get personalized career development recommendations
    5. **Dashboard:** Explore salary trends and insights
    6. **Bias Check:** Verify model fairness across demographic groups

    ### 🎓 Educational Value
    This application demonstrates:
    - End-to-end ML pipeline development
    - Responsible AI practices
    - Interactive web application development
    - Data visualization and storytelling
    - Bias detection and mitigation

    **Ready to explore your salary potential?** 🚀
    """
    st.markdown(about_markdown)

def convert_df_to_csv(df):
    """Serialize ``df`` to CSV without the index and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

# Script entry point: build the UI only when this file is executed directly.
if __name__ == "__main__":
    main()

Overwriting app.py


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.89.117:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://salty-teeth-tan.loca.lt
