<a href="https://colab.research.google.com/github/sarayutallady/Employee-Salary-Prediction/blob/main/Employee_Salary_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install pandas numpy scikit-learn matplotlib seaborn plotly
!pip install xgboost lightgbm streamlit
!pip install plotly-dash wordcloud feature-engine

[31mERROR: Could not find a version that satisfies the requirement plotly-dash (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for plotly-dash[0m[31m
[0m

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb

# Set styling
# Order matters: the matplotlib style sheet is loaded first, then seaborn's
# palette call replaces the color cycle that the style sheet installed.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Seed NumPy's global RNG so the stochastic steps in later cells are repeatable.
np.random.seed(42)

print("🎯 Libraries imported successfully!")


🎯 Libraries imported successfully!


In [10]:
def create_enhanced_salary_dataset(n_samples=12000, random_state=42):
    """Create a synthetic employee salary dataset.

    Every feature is drawn at random. The salary is the sum of a fixed base
    and additive components (experience, education, job title, job level,
    company size, location, industry, skills, performance, certifications)
    plus Gaussian noise, clipped to [40,000, 400,000] and truncated to int.
    The "multiplier" tables below scale a per-component dollar amount that is
    then *added*; they do not multiply the salary as a whole.

    Parameters
    ----------
    n_samples : int, default 12000
        Number of employee records to generate.
    random_state : int or None, default 42
        Seed handed to ``np.random.seed``. The default reproduces the dataset
        this function has always generated. Pass another integer for a
        different but repeatable dataset, or ``None`` to reseed from system
        entropy.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with the columns Age, Gender, Education_Level,
        Years_Experience, Job_Title, Job_Level, Company_Size, Location,
        Industry, Technical_Skills, Communication_Skills, Performance_Score,
        Certifications and Salary.

    Notes
    -----
    NumPy's *global* random state is reseeded as a side effect, and the draws
    below must stay in this order for a given seed to yield the same data.
    """

    np.random.seed(random_state)

    def lookup(table, keys):
        """Translate every label in `keys` through the dict `table` into a float array."""
        return np.array([table[key] for key in keys], dtype=float)

    # Demographics
    age = np.random.normal(35, 10, n_samples).astype(int)
    age = np.clip(age, 22, 65)

    gender = np.random.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4])

    # Education levels
    education_levels = ['Bachelor', 'Master', 'PhD', 'Professional']
    education = np.random.choice(education_levels, n_samples, p=[0.45, 0.35, 0.15, 0.05])

    # Experience: years since age 22, minus a random gap, never negative
    years_experience = np.maximum(0, age - 22 - np.random.normal(2, 2, n_samples)).astype(int)
    years_experience = np.clip(years_experience, 0, 35)

    # Job categories
    job_titles = ['Data Scientist', 'Software Engineer', 'Product Manager', 'Marketing Manager',
                  'DevOps Engineer', 'Business Analyst', 'UX Designer', 'Sales Manager',
                  'Financial Analyst', 'HR Manager', 'Operations Manager', 'Consultant']
    job_title = np.random.choice(job_titles, n_samples)

    # Job levels based on experience.
    # searchsorted(side='left') returns the first bound that is >= the value,
    # i.e. <=2 -> Junior, <=5 -> Mid-level, <=10 -> Senior, <=15 -> Lead, else Director.
    level_names = np.array(['Junior', 'Mid-level', 'Senior', 'Lead', 'Director'])
    level_upper_bounds = [2, 5, 10, 15]
    job_level = level_names[np.searchsorted(level_upper_bounds, years_experience, side='left')]

    # Company details
    company_sizes = ['Small', 'Medium', 'Large', 'Enterprise']
    company_size = np.random.choice(company_sizes, n_samples, p=[0.25, 0.35, 0.25, 0.15])

    # Locations
    locations = ['San Francisco', 'New York', 'Seattle', 'Austin', 'Boston', 'Chicago', 'Denver', 'Atlanta']
    location = np.random.choice(locations, n_samples)

    # Industries
    industries = ['Technology', 'Finance', 'Healthcare', 'Consulting', 'Manufacturing', 'Media']
    industry = np.random.choice(industries, n_samples)

    # Skills and performance
    technical_skills = np.random.normal(7, 2, n_samples)
    technical_skills = np.clip(technical_skills, 1, 10).round(1)

    communication_skills = np.random.normal(7.5, 1.5, n_samples)
    communication_skills = np.clip(communication_skills, 1, 10).round(1)

    performance_score = np.random.choice([3, 4, 5], n_samples, p=[0.2, 0.6, 0.2])

    certifications = np.random.poisson(2, n_samples)
    certifications = np.clip(certifications, 0, 8)

    # Calculate salary with realistic relationships
    base_salary = 50000

    # Experience impact: super-linear (power 1.8) during the first five years,
    # then a flat 3,000 per additional year on top of the five-year amount.
    exp_bonus = np.where(years_experience <= 5,
                        years_experience * 4000 + (years_experience ** 1.8) * 500,
                        5 * 4000 + (5 ** 1.8) * 500 + (years_experience - 5) * 3000)

    # Education multiplier
    edu_multiplier = {'Bachelor': 1.0, 'Master': 1.2, 'PhD': 1.4, 'Professional': 1.3}
    edu_bonus = lookup(edu_multiplier, education) * 20000

    # Job title impact
    job_multiplier = {
        'Data Scientist': 1.4, 'Software Engineer': 1.3, 'DevOps Engineer': 1.25,
        'Product Manager': 1.3, 'Consultant': 1.2, 'UX Designer': 1.15,
        'Business Analyst': 1.1, 'Financial Analyst': 1.15, 'Marketing Manager': 1.1,
        'Sales Manager': 1.2, 'HR Manager': 1.0, 'Operations Manager': 1.05
    }
    job_bonus = lookup(job_multiplier, job_title) * 25000

    # Level impact
    level_multiplier = {'Junior': 1.0, 'Mid-level': 1.4, 'Senior': 1.8, 'Lead': 2.3, 'Director': 2.8}
    level_bonus = lookup(level_multiplier, job_level) * 15000

    # Company size impact
    size_multiplier = {'Small': 0.9, 'Medium': 1.0, 'Large': 1.2, 'Enterprise': 1.35}
    size_bonus = lookup(size_multiplier, company_size) * 18000

    # Location cost of living
    location_multiplier = {
        'San Francisco': 1.5, 'New York': 1.4, 'Seattle': 1.25, 'Boston': 1.2,
        'Austin': 1.1, 'Denver': 1.05, 'Chicago': 1.0, 'Atlanta': 0.95
    }
    location_bonus = lookup(location_multiplier, location) * 20000

    # Industry impact
    industry_multiplier = {'Technology': 1.3, 'Finance': 1.25, 'Consulting': 1.2,
                          'Healthcare': 1.1, 'Media': 1.05, 'Manufacturing': 1.0}
    industry_bonus = lookup(industry_multiplier, industry) * 12000

    # Skills and performance impact
    skills_bonus = (technical_skills + communication_skills) * 1500
    performance_bonus = (performance_score - 3) * 8000
    cert_bonus = certifications * 2500

    # Calculate final salary
    salary = (base_salary + exp_bonus + edu_bonus + job_bonus + level_bonus +
              size_bonus + location_bonus + industry_bonus + skills_bonus +
              performance_bonus + cert_bonus)

    # Add noise, then clamp to the supported range and truncate to whole dollars
    salary = salary + np.random.normal(0, 8000, n_samples)
    salary = np.clip(salary, 40000, 400000)
    salary = salary.astype(int)

    # Create DataFrame
    df = pd.DataFrame({
        'Age': age,
        'Gender': gender,
        'Education_Level': education,
        'Years_Experience': years_experience,
        'Job_Title': job_title,
        'Job_Level': job_level,
        'Company_Size': company_size,
        'Location': location,
        'Industry': industry,
        'Technical_Skills': technical_skills,
        'Communication_Skills': communication_skills,
        'Performance_Score': performance_score,
        'Certifications': certifications,
        'Salary': salary
    })

    return df

# Create dataset
# Builds the module-level `df` that the analysis, plotting and modelling cells
# below read from.
print("🔄 Creating enhanced salary dataset...")
df = create_enhanced_salary_dataset(12000)
print(f"✅ Dataset created! Shape: {df.shape}")
print("\n📊 Dataset Preview:")
print(df.head())



🔄 Creating enhanced salary dataset...
✅ Dataset created! Shape: (12000, 14)

📊 Dataset Preview:
   Age  Gender Education_Level  Years_Experience          Job_Title Job_Level  \
0   39    Male             PhD                17  Marketing Manager  Director   
1   33    Male        Bachelor                10  Software Engineer    Senior   
2   41    Male          Master                15   Business Analyst      Lead   
3   50    Male        Bachelor                22         HR Manager  Director   
4   32  Female        Bachelor                 7  Financial Analyst    Senior   

  Company_Size       Location    Industry  Technical_Skills  \
0        Large         Denver       Media               5.6   
1        Small  San Francisco  Healthcare               8.2   
2        Small       New York     Finance               2.1   
3        Small         Austin  Consulting               6.1   
4   Enterprise       New York     Finance               7.7   

   Communication_Skills  Performance_S

In [11]:

# Headline summary of the generated dataset: size, salary statistics and a
# missing-value count across every column.
print("📈 Dataset Analysis:")
print("="*50)
print(f"📊 Total Records: {len(df):,}")
print(f"💰 Average Salary: ${df['Salary'].mean():,.0f}")
print(f"📈 Salary Range: ${df['Salary'].min():,} - ${df['Salary'].max():,}")
print(f"🎯 Missing Values: {df.isnull().sum().sum()}")

# Demographic averages and the raw gender counts.
print(f"\n👥 Demographics:")
print(f"• Average Age: {df['Age'].mean():.1f} years")
print(f"• Average Experience: {df['Years_Experience'].mean():.1f} years")
print(f"• Gender Distribution: {df['Gender'].value_counts().to_dict()}")

# Three most frequent job levels and industries, with their row counts.
print(f"\n🏢 Job Distribution:")
print(f"• Top Job Levels: {df['Job_Level'].value_counts().head(3).to_dict()}")
print(f"• Top Industries: {df['Industry'].value_counts().head(3).to_dict()}")

📈 Dataset Analysis:
📊 Total Records: 12,000
💰 Average Salary: $266,524
📈 Salary Range: $165,031 - $394,313
🎯 Missing Values: 0

👥 Demographics:
• Average Age: 34.9 years
• Average Experience: 10.8 years
• Gender Distribution: {'Male': 7057, 'Female': 4943}

🏢 Job Distribution:
• Top Job Levels: {'Director': 3488, 'Junior': 2771, 'Senior': 2328}
• Top Industries: {'Consulting': 2044, 'Manufacturing': 2028, 'Healthcare': 2008}


In [12]:
print("🎨 Creating Interactive Visualization Dashboard...")

# 1. Comprehensive Salary Analysis Dashboard
# Six panels on a 2x3 grid; every panel uses a single (primary) y axis.
fig_dashboard = make_subplots(
    rows=2, cols=3,
    subplot_titles=('💰 Salary Distribution', '🎯 Experience Impact', '🎓 Education Effect',
                   '📍 Location Comparison', '🏢 Industry Analysis', '⭐ Performance vs Salary'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}]]
)

# Salary distribution histogram (40 bins; no density curve is overlaid)
fig_dashboard.add_trace(
    go.Histogram(x=df['Salary'], nbinsx=40, name='Salary',
                marker_color='lightblue', opacity=0.7),
    row=1, col=1
)

# Experience vs Salary scatter
fig_dashboard.add_trace(
    go.Scatter(x=df['Years_Experience'], y=df['Salary'], mode='markers',
              marker=dict(size=4, opacity=0.6, color='green'),
              name='Experience'),
    row=1, col=2
)

# Add trend line: a degree-1 least-squares fit. A straight line is fully
# described by its two end points, so only those are sent to the figure
# instead of one (unsorted) point per row of the dataset.
trend_coeffs = np.polyfit(df['Years_Experience'], df['Salary'], 1)
trend_fn = np.poly1d(trend_coeffs)
trend_x = np.array([df['Years_Experience'].min(), df['Years_Experience'].max()])
fig_dashboard.add_trace(
    go.Scatter(x=trend_x, y=trend_fn(trend_x),
              mode='lines', line=dict(color='red', width=2),
              name='Trend', showlegend=False),
    row=1, col=2
)

# Education Level Box Plot (one box per level, in a fixed display order;
# levels that do not occur in the data are skipped)
education_order = ['Bachelor', 'Master', 'PhD', 'Professional']
for edu in education_order:
    if edu in df['Education_Level'].values:
        data = df[df['Education_Level'] == edu]['Salary']
        fig_dashboard.add_trace(
            go.Box(y=data, name=edu, showlegend=False),
            row=1, col=3
        )

# Location Analysis: mean salary per location, lowest first so the highest
# ends up at the top of the horizontal bar chart
location_avg = df.groupby('Location')['Salary'].mean().sort_values(ascending=True)
fig_dashboard.add_trace(
    go.Bar(x=location_avg.values, y=location_avg.index,
           orientation='h', marker_color='orange',
           showlegend=False),
    row=2, col=1
)

# Industry Violin Plot for the four most frequent industries
industries = df['Industry'].value_counts().head(4).index
colors_industry = ['red', 'blue', 'green', 'purple']
for industry, industry_color in zip(industries, colors_industry):
    data = df[df['Industry'] == industry]['Salary']
    fig_dashboard.add_trace(
        go.Violin(y=data, name=industry,
                 line_color=industry_color,
                 showlegend=False),
        row=2, col=2
    )

# Performance Score Impact: mean salary per performance score
perf_avg = df.groupby('Performance_Score')['Salary'].mean()
fig_dashboard.add_trace(
    go.Scatter(x=perf_avg.index, y=perf_avg.values,
              mode='markers+lines', marker=dict(size=10),
              line=dict(width=3), marker_color='purple',
              showlegend=False),
    row=2, col=3
)

fig_dashboard.update_layout(
    height=800,
    title_text="🎨 Interactive Salary Analysis Dashboard",
    showlegend=False
)
fig_dashboard.show()

# 2. 3D Salary Landscape
print("🌟 Creating 3D Salary Visualization...")

# Plot a subset to keep the 3D figure responsive.
# random_state pins the subset so re-running the cell redraws the same points;
# min() keeps the cell working when the dataset has fewer than 2000 rows
# (DataFrame.sample raises if asked for more rows than exist).
sample_df = df.sample(n=min(2000, len(df)), random_state=42)

fig_3d = go.Figure(data=[go.Scatter3d(
    x=sample_df['Years_Experience'],
    y=sample_df['Age'],
    z=sample_df['Salary'],
    mode='markers',
    marker=dict(
        size=5,
        # Marker color encodes a fourth variable: the technical skills rating
        color=sample_df['Technical_Skills'],
        colorscale='Viridis',
        opacity=0.8,
        colorbar=dict(title="Technical Skills"),
        line=dict(width=0.5, color='white')
    ),
    # Job title is exposed to the hover template as %{text}
    text=sample_df['Job_Title'],
    hovertemplate="<b>%{text}</b><br>" +
                  "Experience: %{x} years<br>" +
                  "Age: %{y}<br>" +
                  "Salary: $%{z:,.0f}<br>" +
                  "<extra></extra>"
)])

fig_3d.update_layout(
    title="🚀 3D Salary Landscape: Experience × Age × Salary",
    scene=dict(
        xaxis_title="Years of Experience",
        yaxis_title="Age",
        zaxis_title="Salary ($)",
        camera=dict(eye=dict(x=1.2, y=1.2, z=1.2))
    ),
    width=900,
    height=700
)
fig_3d.show()

# 3. Creative Radar Chart for Skills Analysis
print("📡 Creating Skills Radar Chart...")

# Average profile of the six job titles with the highest mean salary
top_jobs = df.groupby('Job_Title')['Salary'].mean().nlargest(6).index
profile_columns = ['Technical_Skills', 'Communication_Skills', 'Certifications',
                   'Performance_Score', 'Years_Experience', 'Salary']
skills_analysis = (
    df[df['Job_Title'].isin(top_jobs)]
    .groupby('Job_Title')
    .agg({column: 'mean' for column in profile_columns})
    .round(2)
)

# One entry per radar axis: (axis label, source column, scale factor).
# The factors bring every axis onto the 0-10 radial range used below:
# certifications and performance are doubled, experience is halved.
radar_axes = [
    ('Technical Skills', 'Technical_Skills', 1),
    ('Communication', 'Communication_Skills', 1),
    ('Certifications', 'Certifications', 2),
    ('Performance', 'Performance_Score', 2),
    ('Experience', 'Years_Experience', 0.5),
]
axis_labels = [label for label, _, _ in radar_axes]

fig_radar = go.Figure()

for job in skills_analysis.index:
    values = [skills_analysis.loc[job, column] * scale for _, column, scale in radar_axes]

    # Repeat the first point at the end so the outline closes on itself
    polygon_trace = go.Scatterpolar(
        r=[*values, values[0]],
        theta=[*axis_labels, axis_labels[0]],
        fill='toself',
        name=job,
        opacity=0.6
    )
    fig_radar.add_trace(polygon_trace)

radial_axis = dict(visible=True, range=[0, 10])
fig_radar.update_layout(
    polar=dict(radialaxis=radial_axis),
    title="📡 Skills Profile: Top-Paying Jobs",
    width=800,
    height=700
)
fig_radar.show()


🎨 Creating Interactive Visualization Dashboard...


🌟 Creating 3D Salary Visualization...


📡 Creating Skills Radar Chart...


In [13]:
print("🔧 Advanced Feature Engineering...")

# Location cost of living
col_index = {
    'San Francisco': 1.5, 'New York': 1.4, 'Seattle': 1.25, 'Boston': 1.2,
    'Austin': 1.1, 'Denver': 1.05, 'Chicago': 1.0, 'Atlanta': 0.95
}

# Create enhanced features.
# assign() works on a copy of df and evaluates the keyword arguments in order,
# so later entries may read columns created by earlier ones.
df_enhanced = df.assign(
    # Experience-based features
    Experience_Age_Ratio=lambda d: d['Years_Experience'] / d['Age'],
    Experience_Squared=lambda d: d['Years_Experience'] ** 2,
    # NOTE(review): pd.cut bins are right-closed and, by default, exclude the
    # lowest edge, so Years_Experience == 0 falls outside (0, 2] and becomes
    # NaN instead of 'Entry'. predict_salary_detailed bins the same way; if
    # this is changed (e.g. include_lowest=True) both places must change together.
    Career_Stage=lambda d: pd.cut(d['Years_Experience'],
                                  bins=[0, 2, 5, 10, 20, 50],
                                  labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive']),
    # Skills features
    Total_Skills=lambda d: d['Technical_Skills'] + d['Communication_Skills'],
    Skills_Balance=lambda d: (d['Technical_Skills'] - d['Communication_Skills']).abs(),
    # Denominator is floored at 1 so zero years of experience cannot divide by zero
    Skills_Experience_Ratio=lambda d: d['Total_Skills'] / np.maximum(d['Years_Experience'], 1),
    # Performance features
    High_Performer=lambda d: (d['Performance_Score'] >= 4).astype(int),
    Cert_Per_Year=lambda d: d['Certifications'] / np.maximum(d['Years_Experience'], 1),
    # Cost-of-living index per location and the salary normalised by it
    Cost_of_Living=lambda d: d['Location'].map(col_index),
    Salary_Adjusted=lambda d: d['Salary'] / d['Cost_of_Living'],
)

print(f"✅ Created {len(df_enhanced.columns) - len(df.columns)} new features")
print(f"📊 Total features: {len(df_enhanced.columns)}")

# CELL 7: Model Development Pipeline
# =============================================================================
print("🤖 Developing Machine Learning Models...")

# Prepare features and target
# Columns fed to the scaler. Several engineered columns created above
# (Experience_Squared, Skills_Experience_Ratio, Cert_Per_Year, Cost_of_Living,
# Salary_Adjusted) are not listed here and therefore never reach the model.
feature_cols = ['Age', 'Years_Experience', 'Technical_Skills', 'Communication_Skills',
               'Performance_Score', 'Certifications', 'Experience_Age_Ratio',
               'Total_Skills', 'Skills_Balance', 'High_Performer']

# Columns fed to the one-hot encoder.
categorical_cols = ['Gender', 'Education_Level', 'Job_Title', 'Job_Level',
                   'Company_Size', 'Location', 'Industry', 'Career_Stage']

# Combine features
X = df_enhanced[feature_cols + categorical_cols]
y = df_enhanced['Salary']

print(f"📊 Feature matrix shape: {X.shape}")
print(f"🎯 Target shape: {y.shape}")

# Preprocessing pipeline
# These are aliases: both names refer to the same list objects as above.
numerical_features = feature_cols
categorical_features = categorical_cols

# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first category of each column dropped. handle_unknown is left at
# its default, so transforming a label that was absent during fit raises.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ]
)

# Train-test split
# 80/20 split, stratified on Job_Level so both parts keep the same mix of levels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df_enhanced['Job_Level']
)

print(f"✅ Training set: {X_train.shape[0]} samples")
print(f"✅ Test set: {X_test.shape[0]} samples")

🔧 Advanced Feature Engineering...
✅ Created 10 new features
📊 Total features: 24
🤖 Developing Machine Learning Models...
📊 Feature matrix shape: (12000, 18)
🎯 Target shape: (12000,)
✅ Training set: 9600 samples
✅ Test set: 2400 samples


In [14]:
print("🚀 Training Advanced Models...")

# Define models
# Display name -> unfitted regressor. Every estimator that accepts a seed is
# given random_state=42 so repeated runs train identical models.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1)
}

# Training and evaluation
# Both dicts are keyed by the display names used in `models`.
results = {}          # name -> metrics dict returned by calculate_metrics
trained_models = {}   # name -> fitted Pipeline

def calculate_metrics(y_true, y_pred):
    """Calculate regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        ``'R²'``, ``'RMSE'`` and ``'MAE'`` as computed by scikit-learn,
        ``'Accuracy_10%'`` / ``'Accuracy_15%'`` (percentage of predictions
        whose absolute percentage error is at most 10 / 15) and ``'MAPE'``
        (mean absolute percentage error, in percent).

    Notes
    -----
    Percentage errors divide by ``y_true``, so a target of exactly zero
    yields an infinite or NaN error for that row.
    """
    # Compare by position on flat float arrays. Without this, two pandas
    # objects with different indexes would be aligned by label during the
    # subtraction below, and an (n, 1) prediction array would broadcast.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # Accuracy within different thresholds
    percentage_errors = np.abs((y_true - y_pred) / y_true) * 100
    acc_10 = (percentage_errors <= 10).mean() * 100
    acc_15 = (percentage_errors <= 15).mean() * 100

    return {
        'R²': r2,
        'RMSE': rmse,
        'MAE': mae,
        'Accuracy_10%': acc_10,
        'Accuracy_15%': acc_15,
        'MAPE': percentage_errors.mean()
    }

print("Training models...")
for name, model in models.items():
    try:
        # Create pipeline.
        # Each pipeline gets its own copy of the preprocessing step: a Pipeline
        # fits its steps in place, so passing the shared `preprocessor` object
        # would leave every stored pipeline pointing at one and the same
        # fitted transformer.
        pipeline = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('regressor', model)
        ])

        # Train
        pipeline.fit(X_train, y_train)

        # Predict on the held-out split
        y_pred_test = pipeline.predict(X_test)

        # Calculate metrics
        metrics = calculate_metrics(y_test, y_pred_test)
        results[name] = metrics
        trained_models[name] = pipeline

        print(f"✅ {name}: R² = {metrics['R²']:.4f}, RMSE = ${metrics['RMSE']:.0f}")

    except Exception as e:
        # Best effort: one failing model is reported and skipped so the
        # remaining models are still trained and compared.
        print(f"❌ {name} failed: {str(e)}")

🚀 Training Advanced Models...
Training models...
✅ Linear Regression: R² = 0.9635, RMSE = $8051
✅ Ridge Regression: R² = 0.9635, RMSE = $8052
✅ Random Forest: R² = 0.9465, RMSE = $9758
✅ Gradient Boosting: R² = 0.9578, RMSE = $8661
✅ XGBoost: R² = 0.9557, RMSE = $8876
✅ LightGBM: R² = 0.9604, RMSE = $8386


In [15]:
print("📊 Creating Model Performance Dashboard...")

# Results DataFrame: one row per model, one column per metric
results_df = pd.DataFrame(results).T.round(4)
print("\n🏆 Model Performance Summary:")
print("="*70)
print(results_df)

# Performance comparison visualization
panel_titles = ('🎯 R² Score', '💰 RMSE ($)', '✅ Accuracy (±10%)')
fig_perf = make_subplots(rows=1, cols=3, subplot_titles=panel_titles)

models_list = list(results.keys())
r2_scores = [results[m]['R²'] for m in models_list]
rmse_scores = [results[m]['RMSE'] for m in models_list]
accuracy_scores = [results[m]['Accuracy_10%'] for m in models_list]

# One bar chart per metric, left to right:
# (values, trace name, bar color, label formatter)
bar_panels = [
    (r2_scores, 'R²', 'lightblue', lambda x: f"{x:.3f}"),
    (rmse_scores, 'RMSE', 'lightcoral', lambda x: f"${x:.0f}"),
    (accuracy_scores, 'Accuracy', 'lightgreen', lambda x: f"{x:.1f}%"),
]
for panel_col, (panel_values, trace_name, bar_color, as_label) in enumerate(bar_panels, start=1):
    bar_trace = go.Bar(
        x=models_list,
        y=panel_values,
        name=trace_name,
        marker_color=bar_color,
        text=[as_label(value) for value in panel_values],
        textposition='auto',
    )
    fig_perf.add_trace(bar_trace, row=1, col=panel_col)

fig_perf.update_layout(
    height=500,
    title_text="🏆 Model Performance Comparison",
    showlegend=False
)
fig_perf.update_xaxes(tickangle=45)
fig_perf.show()

# Best model analysis
# The winner is the model with the highest R² in `results`. Those metrics were
# computed on the test split, so the reported score of the selected model is
# an optimistic estimate (the same data picked the model and scored it).
best_model_name = max(results.keys(), key=lambda x: results[x]['R²'])
best_model = trained_models[best_model_name]
best_metrics = results[best_model_name]

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"📊 R² Score: {best_metrics['R²']:.4f} ({best_metrics['R²']*100:.1f}% variance explained)")
print(f"💰 RMSE: ${best_metrics['RMSE']:.2f}")
print(f"🎯 Accuracy (±10%): {best_metrics['Accuracy_10%']:.1f}%")
print(f"📈 MAPE: {best_metrics['MAPE']:.2f}%")

📊 Creating Model Performance Dashboard...

🏆 Model Performance Summary:
                       R²       RMSE        MAE  Accuracy_10%  Accuracy_15%  \
Linear Regression  0.9635  8051.3687  6413.2610       99.6667       99.9583   
Ridge Regression   0.9635  8051.6590  6412.4332       99.6667       99.9583   
Random Forest      0.9465  9757.6569  7730.8241       98.5000       99.9167   
Gradient Boosting  0.9578  8661.0270  6846.3492       99.2083       99.9167   
XGBoost            0.9557  8876.3461  7024.3452       98.9583       99.9583   
LightGBM           0.9604  8386.3450  6639.3186       99.4167       99.9583   

                     MAPE  
Linear Regression  2.4739  
Ridge Regression   2.4735  
Random Forest      2.9772  
Gradient Boosting  2.6442  
XGBoost            2.7029  
LightGBM           2.5560  



🏆 BEST MODEL: Linear Regression
📊 R² Score: 0.9635 (96.4% variance explained)
💰 RMSE: $8051.37
🎯 Accuracy (±10%): 99.7%
📈 MAPE: 2.47%


In [17]:
# Feature importance is only reported when the selected model is one of the
# tree ensembles; for any other winner this cell produces no output.
tree_based_models = ('Random Forest', 'Gradient Boosting', 'XGBoost', 'LightGBM')
if best_model_name in tree_based_models:
    print(f"🔍 Analyzing {best_model_name} Feature Importance...")

    # Get feature names: numeric columns pass through under their own names,
    # categorical columns expand to one name per encoded category
    fitted_preprocessor = best_model.named_steps['preprocessor']
    fitted_encoder = fitted_preprocessor.named_transformers_['cat']
    num_features = numerical_features
    cat_features = list(fitted_encoder.get_feature_names_out(categorical_features))
    all_features = [*num_features, *cat_features]

    # Get importance
    fitted_regressor = best_model.named_steps['regressor']
    importance = fitted_regressor.feature_importances_

    # Create importance DataFrame, most important feature first
    importance_table = pd.DataFrame({
        'Feature': all_features,
        'Importance': importance
    })
    importance_df = importance_table.sort_values('Importance', ascending=False)

    # Top 15 features visualization
    top_15 = importance_df.head(15)

    importance_bars = go.Bar(
        x=top_15['Importance'],
        y=top_15['Feature'],
        orientation='h',
        marker_color='skyblue',
        text=[f"{x:.3f}" for x in top_15['Importance']],
        textposition='auto'
    )
    fig_importance = go.Figure(importance_bars)

    fig_importance.update_layout(
        title=f"🎯 Top 15 Feature Importance ({best_model_name})",
        xaxis_title="Importance Score",
        height=600,
        yaxis={'categoryorder': 'total ascending'}
    )
    fig_importance.show()

    print("📋 Top 10 Most Important Features:")
    top_10 = top_15.head(10)
    for i, (_, row) in enumerate(top_10.iterrows(), 1):
        print(f"{i:2d}. {row['Feature']:<25} {row['Importance']:.4f}")



In [20]:
print("🔮 Creating Salary Prediction Function...")

def predict_salary_detailed(age, gender, education, experience, job_title, job_level, company_size, location, industry, tech_skills, comm_skills, performance, certifications):
    """
    Predict the salary for a single employee profile.

    The raw inputs are placed in a one-row DataFrame, the engineered columns
    the model was trained on are derived from them, and the selected pipeline
    (`best_model`) produces the estimate. Reads the notebook globals
    `best_model`, `best_metrics`, `best_model_name`, `feature_cols` and
    `categorical_cols`.

    Returns a dict with:
      - 'predicted_salary': the rounded prediction
      - 'confidence_interval': (lower, upper), the prediction -/+ 1.96 x the
        test RMSE of the selected model, rounded
      - 'model_used': name of the selected model
      - 'model_accuracy': its +/-10% accuracy, formatted as a percentage string
    """
    raw_profile = {
        'Age': age,
        'Gender': gender,
        'Education_Level': education,
        'Years_Experience': experience,
        'Job_Title': job_title,
        'Job_Level': job_level,
        'Company_Size': company_size,
        'Location': location,
        'Industry': industry,
        'Technical_Skills': tech_skills,
        'Communication_Skills': comm_skills,
        'Performance_Score': performance,
        'Certifications': certifications,
    }
    # One single-element list per column gives a one-row frame
    base_frame = pd.DataFrame({column: [value] for column, value in raw_profile.items()})

    # Derive the engineered features the same way the training data was built
    input_data = base_frame.assign(
        Experience_Age_Ratio=lambda d: d['Years_Experience'] / d['Age'],
        Total_Skills=lambda d: d['Technical_Skills'] + d['Communication_Skills'],
        Skills_Balance=lambda d: (d['Technical_Skills'] - d['Communication_Skills']).abs(),
        High_Performer=lambda d: (d['Performance_Score'] >= 4).astype(int),
        # Same bins as the training features: 0 years of experience lies
        # outside the first bin and yields NaN rather than 'Entry'.
        Career_Stage=lambda d: pd.cut(d['Years_Experience'],
                                      bins=[0, 2, 5, 10, 20, 50],
                                      labels=['Entry', 'Junior', 'Mid', 'Senior', 'Executive']),
    )

    # Column order must match what the pipeline was fitted on
    model_columns = feature_cols + categorical_cols
    prediction = best_model.predict(input_data[model_columns])[0]

    # Simple symmetric band around the estimate based on the test RMSE
    half_width = 1.96 * best_metrics['RMSE']
    interval = (round(prediction - half_width), round(prediction + half_width))

    return {
        'predicted_salary': round(prediction),
        'confidence_interval': interval,
        'model_used': best_model_name,
        'model_accuracy': f"{best_metrics['Accuracy_10%']:.1f}%"
    }

# Example predictions
print("\n🎯 Example Salary Predictions:")
print("="*60)

# Each 'params' tuple is unpacked positionally into predict_salary_detailed, so
# its order must follow that signature: age, gender, education, experience,
# job_title, job_level, company_size, location, industry, tech_skills,
# comm_skills, performance, certifications.
examples = [
    {
        'profile': "Senior Data Scientist in Tech",
        'params': (32, 'Male', 'Master', 7, 'Data Scientist', 'Senior', 'Large',
                  'San Francisco', 'Technology', 8.5, 7.8, 4, 3)
    },
    {
        'profile': "Junior Software Engineer",
        'params': (25, 'Female', 'Bachelor', 2, 'Software Engineer', 'Junior', 'Medium',
                  'Austin', 'Technology', 7.2, 6.5, 3, 1)
    },
    {
        'profile': "Lead Product Manager",
        'params': (38, 'Male', 'Master', 12, 'Product Manager', 'Lead', 'Enterprise',
                  'New York', 'Technology', 7.5, 9.2, 4, 2)
    }
]

# Generate and display predictions
for example in examples:
    try:
        result = predict_salary_detailed(*example['params'])

        print(f"\n📊 {example['profile']}:")
        print(f"   💰 Predicted Salary: ${result['predicted_salary']:,}")
        print(f"   📈 Confidence Range: ${result['confidence_interval'][0]:,} - ${result['confidence_interval'][1]:,}")
        print(f"   🤖 Model: {result['model_used']} (Accuracy: {result['model_accuracy']})")

    except Exception as e:
        # A failing profile is reported and the loop moves on to the next one.
        print(f"\n❌ Error predicting for {example['profile']}: {str(e)}")

print("\n" + "="*60)
print("🎉 Salary Prediction System Ready!")

# Interactive prediction function
def interactive_salary_prediction():
    """
    Prompt for an employee profile on stdin and print the predicted salary.

    Categorical answers are checked against the categories the fitted encoder
    of `best_model` was trained on, and the prompt lists exactly those options.
    An answer the model has never seen is rejected with a message naming the
    valid choices, instead of surfacing as an encoder error at prediction time.

    Returns None. Any error (invalid number, unknown category, prediction
    failure) is printed rather than raised.
    """
    print("\n🔮 Interactive Salary Predictor")
    print("-" * 40)

    try:
        # Categories known to the model, per categorical column. Non-string
        # entries (such as a NaN category) are not offered as choices.
        encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
        known_categories = {
            column: [str(value) for value in values if isinstance(value, str)]
            for column, values in zip(categorical_cols, encoder.categories_)
        }

        def ask_category(label, column):
            """Prompt for one categorical value and validate it against `column`."""
            options = known_categories[column]
            answer = input(f"Enter {label} ({'/'.join(options)}): ").strip()
            if answer not in options:
                raise ValueError(
                    f"'{answer}' is not a known {label}; choose one of: {', '.join(options)}"
                )
            return answer

        # Collect user inputs
        age = int(input("Enter age: "))
        if age <= 0:
            # Age is used as a divisor when the features are engineered
            raise ValueError("age must be a positive number")
        gender = ask_category("gender", 'Gender')
        education = ask_category("education level", 'Education_Level')
        experience = int(input("Enter years of experience: "))
        job_title = ask_category("job title", 'Job_Title')
        job_level = ask_category("job level", 'Job_Level')
        company_size = ask_category("company size", 'Company_Size')
        location = ask_category("location", 'Location')
        industry = ask_category("industry", 'Industry')
        tech_skills = float(input("Enter technical skills rating (1-10): "))
        comm_skills = float(input("Enter communication skills rating (1-10): "))
        performance = float(input("Enter performance score (1-5): "))
        certifications = int(input("Enter number of certifications: "))

        # Make prediction
        result = predict_salary_detailed(
            age, gender, education, experience, job_title, job_level,
            company_size, location, industry, tech_skills, comm_skills,
            performance, certifications
        )

        # Display results
        print(f"\n🎯 Your Predicted Salary Results:")
        print(f"💰 Predicted Salary: ${result['predicted_salary']:,}")
        print(f"📈 Confidence Range: ${result['confidence_interval'][0]:,} - ${result['confidence_interval'][1]:,}")
        print(f"🤖 Model Used: {result['model_used']}")
        print(f"📊 Model Accuracy: {result['model_accuracy']}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")


# Uncomment to run interactive prediction
# interactive_salary_prediction()


🔮 Creating Salary Prediction Function...

🎯 Example Salary Predictions:

📊 Senior Data Scientist in Tech:
   💰 Predicted Salary: $278,868
   📈 Confidence Range: $263,087 - $294,649
   🤖 Model: Linear Regression (Accuracy: 99.7%)

📊 Junior Software Engineer:
   💰 Predicted Salary: $205,087
   📈 Confidence Range: $189,306 - $220,867
   🤖 Model: Linear Regression (Accuracy: 99.7%)

📊 Lead Product Manager:
   💰 Predicted Salary: $296,943
   📈 Confidence Range: $281,162 - $312,723
   🤖 Model: Linear Regression (Accuracy: 99.7%)

🎉 Salary Prediction System Ready!


In [21]:
pip install streamlit pandas numpy plotly



In [24]:
# Create salary_app.py directly in Colab
# TODO: app_code is still a placeholder; replace it with the Streamlit app source.
app_code = '''
# Paste the ENTIRE artifact code here (all the Streamlit code)
'''

# Save to file
# UTF-8 is requested explicitly so app source containing non-ASCII characters
# is written the same way regardless of the platform's default encoding.
with open('salary_app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print(" salary_app.py created successfully!")

# Verify the file exists
import os

# True when app_code has at least one line that is neither blank nor a comment
has_app_code = any(
    line.strip() and not line.strip().startswith('#')
    for line in app_code.splitlines()
)

if not os.path.exists('salary_app.py'):
    print(" File creation failed")
elif not has_app_code:
    # The file was written, but it holds only the placeholder comment, so
    # launching it would serve an empty app.
    print(f" File size: {os.path.getsize('salary_app.py')} bytes")
    print(" salary_app.py contains no Streamlit code yet - replace the placeholder in app_code before launching")
else:
    print(f" File size: {os.path.getsize('salary_app.py')} bytes")
    print(" Ready to run with Streamlit!")

 salary_app.py created successfully!
 File size: 64 bytes
 Ready to run with Streamlit!


In [36]:
# Run Streamlit app
import subprocess
import threading
import time

def run_app():
    """Run the Streamlit server for salary_app.py; blocks until the server process exits."""
    subprocess.run(['streamlit', 'run', 'salary_app.py', '--server.port', '8501', '--server.headless', 'true'])

# Start the app in background
# The thread is a daemon, so it does not keep the interpreter alive on shutdown.
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

# Give the server a moment to come up. run_app only returns once the streamlit
# process has ended (or could not be launched at all), so a worker thread that
# has already finished means the server is not running.
time.sleep(3)
if thread.is_alive():
    print("🚀 Streamlit app is running!")
    # NOTE(review): on a hosted runtime such as Colab, localhost refers to the
    # remote machine - confirm how port 8501 is exposed to the browser.
    print("📱 Access it at: http://localhost:8501")
else:
    print("❌ Streamlit exited during startup - check that streamlit is installed and salary_app.py is valid")

🚀 Streamlit app is running!
📱 Access it at: http://localhost:8501
