In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pandas as pd


# Read the CSV (path is relative to the notebook's working directory) into
# `df`, the frame the modelling cell below selects its columns from.
print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer='eda_data.csv')


Loading dataset...


Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,...,47,1,0,0,0,1,data scientist,na,2536,0
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,...,36,1,0,0,0,0,data scientist,na,4783,0
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,...,10,1,0,1,0,1,data scientist,na,3461,0
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,...,55,1,0,0,0,0,data scientist,na,3883,3
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,...,22,1,0,0,0,1,data scientist,na,2728,3


In [None]:
# Columns fed to the model; `avg_salary` is carried along as the regression target.
features = ['Rating', 'python_yn', 'job_simp', 'seniority']
df_model = df.loc[:, [*features, 'avg_salary']].copy()

# One encoder per categorical column. Both are kept as notebook-level names so
# the prediction cells can reuse the exact label -> integer mapping fitted here.
le_job, le_seniority = LabelEncoder(), LabelEncoder()
df_model = df_model.assign(
    job_simp_encoded=le_job.fit_transform(df_model['job_simp'].astype(str)),
    seniority_encoded=le_seniority.fit_transform(df_model['seniority'].astype(str)),
)

df_model.head()

Unnamed: 0,Rating,python_yn,job_simp,seniority,avg_salary,job_simp_encoded,seniority_encoded
0,3.8,1,data scientist,na,72.0,2,1
1,3.4,1,data scientist,na,87.5,2,1
2,4.8,1,data scientist,na,85.0,2,1
3,3.8,1,data scientist,na,76.5,2,1
4,2.9,1,data scientist,na,114.5,2,1


In [None]:

# Design matrix: rating, Python flag and the two label-encoded categoricals.
X = df_model.loc[:, ['Rating', 'python_yn', 'job_simp_encoded', 'seniority_encoded']]
# Regression target.
y = df_model.loc[:, 'avg_salary']

# random_state is pinned so the fitted forest is reproducible; fit() returns
# the estimator itself, so it can be chained onto the constructor.
# NOTE(review): the model is fitted on every row of df_model -- there is no
# hold-out split in this cell, so nothing here measures generalisation.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

print("Model training complete!")

Model training complete!


In [None]:

# Example profile to score. The two strings must be labels the encoders were
# fitted on; LabelEncoder.transform raises ValueError for unseen labels.
user_rating, user_python = 3.5, 1
user_job, user_seniority = 'data scientist', 'senior'

# transform() takes a sequence and returns an array; unpack the single code.
(user_job_encoded,) = le_job.transform([user_job])
(user_seniority_encoded,) = le_seniority.transform([user_seniority])

# Single-row frame that reuses the training column names and order.
prediction_df = pd.DataFrame(
    [[user_rating, user_python, user_job_encoded, user_seniority_encoded]],
    columns=X.columns,
)
(prediction,) = model.predict(prediction_df)

# avg_salary appears to be expressed in thousands of dollars (compare the
# salary ranges in the data preview), hence the * 1000 when formatting.
print(
    "Based on your inputs:",
    f"- Job: {user_job.title()} ({user_seniority.title()})",
    f"- Rating: {user_rating}",
    f"- Knows Python: {'Yes' if user_python == 1 else 'No'}",
    f" Predicted Average Salary: ${prediction * 1000:,.2f}",
    sep="\n",
)

Based on your inputs:
- Job: Data Scientist (Senior)
- Rating: 3.5
- Knows Python: Yes
 Predicted Average Salary: $133,703.27


In [None]:
# Interactive Salary Predictor
import ipywidgets as widgets
from IPython.display import display, clear_output

# Input controls. Dropdown choices are taken from the fitted encoders, so every
# selectable label is one the encoders can transform. Options left out below
# (enabled state, horizontal orientation, visible readout) are the ipywidgets
# defaults.
job_dropdown = widgets.Dropdown(
    description='Job Title:',
    options=list(le_job.classes_),
)
seniority_dropdown = widgets.Dropdown(
    description='Seniority:',
    options=list(le_seniority.classes_),
)

python_checkbox = widgets.Checkbox(description='Knows Python', value=False, indent=False)

# continuous_update=False: the slider value only changes on release, not
# while it is being dragged.
rating_slider = widgets.FloatSlider(
    description='Co. Rating:',
    value=3.5,
    min=1.0,
    max=5.0,
    step=0.1,
    readout_format='.1f',
    continuous_update=False,
)

# Trigger button and the Output widget that captures what the handler prints.
predict_button = widgets.Button(description="Predict Salary!", button_style='success')
output_area = widgets.Output()

def on_predict_clicked(b):
    """Click handler: score the current widget selections and print the salary.

    Reads the job, seniority, Python and rating controls, encodes the two
    categorical values with the fitted label encoders, runs ``model`` on a
    single-row frame and prints the result inside ``output_area``, replacing
    whatever was shown there before.

    Args:
        b: The argument supplied by ``Button.on_click`` (the clicked button).
            It is not used.

    Returns:
        None. The result, or an error message if anything raises, is printed
        into ``output_area``; exceptions are reported there instead of
        propagating out of the callback.
    """
    with output_area:
        clear_output()
        try:
            job = job_dropdown.value
            seniority = seniority_dropdown.value
            python_yn = 1 if python_checkbox.value else 0
            rating = rating_slider.value

            job_encoded = le_job.transform([job])[0]
            seniority_encoded = le_seniority.transform([seniority])[0]

            # Label the row with the column names the model was fitted on.
            # scikit-learn stores them in `feature_names_in_` (only set when
            # the model was fitted on a frame with string column names), so
            # fall back to the training frame's columns otherwise.
            columns = (
                model.feature_names_in_
                if hasattr(model, 'feature_names_in_')
                else X.columns
            )
            pred_df = pd.DataFrame(
                [[rating, python_yn, job_encoded, seniority_encoded]],
                columns=columns,
            )

            # Rating is already one of the model's inputs, so the model output
            # is reported as-is. Scaling it again by a rating-based factor
            # would count the rating twice and make this widget disagree with
            # a direct model.predict() call on the same inputs.
            # The * 1000 mirrors the other prediction cell (target appears to
            # be in thousands of dollars).
            salary = model.predict(pred_df)[0] * 1000

            print("="*30)
            print(f"Predicted Salary: ${salary:,.2f}")
            print("="*30)
        except Exception as e:
            # Surface the failure in the UI; a widget callback has no caller
            # that could handle a raised exception.
            print(f"Error making prediction: {e}")

# Run the prediction handler whenever the button is clicked.
predict_button.on_click(on_predict_clicked)

# Stack the controls top-to-bottom with the result area last, then render.
ui = widgets.VBox(
    children=[
        job_dropdown,
        seniority_dropdown,
        rating_slider,
        python_checkbox,
        predict_button,
        output_area,
    ]
)
display(ui)
