In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

# LOAD DATA
# Read the salary dataset from the current working directory into `data`.
print("Loading dataset...")
try:
    data = pd.read_csv("eda_data.csv")
except FileNotFoundError:
    # Only a genuinely missing file gets the "not found" message; parse errors,
    # permission errors, Ctrl-C etc. propagate untouched with their own
    # traceback instead of being mislabelled.
    print("ERROR: eda_data.csv not found in this folder!")
    # Re-raise rather than calling exit(): the cell stops here with a clear
    # traceback, and the notebook session is left intact for a retry.
    raise

# COLUMNS USED BY THE MODEL
feature_columns = ['Rating', 'python_yn', 'spark', 'aws', 'excel', 'seniority', 'job_simp']
target_column = 'avg_salary'

# Keep only the modelling columns and discard any row with a missing value.
data = data[feature_columns + [target_column]].dropna()

# NORMALISE + ENCODE TEXT COLUMNS
# One encoder per text column; both are kept so that later user input can be
# mapped onto the same integer codes.
le_job, le_sen = LabelEncoder(), LabelEncoder()
for text_column, encoder in (('job_simp', le_job), ('seniority', le_sen)):
    # Lower-case and strip whitespace before fitting the encoder.
    normalised = data[text_column].astype(str).str.lower().str.strip()
    data[text_column] = encoder.fit_transform(normalised)

# FEATURES & TARGET
X = data[feature_columns]
y = data[target_column]

# TRAIN TEST SPLIT (20% of the rows are held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TRAIN MODEL
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# EVALUATE ON THE HELD-OUT ROWS
# `accuracy` holds the R2 score of the test-set predictions.
preds = model.predict(X_test)
accuracy = r2_score(y_test, preds)

print("\nModel trained successfully!")
print("Model Accuracy (R2 Score):", round(accuracy, 3))


# ---------- INPUT HELPERS ----------
def _ask_float(prompt, low, high):
    """Prompt until the user enters a number within [low, high]; return it as float."""
    while True:
        raw = input(prompt)
        try:
            value = float(raw)
        except ValueError:
            print("Please enter a number.")
            continue
        # NaN fails this comparison and is therefore rejected as well.
        if low <= value <= high:
            return value
        print(f"Please enter a value between {low} and {high}.")


def _ask_flag(prompt):
    """Prompt until the user enters exactly 0 or 1; return it as int."""
    while True:
        raw = input(prompt).strip()
        if raw in ("0", "1"):
            return int(raw)
        print("Please enter 1 for yes or 0 for no.")


def _ask_category(header, prompt, unknown_message, encoder):
    """Show the encoder's known classes, read one, and return (label, code).

    Input is lower-cased and stripped before lookup. A label the encoder has
    not seen falls back to the encoder's first class, after printing
    `unknown_message` followed by that default.
    """
    print(header)
    print(list(encoder.classes_))
    choice = input(prompt).lower().strip()
    if choice not in encoder.classes_:
        print(unknown_message, encoder.classes_[0])
        choice = encoder.classes_[0]
    return choice, encoder.transform([choice])[0]


print("\n========== SALARY PREDICTOR ==========")

# Numeric inputs are validated against the ranges the prompts advertise, and
# the user is re-prompted instead of the cell dying on a typo.
rating = _ask_float("Company Rating (1-5): ", 1, 5)
python_skill = _ask_flag("Python skill? (1=yes,0=no): ")
spark_skill = _ask_flag("Spark skill? (1=yes,0=no): ")
aws_skill = _ask_flag("AWS skill? (1=yes,0=no): ")
excel_skill = _ask_flag("Excel skill? (1=yes,0=no): ")

# ---------- JOB ROLE ----------
job, job_encoded = _ask_category(
    "\nAvailable job roles:",
    "Enter job role: ",
    "Unknown job role! Using default:",
    le_job,
)

# ---------- SENIORITY ----------
sen, sen_encoded = _ask_category(
    "\nAvailable seniority levels:",
    "Enter seniority: ",
    "Unknown seniority! Using default:",
    le_sen,
)

# ---------- PREDICTION ----------
# Build a one-row frame labelled with the training columns so the model
# receives the same named features it was fitted on. The value order below
# follows the column order of X.
features = pd.DataFrame(
    [[rating, python_skill, spark_skill, aws_skill, excel_skill, sen_encoded, job_encoded]],
    columns=list(X.columns),
)
prediction = model.predict(features)

print("\n======================================")
# NOTE(review): the recorded output (113.23) suggests avg_salary may be in
# thousands of dollars -- confirm the units before relying on this label.
print("Predicted Salary: $", round(prediction[0],2))
print("======================================")


Loading dataset...

Model trained successfully!
Model Accuracy (R2 Score): 0.651


Available job roles:
['analyst', 'data engineer', 'data scientist', 'director', 'manager', 'mle', 'na']

Available seniority levels:
['jr', 'na', 'senior']

Predicted Salary: $ 113.23


