 **Salary Prediction Project**    
 **Done by:** Shahed Shahrouri  
 **202311350**

* **1.** ***Import Libraries:***

In [None]:
import html
import json
import multiprocessing
import re
import threading

import pandas as pd
import requests

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression


* **2.** ***Load Dataset:***

In [None]:
# Load the salary records. The path is relative, so it is resolved against the
# kernel's current working directory.
csv_path = "ds_salaries.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


* **3.** ***Columns Overview & Target (Salary) Selection:***

In [None]:
# Column names that may hold the salary, listed in order of preference.
possible_salary_cols = [
    "salary_in_usd",
    "salary",
    "salary_usd",
    "Salary",
    "Salary_in_USD",
    "unconverted_salary",
    "salary_in_local_currency",
]

# Use the first candidate that is actually present in the dataset.
matching_cols = [name for name in possible_salary_cols if name in df.columns]
salary_col = matching_cols[0] if matching_cols else None

# Without a target column nothing downstream can run, so stop here.
if salary_col is None:
    raise ValueError("No salary column found.")

salary_col


'salary_in_usd'

* **4.** ***Data Cleaning & Basic Feature Engineering:***

In [None]:
# Remove exact duplicate rows from the frame in place.
df.drop_duplicates(inplace=True)

# Replace missing salaries with the mean of the observed salaries.
# NOTE(review): this imputes the prediction target itself; confirm that is
# intended rather than dropping rows whose salary is missing.
salary_filled = df[salary_col].fillna(df[salary_col].mean())
df[salary_col] = salary_filled

# Signed distance of every salary from the overall average.
df["salary_vs_average"] = salary_filled.sub(salary_filled.mean())

df.describe()


Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio,salary_vs_average
count,2584.0,2584.0,2584.0,2584.0,2584.0
mean,2022.301084,210365.3,133409.280186,50.483746,2.162514e-12
std,0.749179,808037.5,67136.837329,48.163707,67136.84
min,2020.0,6000.0,5132.0,0.0,-128277.3
25%,2022.0,90000.0,84975.0,0.0,-48434.28
50%,2022.0,134630.0,130000.0,50.0,-3409.28
75%,2023.0,182562.5,175000.0,100.0,41590.72
max,2023.0,30400000.0,450000.0,100.0,316590.7


* **5.** ***Feature Selection:***

In [None]:
# Keep only the candidate feature columns that exist in the dataset.
present = set(df.columns)

numeric_cols = [name for name in ("remote_ratio", "work_year") if name in present]
categorical_cols = [
    name
    for name in ("experience_level", "employment_type", "company_location", "company_size")
    if name in present
]

# Feature matrix (numeric columns first, then categorical) and regression target.
feature_cols = numeric_cols + categorical_cols
X = df[feature_cols]
y = df[salary_col]

numeric_cols, categorical_cols


(['remote_ratio', 'work_year'],
 ['experience_level', 'employment_type', 'company_location', 'company_size'])

* **6.** ***Model Training (Scikit-Learn):***

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric features: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at prediction time.
cat_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own preprocessing pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols),
])

# Preprocessing and the linear regressor are fitted together as one estimator.
model = Pipeline(steps=[
    ("prep", preprocess),
    ("reg", LinearRegression()),
])

# Fit on the training split and report R^2 on the held-out split.
r2 = model.fit(X_train, y_train).score(X_test, y_test)
r2


0.3598108273825187

* **7.** ***JSON save & load:***

In [None]:
# Export a small sample: the first ten rows of whichever of these columns exist.
sample_cols = [name for name in ("job_title", "company_location", salary_col) if name in df.columns]
sample = df[sample_cols].head(10).to_dict(orient="records")

json_path = "salaries_sample.json"

# Serialise the records to disk as indented JSON.
with open(json_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(sample, indent=4))

# Read the file back to confirm the round trip.
with open(json_path, "r", encoding="utf-8") as in_file:
    restored = json.loads(in_file.read())

json_df = pd.DataFrame(restored)
json_df


Unnamed: 0,job_title,company_location,salary_in_usd
0,Principal Data Scientist,ES,85847
1,ML Engineer,US,30000
2,ML Engineer,US,25500
3,Data Scientist,CA,175000
4,Data Scientist,CA,120000
5,Applied Scientist,US,222200
6,Applied Scientist,US,136000
7,Data Scientist,CA,219000
8,Data Scientist,CA,141000
9,Data Scientist,US,147100


* **8.** ***Web scraping + threads:***

In [None]:
# Pages to request; one worker thread is started per URL.
urls = [
    "https://www.exchangerate-api.com/",
    "https://www.xe.com/currencyconverter/",
    "https://www.investing.com/currencies/usd-jod"
]

# Shared result list: fetch() appends one {"url": ..., "title": ...} record per call.
scraped = []

def fetch(url):
    """Download ``url`` and record its HTML title in the shared ``scraped`` list.

    One ``{"url": url, "title": title}`` record is appended per call. ``title``
    is ``None`` when the request fails or the page has no ``<title>`` element.

    Args:
        url: Address of the page to request.
    """
    title = None
    try:
        response = requests.get(url, timeout=10)
        # Accept attributes on the opening tag (e.g. <title data-x="1">); a bare
        # "<title>" pattern silently misses such pages.
        match = re.search(
            r"<title\b[^>]*>(.*?)</title\s*>",
            response.text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if match:
            # Decode entities such as &amp; so the stored title is plain text.
            title = html.unescape(match.group(1)).strip()
    except requests.RequestException:
        # Best-effort scrape: a failed request is recorded as a missing title.
        # Anything else is a programming error and is allowed to surface.
        pass
    scraped.append({"url": url, "title": title})

# Build one worker thread per URL.
threads = []
for page_url in urls:
    threads.append(threading.Thread(target=fetch, args=(page_url,)))

# Start every worker, then block until all of them have finished.
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()

scraped


[{'url': 'https://www.exchangerate-api.com/',
  'title': 'ExchangeRate-API - Free &amp; Pro Currency Converter API'},
 {'url': 'https://www.xe.com/currencyconverter/', 'title': None},
 {'url': 'https://www.investing.com/currencies/usd-jod', 'title': None}]

* **9.** ***Multiprocessing on Salary Statistics:***

In [None]:
def _report(label, value):
    """Emit ``"<label> <value>"`` as one complete line in a single write."""
    # A multi-argument print hands the label, separator, value and newline to
    # the stream piece by piece, and those pieces can interleave when several
    # processes share one output stream. Building the whole line first keeps
    # each statistic on its own intact line.
    print(f"{label} {value!s}\n", end="")


def avg():
    """Print the mean of the salary column."""
    _report("Average salary:", df[salary_col].mean())


def mx():
    """Print the largest value in the salary column."""
    _report("Max salary:", df[salary_col].max())


def mn():
    """Print the smallest value in the salary column."""
    _report("Min salary:", df[salary_col].min())

def run_processes():
    """Run avg, mx and mn each in its own process and wait for all of them."""
    workers = [multiprocessing.Process(target=job) for job in (avg, mx, mn)]
    for worker in workers:
        worker.start()
    # Join only after every process has been started so they can run side by side.
    for worker in workers:
        worker.join()

# Start one process per salary statistic and wait until all have finished.
run_processes()


Average salary:Max salary:  133409.28018575851450000Min salary:
 
5132
