In [None]:
!pip install scipy pandas numpy matplotlib scikit-learn fsspec huggingface_hub

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as lm

# Read the data-science salaries CSV into a DataFrame.
# NOTE(review): the hf:// scheme presumably relies on the fsspec / huggingface_hub
# packages installed in the first cell -- confirm if the install list changes.
DATA_URL = "hf://datasets/hugginglearners/data-science-job-salaries/ds_salaries.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# 3 Impact of Company Size on Salary
#
# Draws one box per company-size category (S, M, L) showing the distribution
# of `salary_in_usd` for the rows of `df` in that category.

sizes = ['S', 'M', 'L']
colors = ['lightblue', 'lightgreen', 'lightpink']

# One salary Series per company size, in the same order as `sizes`.
salary_data = [df.loc[df['company_size'] == size, 'salary_in_usd'] for size in sizes]

fig, ax = plt.subplots(figsize=(8, 4))

# patch_artist=True draws the boxes as fillable patches so they can be coloured.
bp = ax.boxplot(salary_data, patch_artist=True)

# Label the ticks explicitly instead of passing `labels=` to boxplot: that keyword
# is deprecated in recent Matplotlib (renamed `tick_labels`), whereas
# set_xticklabels behaves the same on old and new versions.
ax.set_xticklabels(sizes)

for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

ax.set_title('Salary by Company Size', fontsize=16)
ax.set_xlabel('Company Size', fontsize=12)
ax.set_ylabel('Salary (USD)', fontsize=12)

# Dashed grey horizontal grid lines to make salary levels easier to read off.
ax.yaxis.grid(True, linestyle='--', color='grey')

plt.show()

In [None]:
# 5 Interactions Between Relationships
#
# Prints three tables:
#   1. mean salary per job title x company size (top 23 titles by row mean),
#   2. mean / median / std / count of salary per company size,
#   3. mean salary per experience level x company size.

# Coerce salaries to numeric on a *new* frame. The previous `df_5 = df` was only
# an alias, so assigning the coerced column silently modified the shared `df`
# used by other cells; `assign` returns a copy and leaves `df` untouched.
df_numeric = df.assign(
    salary_in_usd=pd.to_numeric(df['salary_in_usd'], errors='coerce')
)

# Rows missing any of the pivot's inputs cannot contribute to it.
df_5 = df_numeric.dropna(subset=['job_title', 'company_size', 'salary_in_usd'])

pivot_table = pd.pivot_table(df_5,
                             values='salary_in_usd',
                             index='job_title',
                             columns='company_size',
                             aggfunc='mean')

# Round to the nearest thousand, then order job titles by their average across
# company sizes. The helper 'mean' column exists only for sorting and is dropped.
pivot_table = pivot_table.round(-3)
pivot_table['mean'] = pivot_table.mean(axis=1)
pivot_table = pivot_table.sort_values('mean', ascending=False)
pivot_table = pivot_table.drop('mean', axis=1)
print(pivot_table.head(23).to_string())

# Summaries use the coerced (but not row-filtered) frame, which holds the same
# values the in-place coercion used to leave in `df`.
summary_stats = df_numeric.groupby('company_size')['salary_in_usd'].agg(['mean', 'median', 'std', 'count'])
summary_stats = summary_stats.round(0)

print("\nSummary Statistics by Company Size:")
print(summary_stats.to_string())

# Rows = experience level, columns = company size, values = mean salary.
exp_summary = df_numeric.groupby(['experience_level', 'company_size'])['salary_in_usd'].mean().unstack()
exp_summary = exp_summary.round(-3)

print("\nMean Salary by Experience Level and Company Size:")
print(exp_summary.to_string())