# Data Aggregations

In [1]:
import pandas as pd

data = {
    "city": ["London", "London", "Tel Aviv", "Tel Aviv", "New York", "New York", "Paris", "Paris", "London"],
    "gender": ["F", "M", "M", "F", "M", "F", "F", "M", "F"],
    "age": [28, 34, 30, 26, 41, 35, 29, 32, 23],
    "salary": [5000, 6200, 7000, 6600, 7200, 6800, 5400, 5600, 4800],
    "skills": [
        ["Python", "SQL"],
        ["JavaScript", "React"],
        ["Python", "AWS"],
        ["Python", "Pandas"],
        ["Excel", "PowerBI"],
        ["Python", "SQL"],
        ["Python", "Excel"],
        ["Java", "Spring"],
        ["Python", "Flask"]
    ]
}

df = pd.DataFrame(data)
print(df)


       city gender  age  salary               skills
0    London      F   28    5000        [Python, SQL]
1    London      M   34    6200  [JavaScript, React]
2  Tel Aviv      M   30    7000        [Python, AWS]
3  Tel Aviv      F   26    6600     [Python, Pandas]
4  New York      M   41    7200     [Excel, PowerBI]
5  New York      F   35    6800        [Python, SQL]
6     Paris      F   29    5400      [Python, Excel]
7     Paris      M   32    5600       [Java, Spring]
8    London      F   23    4800      [Python, Flask]


# Question 1: Basic Aggregation, Average
Find the average salary per city and sort descending

In [2]:
df.groupby(['city'])['salary'].mean().sort_values(ascending=False)

city
New York    7000.000000
Tel Aviv    6800.000000
Paris       5500.000000
London      5333.333333
Name: salary, dtype: float64

### Question 2: Conditional & Multi-Aggregation

For each city, calculate:

1. The **average salary of females** (`gender == 'F'`)  
2. The **number of females in that city**  

Return a DataFrame with columns: 

- `city`  
- `avg_female_salary`  
- `num_people`  


In [3]:
df[df['gender']=='F'].groupby(['city']).agg({'salary':'mean', 'gender':'count'}).rename(columns={'gender':'num_females'})

Unnamed: 0_level_0,salary,num_females
city,Unnamed: 1_level_1,Unnamed: 2_level_1
London,4900.0,2
New York,6800.0,1
Paris,5400.0,1
Tel Aviv,6600.0,1


## Question 3: explode the skills list and count how many people have each skill per city.

In [18]:
df_exploded = df.explode(['skills'])
df_exploded.groupby(['city', 'skills'])['gender'].count().reset_index(name='num_people')
df.head()

Unnamed: 0,city,gender,age,salary,skills
0,London,F,28,5000,"[Python, SQL]"
1,London,M,34,6200,"[JavaScript, React]"
2,Tel Aviv,M,30,7000,"[Python, AWS]"
3,Tel Aviv,F,26,6600,"[Python, Pandas]"
4,New York,M,41,7200,"[Excel, PowerBI]"


# Question 4: Weighted average salary per city, weighted by age (so older people contribute more to the average)

In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0.5,1))
input_df = pd.DataFrame(df['age'].values, columns=['age_scaled'])
df_scaled = pd.DataFrame(scaler.fit_transform(input_df), columns=['age_scaled'])
scaled_salaries = df['salary'] * df_scaled['age_scaled']
city_and_scaled_salary = pd.DataFrame({'city':df['city'], 'scaled_salaries':scaled_salaries})
city_and_scaled_salary.groupby(['city'])['scaled_salaries'].mean().reset_index()

#or
df.groupby('city').apply(lambda x: (x['salary']*x['age']).sum()/x['age'].sum()).reset_index()

  df.groupby('city').apply(lambda x: (x['salary']*x['age']).sum()/x['age'].sum()).reset_index()


Unnamed: 0,city,0
0,London,5425.882353
1,New York,7015.789474
2,Paris,5504.918033
3,Tel Aviv,6814.285714


In [25]:
df.sample(5)

Unnamed: 0,city,gender,age,salary,skills
6,Paris,F,29,5400,"[Python, Excel]"
1,London,M,34,6200,"[JavaScript, React]"
7,Paris,M,32,5600,"[Java, Spring]"
5,New York,F,35,6800,"[Python, SQL]"
2,Tel Aviv,M,30,7000,"[Python, AWS]"


In [34]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Perform label encoding
data=df.copy()
data['gender'] = label_encoder.fit_transform(data['gender'])
pd.get_dummies(data, columns=['city'], drop_first=True, dtype=int)

Unnamed: 0,gender,age,salary,skills,city_New York,city_Paris,city_Tel Aviv
0,0,28,5000,"[Python, SQL]",0,0,0
1,1,34,6200,"[JavaScript, React]",0,0,0
2,1,30,7000,"[Python, AWS]",0,0,1
3,0,26,6600,"[Python, Pandas]",0,0,1
4,1,41,7200,"[Excel, PowerBI]",1,0,0
5,0,35,6800,"[Python, SQL]",1,0,0
6,0,29,5400,"[Python, Excel]",0,1,0
7,1,32,5600,"[Java, Spring]",0,1,0
8,0,23,4800,"[Python, Flask]",0,0,0
