# Lesson 2 • Pandas Exercises

Goal: Load/inspect data, filter with masks, group & aggregate, handle missing values, and merge datasets.



In [None]:
import numpy as np
import pandas as pd

# Keep printed frames compact: cap the number of rows pandas displays at 10.
pd.options.display.max_rows = 10
# Seed NumPy's global random generator so reruns of the notebook are repeatable.
np.random.seed(42)


In [None]:
# Utility: an in-memory text buffer holding a tiny CSV for the exercises to read
import io

# One literal per CSV line; adjacent literals are joined into a single string.
csv_text = io.StringIO(
    'id,age,city,income\n'
    '1,25,Tel Aviv,5400\n'
    '2,31,Haifa,7200\n'
    '3,29,Jerusalem,6600\n'
    '4,41,Tel Aviv,8800\n'
    '5,,Haifa,NaN\n'  # the one row with an empty age and a NaN income
    '6,35,Jerusalem,7300\n'
)


## 1) Series & DataFrame Creation

In [None]:
# 1.1 Create a Series from a Python list of temperatures
temps = [21.5, 22.0, 20.1, 19.8]
# TODO: create `s` -- replace the `...` placeholder with a pandas Series built from `temps`
s = ...
# Check: `s` must be a pd.Series containing all four readings.
assert isinstance(s, pd.Series) and len(s)==4


In [None]:
# 1.2 Read the CSV above into a DataFrame `df`
# TODO: set `df` by parsing the CSV text held in `csv_text`.
# NOTE: `csv_text` is an io.StringIO stream. Reading it directly leaves the
# stream positioned at its end, so a later read of the same object finds no
# data. Reading from io.StringIO(csv_text.getvalue()), as section 3 does,
# leaves `csv_text` untouched.
df = ...
# Check: the four columns appear in file order and there are six data rows.
assert list(df.columns) == ['id','age','city','income'] and len(df)==6


In [None]:
# 1.3 Create a DataFrame from dict
# Each key is a column name; each list holds that column's values (four per column).
data = {
    'name': ['Alice','Bob','Carol','Dan'],
    'score': [88,72,95,60],
    'passed': [True, True, True, False]
}
# TODO: set `df2` -- replace the `...` placeholder with a DataFrame built from `data`
df2 = ...
# Check: all three keys must be present as columns (extra columns are allowed).
assert {'name','score','passed'} <= set(df2.columns)


## 2) Indexing, Slicing, Filtering

In [None]:
# 2.1 Select rows with age > 30 from `df`
# Reload from a fresh buffer built with csv_text.getvalue(): `csv_text` itself
# is a one-shot stream, so passing it straight to read_csv fails once an
# earlier cell (e.g. 1.2) has already read it to the end.
df = pd.read_csv(io.StringIO(csv_text.getvalue()))
# TODO: set `older` using a boolean mask on the 'age' column
older = ...
# Check: exactly the three rows with ages 31, 41 and 35 are kept. The length
# test stops an empty selection from passing the `.all()` check vacuously.
assert len(older) == 3 and (older['age'] > 30).all()


In [None]:
# 2.2 Select only 'city' and 'income' columns
# TODO: set `cols` -- a DataFrame view of `df` (loaded in 2.1) with just these two columns
cols = ...
# Check: exactly these two columns, in this order.
assert list(cols.columns) == ['city','income']


In [None]:
# 2.3 Use .loc to select rows with id in [2,4] and columns ['age','income']
# TODO: set `subset`. In `df` as loaded in 2.1, 'id' is an ordinary column
# (read_csv was called without an index column), not the row index.
subset = ...
# Check: two matching rows by two selected columns.
assert subset.shape == (2,2)


In [None]:
# 2.4 Use .iloc to select first 3 rows and the last 2 columns
# TODO: set `iloc_part` -- positional selection only, no labels
iloc_part = ...
# Check: three rows by two columns.
assert iloc_part.shape == (3,2)


## 3) Groupby & Aggregation

In [None]:
# 3.1 Group by 'city' and compute the mean income (ignoring NaNs)
# Reload from a fresh buffer so this cell does not depend on earlier edits to `df`.
df = pd.read_csv(io.StringIO(csv_text.getvalue()))
# TODO: set `mean_income_by_city`
mean_income_by_city = ...
# Check: only the result type is verified here (a Series), not its values.
assert isinstance(mean_income_by_city, pd.Series)


In [None]:
# 3.2 Group by city, compute count and max age
# TODO: set `agg_stats` from `df` as reloaded in 3.1
agg_stats = ...  # use .agg({'age':['count','max']})
# Check: the result's columns must have a second level (level 1) that
# includes the aggregate name 'count'.
assert 'count' in agg_stats.columns.get_level_values(1).tolist()


In [None]:
# 3.3 Create a new column 'income_k' = income / 1000 then group by city and compute mean
# Reload from a fresh buffer so the new column is added to a clean copy of the data.
df = pd.read_csv(io.StringIO(csv_text.getvalue()))
# TODO: replace the bare `...` below with the statement that adds 'income_k' to `df`
...
# TODO: set `mean_k` -- the per-city mean of 'income_k'
mean_k = ...
# Check: only the result type is verified here (a Series), not its values.
assert isinstance(mean_k, pd.Series)


## 4) Sorting & Missing Values

In [None]:
# 4.1 Sort df by income descending (NaNs last)
# Reload from a fresh buffer so the sort starts from the original row order.
df = pd.read_csv(io.StringIO(csv_text.getvalue()))
# TODO: set `sorted_df`
sorted_df = ...
# Check: compares only the first two rows; the position of the NaN row is
# not verified by this assert.
assert sorted_df['income'].iloc[0] >= sorted_df['income'].iloc[1]


In [None]:
# 4.2 Count missing values per column
# TODO: set `na_counts` from `df` as reloaded in 4.1
na_counts = ...
# Check: a Series indexed by column name (at least 'age' must be present).
assert isinstance(na_counts, pd.Series) and 'age' in na_counts.index


In [None]:
# 4.3 Fill missing age with the mean age and drop rows where income is NaN
# Reload from a fresh buffer so the missing values are present again.
df = pd.read_csv(io.StringIO(csv_text.getvalue()))
# TODO: step 1 -- replace the 'age' column with a version whose NaNs are filled by the mean age
df['age'] = ...
# TODO: step 2 -- rebind `df` to a frame without the rows whose income is NaN
df = ...
# Check: no missing values remain in either column.
assert df['age'].isna().sum() == 0 and df['income'].isna().sum() == 0


## 5) Merge / Join (Bonus)

In [None]:
# 5.1 Merge two DataFrames on 'id'
# The frames share only ids 2 and 3; id 1 exists only in `left`, id 4 only in `right`.
left  = pd.DataFrame({'id':[1,2,3], 'group':['A','B','A']})
right = pd.DataFrame({'id':[2,3,4], 'score':[90,75,60]})
# TODO: set `merged`
merged = ...
# Check: the result has exactly the columns id/group/score and only the two
# rows whose id appears in both frames.
assert set(merged.columns) == {'id','group','score'} and len(merged)==2
