
# Day 2: Pandas Basics

Data manipulation and analysis


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Notebook banner: the title framed by two 70-character rules.
print("=" * 70, " PANDAS: DATA MANIPULATION", "=" * 70, sep="\n")

## 1. Creating DataFrames (30 min)

In [None]:
# Section heading followed by a 70-character underline.
print("\n1. CREATING DATAFRAMES", "-" * 70, sep="\n")

From dictionary

In [None]:
# Column-oriented sample data: one list per column, all of equal length,
# where position i across the lists describes the same person.
names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]
ages = [25, 30, 35, 28, 32]
cities = ["Harare", "Bulawayo", "Harare", "Gweru", "Bulawayo"]
salaries = [50000, 60000, 75000, 55000, 70000]

# Dictionary keys become the DataFrame column labels in the next cell.
data = {"name": names, "age": ages, "city": cities, "salary": salaries}

In [None]:
# Build the main example frame from the column dictionary, then show the
# frame itself followed by its shape, column labels and per-column dtypes.
df = pd.DataFrame(data)
print("DataFrame from dictionary:", df, sep="\n")
print(
    f"\nShape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Data types:\n{df.dtypes}",
    sep="\n",
)

From NumPy array

In [None]:
# Draw from a seeded local Generator so the demo frame is identical on every
# Restart & Run All; an unseeded draw shows different numbers on each run.
rng = np.random.default_rng(42)
arr = rng.integers(0, 100, size=(5, 3))  # 5 rows x 3 columns, values in [0, 100)

# Wrap the 2-D array in a DataFrame, naming its three columns.
df_numpy = pd.DataFrame(arr, columns=["A", "B", "C"])
print("\nDataFrame from NumPy:")
print(df_numpy)

## 2. Basic Operations (30 min)

In [None]:
# Section heading followed by a 70-character underline.
print("\n2. BASIC OPERATIONS", "-" * 70, sep="\n")

Info about DataFrame

In [None]:
print("\nDataFrame info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

In [None]:
# Preview the top of the frame: the first three rows.
print("\nFirst 3 rows:", df.head(n=3), sep="\n")

In [None]:
# Preview the bottom of the frame: the last two rows.
print("\nLast 2 rows:", df.tail(n=2), sep="\n")

In [None]:
# describe() summarises the numeric columns (count, mean, std, quartiles...).
summary_stats = df.describe()
print("\nDescriptive statistics:", summary_stats, sep="\n")

Selecting columns

In [None]:
# Indexing with a single label returns that column as a Series.
name_column = df["name"]
print("\nSelect 'name' column:", name_column, sep="\n")

In [None]:
# Indexing with a list of labels returns a DataFrame with just those columns.
wanted_columns = ["name", "salary"]
print("\nSelect multiple columns:", df[wanted_columns], sep="\n")

Selecting rows

In [None]:
# .iloc selects by integer position; position 0 is the first row,
# returned as a Series keyed by column label.
first_row = df.iloc[0]
print("\nFirst row:", first_row, sep="\n")

In [None]:
# Positional slices are end-exclusive: 1:4 yields positions 1, 2 and 3.
middle_rows = df.iloc[1:4]
print("\nRows 1-3:", middle_rows, sep="\n")

Conditional selection

In [None]:
# Build a boolean mask (one True/False per row), then index with it
# to keep only the rows where the mask is True.
is_over_30 = df["age"] > 30
print("\nPeople older than 30:", df[is_over_30], sep="\n")

In [None]:
# Equality comparison on a column also produces a row-wise boolean mask.
lives_in_harare = df["city"] == "Harare"
print("\nPeople from Harare:", df[lives_in_harare], sep="\n")

In [None]:
# Combine masks element-wise with & (not the Python keyword `and`);
# each condition is evaluated on its own before being combined.
lives_in_harare = df["city"] == "Harare"
earns_over_50k = df["salary"] > 50000
print(
    "\nPeople from Harare with salary > 50000:",
    df[lives_in_harare & earns_over_50k],
    sep="\n",
)

## 3. Data Manipulation (30 min)

In [None]:
# Section heading followed by a 70-character underline.
print("\n3. DATA MANIPULATION", "-" * 70, sep="\n")

Add new column

In [None]:
# Assigning to a new label adds a column to `df` in place.
# The bonus is 10% of each row's salary.
df["bonus"] = df["salary"].mul(0.1)
print("Added bonus column:", df, sep="\n")

Modify existing column

In [None]:
# Overwrite the existing column with its current values plus one.
# Note: this reads the column's current state, so re-running the cell
# increments the ages again.
df["age"] = df["age"].add(1)
print("\nIncreased age by 1:", df[["name", "age"]], sep="\n")

Delete column

In [None]:
# Work on a copy so `df` keeps its bonus column; drop() returns a new
# frame without the named column rather than modifying its input.
df_copy = df.copy().drop(columns="bonus")
print("\nAfter dropping bonus:", df_copy.columns.tolist(), sep="\n")

Sorting

In [None]:
# sort_values returns a sorted copy; ascending=False puts the highest first.
by_salary_desc = df.sort_values(by="salary", ascending=False)
print("\nSort by salary (descending):", by_salary_desc, sep="\n")

In [None]:
# With a list of keys, rows are ordered by city first and, within each
# city, by age (both ascending by default).
sort_keys = ["city", "age"]
print("\nSort by city then age:", df.sort_values(by=sort_keys), sep="\n")

## 4. GroupBy & Aggregation (30 min) - Like SPSS!

In [None]:
# Section heading followed by a 70-character underline.
print("\n4. GROUPBY & AGGREGATION (Like SPSS!)", "-" * 70, sep="\n")

Group by city

In [None]:
# Split the rows by city, pick the salary column, and average each group.
salary_by_city = df.groupby("city")["salary"]
print("\nAverage salary by city:", salary_by_city.mean(), sep="\n")

In [None]:
# Map each column to the aggregation(s) to compute per city:
# three statistics for salary, a single mean for age.
agg_spec = {"salary": ["mean", "min", "max"], "age": "mean"}
print("\nMultiple aggregations:", df.groupby("city").agg(agg_spec), sep="\n")

In [None]:
# value_counts tallies how many rows carry each distinct city.
city_counts = df["city"].value_counts()
print("\nCount by city:", city_counts, sep="\n")

## 5. Handling Missing Data (20 min)

In [None]:
# Section heading followed by a 70-character underline.
print("\n5. HANDLING MISSING DATA", "-" * 70, sep="\n")

Create data with missing values

In [None]:
# Sample columns with gaps: np.nan marks a missing entry.
# Column A has one gap, column B has two, column C is complete.
col_a = [1, 2, np.nan, 4, 5]
col_b = [5, np.nan, np.nan, 8, 9]
col_c = [10, 11, 12, 13, 14]
data_missing = {"A": col_a, "B": col_b, "C": col_c}

df_missing = pd.DataFrame(data_missing)
print("DataFrame with missing values:", df_missing, sep="\n")

In [None]:
# isna() (the same method as isnull()) returns a same-shaped frame of
# booleans: True wherever a value is missing.
missing_mask = df_missing.isna()
print("\nCheck for missing values:", missing_mask, sep="\n")

In [None]:
# Summing the boolean mask column-wise counts the True (missing) entries.
missing_per_column = df_missing.isna().sum()
print("\nCount missing values per column:", missing_per_column, sep="\n")

Drop rows with any missing values

In [None]:
# how="any" (the default) removes a row if at least one of its values is
# missing; a new frame is returned and df_missing itself is untouched.
complete_rows = df_missing.dropna(how="any")
print("\nDrop rows with any NaN:", complete_rows, sep="\n")

Fill missing values

In [None]:
# Replace every missing entry with the constant 0 (returns a new frame).
zero_filled = df_missing.fillna(value=0)
print("\nFill NaN with 0:", zero_filled, sep="\n")

In [None]:
# mean() gives one average per column (missing values are skipped);
# passing that Series to fillna fills each column's gaps with its own mean.
column_means = df_missing.mean()
print("\nFill NaN with mean:", df_missing.fillna(column_means), sep="\n")

## 6. Reading & Writing Files (10 min)

In [None]:
# Section heading followed by a 70-character underline.
print("\n6. FILE I/O", "-" * 70, sep="\n")

Save to CSV

In [None]:
# Write the current state of `df` to a CSV file in the working directory.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf="people_data.csv", index=False)
print("✅ Saved to people_data.csv")

Read from CSV

In [None]:
# Load the file written by the previous cell back into a new frame
# and preview its first rows.
df_read = pd.read_csv(filepath_or_buffer="people_data.csv")
print("\nRead from CSV:", df_read.head(), sep="\n")

In [None]:
# Closing banner: a blank line, then the completion message between rules.
print("\n" + "=" * 70, " ✅ PANDAS BASICS COMPLETE!", "=" * 70, sep="\n")