**Title**: Data Wrangling 4.2 Exercises  
**Author**: Ryan Weeks  
**Date**: 21 December 2024  
**Description**: These exercises focus on data manipulation and analysis using pandas and numpy. Tasks include loading, exploring, and querying data from CSV files such as WHO_first9cols.csv, performing group-based aggregation, and handling missing values. The exercises also practice concatenating, appending, and merging datasets to build skill in managing and combining data efficiently.

## Loading and exploring the data

In [3]:
from pathlib import Path

import pandas as pd

# Folder that holds the exercise CSV files. Keeping it in one named constant
# means the notebook can be pointed at another machine by editing a single line.
DATA_DIR = Path(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling")

# Import Data
who_data = pd.read_csv(DATA_DIR / "WHO_first9cols.csv")

# Print results
print(who_data)

# Query number of rows (first entry of the (rows, columns) shape tuple)
print("Number of rows:", who_data.shape[0])

# Print the column headers as a plain Python list
print("Column headers:", who_data.columns.tolist())

# Print the data types pandas inferred for each column
print("Data types:\n", who_data.dtypes)

# Print the index
print("Index:", who_data.index)

                Country  CountryID  Continent  Adolescent fertility rate (%)  \
0           Afghanistan          1          1                          151.0   
1               Albania          2          2                           27.0   
2               Algeria          3          3                            6.0   
3               Andorra          4          2                            NaN   
4                Angola          5          3                          146.0   
..                  ...        ...        ...                            ...   
197             Vietnam        198          6                           25.0   
198  West Bank and Gaza        199          1                            NaN   
199               Yemen        200          1                           83.0   
200              Zambia        201          3                          161.0   
201            Zimbabwe        202          3                          101.0   

     Adult literacy rate (%)  \
0      

## The 'Country' Column

In [5]:
# Pull the 'Country' column out of the WHO frame
country_col = who_data['Country']

# Pair every label with the attribute it describes, then print them in order.
# Each pair is printed as "<label> <value>", so the "Values:" label carries its
# own newline to push the array onto the following line.
country_details = (
    ("Data type:", country_col.dtype),
    ("Shape:", country_col.shape),
    ("Index:", country_col.index),
    ("Values:\n", country_col.values),
    ("Name:", country_col.name),
)
for label, detail in country_details:
    print(label, detail)

Data type: object
Shape: (202,)
Index: RangeIndex(start=0, stop=202, step=1)
Values:
 ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada'
 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hong Kong, 

## Random DataFrame

In [7]:
import numpy as np
import pandas as pd

# Set a random seed for reproducibility
np.random.seed(42)

# Generate random data: 20 rows mixing categorical and numeric columns.
# The columns are drawn in the order written here; reordering the keys would
# change which random numbers each column receives for the same seed.
data = {
    'Weather': np.random.choice(['Hot', 'Cold'], size=20),
    'Food': np.random.choice(['Pizza', 'Burger', 'Salad', 'Tacos'], size=20),
    'Price': np.random.uniform(5, 20, size=20),   # floats in [5, 20)
    'Number': np.random.randint(1, 10, size=20)   # integers 1..9 (upper bound exclusive)
}

# Create the DataFrame
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())


  Weather    Food      Price  Number
0     Hot  Burger  14.177793       7
1    Cold   Pizza   7.092408       8
2     Hot  Burger   9.382170       3
3     Hot   Tacos  10.495428       1
4     Hot   Tacos  11.841050       4


In [8]:
# Bucket the rows of df by the value in their 'Weather' column
weather_group = df.groupby(by='Weather')

# Define a function to iterate through the groups
def iterate_weather_groups(grouped_data):
    """Print every group in ``grouped_data`` under a "Group: <name>" heading.

    Parameters
    ----------
    grouped_data : iterable of (name, group) pairs
        Each item is unpacked into a group key and that group's contents.
        The call in this notebook passes ``df.groupby('Weather')``.

    Returns
    -------
    None
        The groups are written to standard output only.
    """
    for label, members in grouped_data:
        # The leading newline leaves a blank line ahead of each heading.
        print(f"\nGroup: {label}", members, sep="\n")

# Call the function: prints the rows of each weather group under its own heading
iterate_weather_groups(weather_group)


Group: Cold
   Weather    Food      Price  Number
1     Cold   Pizza   7.092408       8
5     Cold  Burger  16.777639       2
9     Cold   Tacos   5.696756       6
14    Cold  Burger  19.484480       2
16    Cold   Tacos   9.569207       8
17    Cold   Pizza   6.465082       7
18    Cold   Pizza  15.263495       9

Group: Hot
   Weather    Food      Price  Number
0      Hot  Burger  14.177793       7
2      Hot  Burger   9.382170       3
3      Hot   Tacos  10.495428       1
4      Hot   Tacos  11.841050       4
6      Hot  Burger   7.995107       8
7      Hot  Burger  12.713517       4
8      Hot   Tacos  13.886219       2
10     Hot   Pizza  14.113173       6
11     Hot   Pizza   7.557862       4
12     Hot   Tacos   5.975774       6
13     Hot  Burger  19.233283       2
15     Hot   Pizza  17.125960       4
19     Hot   Salad  11.602287       8


In [9]:
# Build each per-group summary alongside the heading it is printed under.
# numeric_only=True restricts the mean to the numeric columns.
group_summaries = (
    ("First Row for Each Group:", weather_group.first()),
    ("\nLast Row for Each Group:", weather_group.last()),
    ("\nMean for Each Group:", weather_group.mean(numeric_only=True)),
)

# Emit heading then table, in the order listed above
for heading, summary in group_summaries:
    print(heading)
    print(summary)

First Row for Each Group:
           Food      Price  Number
Weather                           
Cold      Pizza   7.092408       8
Hot      Burger  14.177793       7

Last Row for Each Group:
          Food      Price  Number
Weather                          
Cold     Pizza  15.263495       9
Hot      Salad  11.602287       8

Mean for Each Group:
             Price    Number
Weather                     
Cold     11.478438  6.000000
Hot      12.007663  4.538462


In [10]:
# Split the rows by every (Weather, Food) combination
weather_food_group = df.groupby(['Weather', 'Food'])

# Request the same pair of statistics for each numeric column; the result
# carries two-level column labels of the form (column, statistic).
summary_stats = ('mean', 'median')
aggregated_data = weather_food_group.agg(
    {column: list(summary_stats) for column in ('Price', 'Number')}
)

print("\nAggregated Data by Weather and Food:", aggregated_data, sep="\n")


Aggregated Data by Weather and Food:
                    Price               Number       
                     mean     median      mean median
Weather Food                                         
Cold    Burger  18.131060  18.131060  2.000000    2.0
        Pizza    9.606995   7.092408  8.000000    8.0
        Tacos    7.632981   7.632981  7.000000    7.0
Hot     Burger  12.700374  12.713517  4.800000    4.0
        Pizza   12.932332  14.113173  4.666667    4.0
        Salad   11.602287  11.602287  8.000000    8.0
        Tacos   10.549617  11.168239  3.250000    3.0


In [11]:
# Group the data by 'Weather' and 'Food'
weather_food_group = df.groupby(['Weather', 'Food'])

# Use named aggregation so every output column is declared together with the
# (source column, statistic) pair that produces it. This yields flat,
# readable column names directly and removes the need to overwrite
# `.columns` with a hand-ordered list, which would silently mislabel the
# results if the aggregation spec were ever reordered or extended.
aggregated_data = weather_food_group.agg(
    Price_Mean=('Price', 'mean'),
    Price_Median=('Price', 'median'),
    Number_Mean=('Number', 'mean'),
    Number_Median=('Number', 'median'),
)

# Print the aggregated data
print("Aggregated Data (Mean and Median for Price and Number):")
print(aggregated_data)

Aggregated Data (Mean and Median for Price and Number):
                Price_Mean  Price_Median  Number_Mean  Number_Median
Weather Food                                                        
Cold    Burger   18.131060     18.131060     2.000000            2.0
        Pizza     9.606995      7.092408     8.000000            8.0
        Tacos     7.632981      7.632981     7.000000            7.0
Hot     Burger   12.700374     12.713517     4.800000            4.0
        Pizza    12.932332     14.113173     4.666667            4.0
        Salad    11.602287     11.602287     8.000000            8.0
        Tacos    10.549617     11.168239     3.250000            3.0


In [12]:
# Take the leading three rows and the trailing two rows of df
first_3_rows = df.head(3)
last_2_rows = df.tail(2)

# Stack the first three rows underneath the full DataFrame. The original
# index labels are kept, so labels 0-2 appear twice in the result.
df_combined = pd.concat([df, first_3_rows])
print("DataFrame with first 3 rows appended:", df_combined, sep="\n")

# Stack the trailing two rows underneath the leading three
df_combined_with_last_2 = pd.concat([first_3_rows, last_2_rows])
print(
    "\nDataFrame with first 3 rows and last 2 rows appended:",
    df_combined_with_last_2,
    sep="\n",
)

DataFrame with first 3 rows appended:
   Weather    Food      Price  Number
0      Hot  Burger  14.177793       7
1     Cold   Pizza   7.092408       8
2      Hot  Burger   9.382170       3
3      Hot   Tacos  10.495428       1
4      Hot   Tacos  11.841050       4
5     Cold  Burger  16.777639       2
6      Hot  Burger   7.995107       8
7      Hot  Burger  12.713517       4
8      Hot   Tacos  13.886219       2
9     Cold   Tacos   5.696756       6
10     Hot   Pizza  14.113173       6
11     Hot   Pizza   7.557862       4
12     Hot   Tacos   5.975774       6
13     Hot  Burger  19.233283       2
14    Cold  Burger  19.484480       2
15     Hot   Pizza  17.125960       4
16    Cold   Tacos   9.569207       8
17    Cold   Pizza   6.465082       7
18    Cold   Pizza  15.263495       9
19     Hot   Salad  11.602287       8
0      Hot  Burger  14.177793       7
1     Cold   Pizza   7.092408       8
2      Hot  Burger   9.382170       3

DataFrame with first 3 rows and last 2 rows appen

In [13]:
# Folder that holds the exercise CSV files; edit this one line to relocate them.
DATA_DIR = Path(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling")

# Load Data
dest = pd.read_csv(DATA_DIR / "dest.csv")
tips = pd.read_csv(DATA_DIR / "tips.csv")

# Merge on EmpNr. pd.merge defaults to an inner join, so only employee
# numbers present in both files appear in the result.
merged = pd.merge(dest, tips, on='EmpNr')
print(merged)

# Join. DataFrame.join defaults to a left join, so every row of dest is kept
# and employees without a tips record get NaN in the joined columns.
joined = dest.join(tips.set_index('EmpNr'), on='EmpNr')
print(joined)

   EmpNr       Dest  Amount
0      5  The Hague    10.0
1      9  Rotterdam     5.0
   EmpNr       Dest  Amount
0      5  The Hague    10.0
1      3  Amsterdam     NaN
2      9  Rotterdam     5.0


In [14]:
# Keep the country name and the male primary-enrolment column, first three rows only
enrolment_columns = ['Country', 'Net primary school enrolment ratio male (%)']
subset = who_data[enrolment_columns].head(3)
print(subset)

# Boolean mask: True wherever a cell is missing
missing_values = subset.isnull()

# Per-column tally of missing cells
nan_count = subset.isna().sum()

# Only the rows that have no missing cells
non_missing = subset.dropna()

# Same rows with every missing cell replaced by the scalar 0
filled_subset = subset.fillna(0)

# Report each result under its caption, printed as "<caption> <result>"
missing_value_report = (
    ("Missing values:\n", missing_values),
    ("NaN count:\n", nan_count),
    ("Non-missing values:\n", non_missing),
    ("Filled with 0:\n", filled_subset),
)
for caption, result in missing_value_report:
    print(caption, result)

       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          NaN
1      Albania                                         94.0
2      Algeria                                         96.0
Missing values:
    Country  Net primary school enrolment ratio male (%)
0    False                                         True
1    False                                        False
2    False                                        False
NaN count:
 Country                                        0
Net primary school enrolment ratio male (%)    1
dtype: int64
Non-missing values:
    Country  Net primary school enrolment ratio male (%)
1  Albania                                         94.0
2  Algeria                                         96.0
Filled with 0:
        Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          0.0
1      Albania                                         94.0
2      Alge