# DataFrames: 2nd Part

In this notebook I will explore more DataFrame functionality:
* Filtering rows
* Adding new columns and overwriting the .csv file
* Removing columns
* Renaming columns
* Indexing and selecting data
* Sorting data
* Handling missing data
* Grouping data
* Simple exercise to find row data based on user input

In [12]:
# Import necessary libraries
import pandas as pd

# Load the world countries dataset from the .csv file into a DataFrame
df = pd.read_csv('data/world_countries.csv')
# Filter rows where the country's GDP is greater than 60 billion
high_gdp_mask = df['GDP (BILLIONS)'] > 60
print(f"Countries with GDP greater than 60 Billion: \n{df[high_gdp_mask]}\n")

Countries with GDP greater than 60 Billion: 
               COUNTRY  GDP (BILLIONS) CODE
20         Philippines      284.600000  PHL
21             Finland      276.300000  FIN
22               Chile      264.100000  CHL
23              Greece      246.400000  GRC
24             Ireland      245.800000  IRL
25            Pakistan      237.500000  PAK
26                Iraq      232.200000  IRQ
27            Portugal      228.200000  PRT
28             Algeria      227.800000  DZA
29          Kazakhstan      225.600000  KAZ
30               Qatar      212.000000  QAT
31           Venezuela      209.200000  VEN
32                Peru      208.200000  PER
33      Czech Republic      205.600000  CZE
34         New Zealand      201.000000  NZL
35             Romania      199.000000  ROU
36             Vietnam      187.800000  VNM
37          Bangladesh      186.600000  BGD
38              Kuwait      179.300000  KWT
39             Ukraine      134.900000  UKR
40              Angola      131

In [13]:
# Add a new column 'GDP_Category': 'High' when GDP is greater than 100 billion
# (an imaginary threshold), otherwise 'Low'
is_high_gdp = df['GDP (BILLIONS)'] > 100
df['GDP_Category'] = is_high_gdp.map({True: 'High', False: 'Low'})
print(f"DataFrame with new column 'GDP_Category':\n{df.head()}\n")

# Persist the added column by overwriting the source .csv file
# (index=False keeps the row index out of the file)
df.to_csv("data/world_countries.csv", index=False)

DataFrame with new column 'GDP_Category':
     COUNTRY  GDP (BILLIONS) CODE GDP_Category
0    Nigeria       47.285587  NGA          Low
1     Sweden       47.285587  SWE          Low
2     Poland       47.285587  POL          Low
3  Argentina       47.285587  ARG          Low
4     Taiwan       47.285587  TWN          Low



In [14]:
# Drop the 'GDP_Category' column; drop() returns a new DataFrame, so assign it back
df = df.drop('GDP_Category', axis='columns')
print(f"DataFrame after removing 'GDP_Category' column:\n{df.head()}\n")

DataFrame after removing 'GDP_Category' column:
     COUNTRY  GDP (BILLIONS) CODE
0    Nigeria       47.285587  NGA
1     Sweden       47.285587  SWE
2     Poland       47.285587  POL
3  Argentina       47.285587  ARG
4     Taiwan       47.285587  TWN



In [15]:
# Rename the 'COUNTRY' column to 'NATION' (all other columns keep their names)
df = df.rename({'COUNTRY': 'NATION'}, axis='columns')
print(f"DataFrame after renaming 'COUNTRY' to 'NATION':\n{df.head()}\n")

DataFrame after renaming 'COUNTRY' to 'NATION':
      NATION  GDP (BILLIONS) CODE
0    Nigeria       47.285587  NGA
1     Sweden       47.285587  SWE
2     Poland       47.285587  POL
3  Argentina       47.285587  ARG
4     Taiwan       47.285587  TWN



In [16]:
# Use the 'NATION' column as the row index of the DataFrame
df = df.set_index(keys='NATION')
print(f"DataFrame with 'NATION' as index:\n{df.head()}\n")

DataFrame with 'NATION' as index:
           GDP (BILLIONS) CODE
NATION                        
Nigeria         47.285587  NGA
Sweden          47.285587  SWE
Poland          47.285587  POL
Argentina       47.285587  ARG
Taiwan          47.285587  TWN



In [17]:
# Order the rows by 'GDP (BILLIONS)', largest value first
df = df.sort_values('GDP (BILLIONS)', ascending=False)
print(f"DataFrame sorted by 'GDP (BILLIONS)' in descending order:\n{df.head()}\n")

DataFrame sorted by 'GDP (BILLIONS)' in descending order:
             GDP (BILLIONS) CODE
NATION                          
Philippines           284.6  PHL
Finland               276.3  FIN
Chile                 264.1  CHL
Greece                246.4  GRC
Ireland               245.8  IRL



In [18]:
# Handling missing data: replace NaN values with the mean of their column

# NumPy supplies the NaN marker used to fake some missing values below
import numpy as np

# For demonstration, blank out 'GDP (BILLIONS)' in the first 20 rows (selected by position)
gdp_position = df.columns.get_loc('GDP (BILLIONS)')
df.iloc[:20, gdp_position] = np.nan
print(f"DataFrame with introduced NaN values:\n{df.head()}\n")

# Compute the mean of every numeric column, then use those means to fill the gaps
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)
print(f"DataFrame after handling missing data:\n{df.head()}\n")

# To fill only one column's NaN values (not the whole DataFrame), assign the result back
# to the column (calling fillna(..., inplace=True) on a selected column is deprecated):
# df['GDP (BILLIONS)'] = df['GDP (BILLIONS)'].fillna(df['GDP (BILLIONS)'].mean())

DataFrame with introduced NaN values:
             GDP (BILLIONS) CODE
NATION                          
Philippines             NaN  PHL
Finland                 NaN  FIN
Chile                   NaN  CHL
Greece                  NaN  GRC
Ireland                 NaN  IRL

DataFrame after handling missing data:
             GDP (BILLIONS) CODE
NATION                          
Philippines       30.224259  PHL
Finland           30.224259  FIN
Chile             30.224259  CHL
Greece            30.224259  GRC
Ireland           30.224259  IRL



In [19]:
# Grouping: compute the mean GDP for each 'GDP_Category'
# The column was removed earlier, so rebuild it first
# ('High' when GDP is greater than 100 billion, otherwise 'Low')
df['GDP_Category'] = df['GDP (BILLIONS)'].gt(100).map({True: 'High', False: 'Low'})
# Average the numeric columns within each category
category_groups = df.groupby('GDP_Category')
grouped_df = category_groups.mean(numeric_only=True)
print(f"Grouped DataFrame by 'GDP_Category' level:\n{grouped_df}\n")

Grouped DataFrame by 'GDP_Category' level:
              GDP (BILLIONS)
GDP_Category                
High              118.550000
Low                28.603603



In [20]:
# Simple exercise: find and display data for a country based on user input (COUNTRY or CODE)

# Earlier cells moved the country names into an index called 'NATION'. Turn it back into a
# regular column only while that is still the case, so that re-running this cell does not
# add a stray 'index' column to the DataFrame.
if df.index.name == 'NATION':
    df = df.reset_index()
# Restore the original column name (does nothing if the column is already 'COUNTRY')
df = df.rename(columns={'NATION': 'COUNTRY'})

# Surrounding whitespace is stripped so that an input such as " pak " still matches
user_input = input("Enter a COUNTRY name or CODE to find its data: ").strip()
search_term = user_input.lower()

# Case-insensitive exact match against either the country name or its code
name_matches = df['COUNTRY'].str.lower() == search_term
code_matches = df['CODE'].str.lower() == search_term
result = df[name_matches | code_matches]

if result.empty:
    print(f"No data found for {user_input}.\n")
else:
    print(f"Data for {user_input}:\n{result}\n")

Data for PAK:
    COUNTRY  GDP (BILLIONS) CODE GDP_Category
5  Pakistan       30.224259  PAK          Low



In [21]:
# Remove the change that was made to the original .csv file (the added 'GDP_Category' column).
#
# The in-memory `df` must NOT be written back here: by this point it has been re-sorted and
# had the GDP values of its first 20 rows replaced by the column mean for the missing-data
# demo, so saving it would silently corrupt the source data for every later run.
# Instead, re-read the file as it currently is on disk and remove only the added column.
# NOTE(review): if the file was already overwritten by an earlier run of the previous
# version of this cell, restore it from the original data source first.
csv_path = "data/world_countries.csv"
csv_on_disk = pd.read_csv(csv_path)
# errors='ignore' makes this a no-op when the column is already absent
csv_on_disk.drop(columns=['GDP_Category'], errors='ignore').to_csv(csv_path, index=False)

# Keep the in-memory DataFrame consistent with the file's set of columns
df = df.drop(columns=['GDP_Category'], errors='ignore')