# DataFrames: 2nd Part

In this notebook I explore more DataFrame functionality:
* Filtering rows
* Adding new columns and overwriting the .csv file
* Removing columns
* Renaming columns
* Indexing and selecting data
* Sorting data
* Handling missing data
* Grouping data
* Simple exercise to find row data based on user input

In [1]:
# Import necessary libraries
import pandas as pd

# Load the countries dataset from the .csv file into a DataFrame
df = pd.read_csv('data/world_countries.csv')
# Boolean-mask filter: keep only the rows whose 'GDP (BILLIONS)' value is above 60
print(f"Countries with GDP greater than 60 Billion: \n{df[df['GDP (BILLIONS)'] > 60]}\n")

Countries with GDP greater than 60 Billion: 
               COUNTRY  GDP (BILLIONS) CODE
0        United States       80.681931  USA
1                China       80.681931  CHN
2                Japan       80.681931  JPN
3              Germany       80.681931  DEU
4               France       80.681931  FRA
..                 ...             ...  ...
73               Syria       64.700000  SYR
74  Dominican Republic       64.050000  DOM
75          Luxembourg       63.930000  LUX
76          Uzbekistan       63.080000  UZB
77               Kenya       62.720000  KEN

[78 rows x 3 columns]



In [2]:
# Add a 'GDP_Category' column that labels each row 'High' when its GDP is
# above 100 billion (an arbitrary threshold for the demo) and 'Low' otherwise
is_high_gdp = df['GDP (BILLIONS)'] > 100
df['GDP_Category'] = is_high_gdp.map({True: 'High', False: 'Low'})
print(f"DataFrame with new column 'GDP_Category':\n{df.head()}\n")

# Persist the frame, new column included, by overwriting the source .csv
# (index=False keeps the row index out of the file)
df.to_csv("data/world_countries.csv", index=False)

DataFrame with new column 'GDP_Category':
         COUNTRY  GDP (BILLIONS) CODE GDP_Category
0  United States       80.681931  USA          Low
1          China       80.681931  CHN          Low
2          Japan       80.681931  JPN          Low
3        Germany       80.681931  DEU          Low
4         France       80.681931  FRA          Low



In [3]:
# Drop the 'GDP_Category' column again; drop() returns a new frame, so rebind df
df = df.drop('GDP_Category', axis='columns')
print(f"DataFrame after removing 'GDP_Category' column:\n{df.head()}\n")

DataFrame after removing 'GDP_Category' column:
         COUNTRY  GDP (BILLIONS) CODE
0  United States       80.681931  USA
1          China       80.681931  CHN
2          Japan       80.681931  JPN
3        Germany       80.681931  DEU
4         France       80.681931  FRA



In [4]:
# Give the 'COUNTRY' column the new label 'NATION' (all other columns are untouched)
df = df.rename({'COUNTRY': 'NATION'}, axis='columns')
print(f"DataFrame after renaming 'COUNTRY' to 'NATION':\n{df.head()}\n")

DataFrame after renaming 'COUNTRY' to 'NATION':
          NATION  GDP (BILLIONS) CODE
0  United States       80.681931  USA
1          China       80.681931  CHN
2          Japan       80.681931  JPN
3        Germany       80.681931  DEU
4         France       80.681931  FRA



In [5]:
# Promote the 'NATION' column to the row index, so rows are labelled by nation name
df = df.set_index(keys='NATION')
print(f"DataFrame with 'NATION' as index:\n{df.head()}\n")

DataFrame with 'NATION' as index:
               GDP (BILLIONS) CODE
NATION                            
United States       80.681931  USA
China               80.681931  CHN
Japan               80.681931  JPN
Germany             80.681931  DEU
France              80.681931  FRA



In [6]:
# Order the rows from the largest to the smallest 'GDP (BILLIONS)' value
df = df.sort_values('GDP (BILLIONS)', ascending=False)
print(f"DataFrame sorted by 'GDP (BILLIONS)' in descending order:\n{df.head()}\n")

DataFrame sorted by 'GDP (BILLIONS)' in descending order:
           GDP (BILLIONS) CODE
NATION                        
Nigeria             594.3  NGA
Sweden              559.1  SWE
Poland              552.2  POL
Argentina           536.2  ARG
Taiwan              529.5  TWN



In [7]:
# Missing-data demo: blank out some values, then impute them with the column mean

# numpy supplies the NaN marker used to simulate missing entries
import numpy as np

# Overwrite 'GDP (BILLIONS)' with NaN in the first 20 rows (selected by position)
gdp_position = df.columns.get_loc('GDP (BILLIONS)')
df.iloc[:20, gdp_position] = np.nan
print(f"DataFrame with introduced NaN values:\n{df.head()}\n")

# Replace every NaN with the mean of its own numeric column
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)
print(f"DataFrame after handling missing data:\n{df.head()}\n")

# To fill the NaN values of a single column only, assign the result back to it:
# df['GDP (BILLIONS)'] = df['GDP (BILLIONS)'].fillna(df['GDP (BILLIONS)'].mean())

DataFrame with introduced NaN values:
           GDP (BILLIONS) CODE
NATION                        
Nigeria               NaN  NGA
Sweden                NaN  SWE
Poland                NaN  POL
Argentina             NaN  ARG
Taiwan                NaN  TWN

DataFrame after handling missing data:
           GDP (BILLIONS) CODE
NATION                        
Nigeria         47.285587  NGA
Sweden          47.285587  SWE
Poland          47.285587  POL
Argentina       47.285587  ARG
Taiwan          47.285587  TWN



In [8]:
# Compare the average GDP of the 'High' and 'Low' categories.
# 'GDP_Category' was dropped in an earlier cell, so rebuild it before grouping
# (same rule as before: above 100 billion is 'High', anything else is 'Low').
df['GDP_Category'] = ['High' if gdp > 100 else 'Low' for gdp in df['GDP (BILLIONS)']]
grouped_df = df.groupby('GDP_Category').mean(numeric_only=True)
print(f"Grouped DataFrame by 'GDP_Category' level:\n{grouped_df}\n")

Grouped DataFrame by 'GDP_Category' level:
              GDP (BILLIONS)
GDP_Category                
High              202.762500
Low                28.439901



In [None]:
# Simple exercise: find and display the row(s) for a country, matching the
# user's input against either the COUNTRY name or the CODE (case-insensitive).

# Earlier cells moved the country names into an index called 'NATION'. Undo
# that only while it is still in effect: calling reset_index() again on a
# re-run (e.g. to look up another country) would turn the default integer
# index into a stray 'index' column that later ends up in the saved .csv.
if df.index.name == 'NATION':
    df = df.reset_index()
# Restore the original column name; rename() ignores labels that are absent,
# so this line is safe to run more than once.
df = df.rename(columns={'NATION': 'COUNTRY'})

# Strip surrounding whitespace so input such as " usa " still matches
user_input = input("Enter a COUNTRY name or CODE to find its data: ").strip()
query = user_input.lower()
matches_country = df['COUNTRY'].str.lower() == query
matches_code = df['CODE'].str.lower() == query
result = df[matches_country | matches_code]

# No try/except here: these prints cannot fail, and a blanket
# `except Exception` would only hide genuine bugs.
if not result.empty:
    print(f"Data for {user_input}:\n{result}\n")
else:
    print(f"No data found for {user_input}.\n")

In [None]:
# Undo the change this notebook made to the original .csv file.
#
# The in-memory `df` must NOT be written back: by this point it has been
# re-sorted by GDP and its 20 largest GDP values were replaced with the column
# mean for the missing-data demo, so saving it would permanently alter the
# source data. The only change made to the file on disk was the added
# 'GDP_Category' column, so reload the file, drop that column and save it again.
csv_path = "data/world_countries.csv"
# float_precision='round_trip' guarantees the GDP values are written back
# exactly as they were read.
restored_df = pd.read_csv(csv_path, float_precision='round_trip')
# errors='ignore' keeps this cell safe to re-run once the column is already gone
restored_df = restored_df.drop(columns=['GDP_Category'], errors='ignore')
restored_df.to_csv(csv_path, index=False)

# Also remove the helper column from the in-memory frame, as before
if 'GDP_Category' in df.columns:
    df = df.drop(columns=['GDP_Category'])