# DataFrames: 2nd Part

In this notebook I explore more DataFrame functionality:
* Filtering rows
* Adding new columns and overwriting the .csv file
* Removing columns
* Renaming columns
* Indexing and selecting data
* Sorting data
* Handling missing data
* Grouping data
* Simple exercise to find row data based on user input

In [2]:
# Import necessary libraries
import pandas as pd

# Load the world-countries dataset from the .csv file
df = pd.read_csv('data/world_countries.csv')
# Keep only the rows whose GDP is greater than 60 billion
gdp_over_60 = df.loc[df['GDP (BILLIONS)'] > 60]
print(f"Countries with GDP greater than 60 Billion: \n{gdp_over_60}\n")

Countries with GDP greater than 60 Billion: 
               COUNTRY  GDP (BILLIONS) CODE
20              Angola      131.400000  AGO
21             Hungary      129.700000  HUN
22             Morocco      112.600000  MAR
23             Ecuador      100.500000  ECU
24            Slovakia       99.750000  SVK
25         Puerto Rico       93.520000  PRI
26               Japan       80.681931  JPN
27             Germany       80.681931  DEU
28       United States       80.681931  USA
29               China       80.681931  CHN
30              Turkey       80.681931  TUR
31           Indonesia       80.681931  IDN
32              Mexico       80.681931  MEX
33         Netherlands       80.681931  NLD
34         Switzerland       80.681931  CHE
35        Saudi Arabia       80.681931  SAU
36              France       80.681931  FRA
37      United Kingdom       80.681931  GBR
38              Brazil       80.681931  BRA
39               Italy       80.681931  ITA
40              Russia       80

In [3]:
# Add a 'GDP_Category' column: 'High' when GDP is greater than 100 billion
# (an imaginary threshold), otherwise 'Low'
def _gdp_category(gdp):
    """Return 'High' for a GDP value above 100, otherwise 'Low'."""
    if gdp > 100:
        return 'High'
    return 'Low'


df['GDP_Category'] = df['GDP (BILLIONS)'].apply(_gdp_category)
category_preview = df.head()
print(f"DataFrame with new column 'GDP_Category':\n{category_preview}\n")

# Persist the added column by overwriting the .csv file (row index is not written)
df.to_csv("data/world_countries.csv", index=False)

DataFrame with new column 'GDP_Category':
       COUNTRY  GDP (BILLIONS) CODE GDP_Category
0  Philippines       30.224259  PHL          Low
1      Finland       30.224259  FIN          Low
2        Chile       30.224259  CHL          Low
3       Greece       30.224259  GRC          Low
4      Ireland       30.224259  IRL          Low



In [4]:
# Drop the 'GDP_Category' column again; drop() returns a new DataFrame
df = df.drop('GDP_Category', axis='columns')
rows_after_drop = df.head()
print(f"DataFrame after removing 'GDP_Category' column:\n{rows_after_drop}\n")

DataFrame after removing 'GDP_Category' column:
       COUNTRY  GDP (BILLIONS) CODE
0  Philippines       30.224259  PHL
1      Finland       30.224259  FIN
2        Chile       30.224259  CHL
3       Greece       30.224259  GRC
4      Ireland       30.224259  IRL



In [5]:
# Relabel the 'COUNTRY' column as 'NATION'
column_renames = {'COUNTRY': 'NATION'}
df = df.rename(column_renames, axis='columns')
rows_after_rename = df.head()
print(f"DataFrame after renaming 'COUNTRY' to 'NATION':\n{rows_after_rename}\n")

DataFrame after renaming 'COUNTRY' to 'NATION':
        NATION  GDP (BILLIONS) CODE
0  Philippines       30.224259  PHL
1      Finland       30.224259  FIN
2        Chile       30.224259  CHL
3       Greece       30.224259  GRC
4      Ireland       30.224259  IRL



In [6]:
# Promote the 'NATION' column to the row index of the DataFrame
df = df.set_index(keys='NATION')
rows_after_indexing = df.head()
print(f"DataFrame with 'NATION' as index:\n{rows_after_indexing}\n")

DataFrame with 'NATION' as index:
             GDP (BILLIONS) CODE
NATION                          
Philippines       30.224259  PHL
Finland           30.224259  FIN
Chile             30.224259  CHL
Greece            30.224259  GRC
Ireland           30.224259  IRL



In [7]:
# Order the rows from the largest to the smallest 'GDP (BILLIONS)' value
df = df.sort_values('GDP (BILLIONS)', ascending=False)
top_rows = df.head()
print(f"DataFrame sorted by 'GDP (BILLIONS)' in descending order:\n{top_rows}\n")

DataFrame sorted by 'GDP (BILLIONS)' in descending order:
          GDP (BILLIONS) CODE
NATION                       
Angola            131.40  AGO
Hungary           129.70  HUN
Morocco           112.60  MAR
Ecuador           100.50  ECU
Slovakia           99.75  SVK



In [8]:
# Handling missing data: replace NaN values with the mean of their column

# The dataset is used here without missing values, so create some for the demonstration
import numpy as np

# Blank out 'GDP (BILLIONS)' in the first 20 rows (selected by position, not by label)
gdp_position = df.columns.get_loc('GDP (BILLIONS)')
df.iloc[:20, gdp_position] = np.nan
print(f"DataFrame with introduced NaN values:\n{df.head()}\n")

# Compute the mean of every numeric column, then use those means as the fill values
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)
print(f"DataFrame after handling missing data:\n{df.head()}\n")

# To fill only one column's NaN values (not the whole DataFrame), assign the result back:
# df['GDP (BILLIONS)'] = df['GDP (BILLIONS)'].fillna(df['GDP (BILLIONS)'].mean())

DataFrame with introduced NaN values:
          GDP (BILLIONS) CODE
NATION                       
Angola               NaN  AGO
Hungary              NaN  HUN
Morocco              NaN  MAR
Ecuador              NaN  ECU
Slovakia             NaN  SVK

DataFrame after handling missing data:
          GDP (BILLIONS) CODE
NATION                       
Angola         24.320636  AGO
Hungary        24.320636  HUN
Morocco        24.320636  MAR
Ecuador        24.320636  ECU
Slovakia       24.320636  SVK



In [9]:
# Compute the mean GDP of each 'GDP_Category' group.
# The column was dropped in an earlier cell, so rebuild it before grouping.
def _categorize_gdp(gdp):
    """Label a GDP value 'High' when it is above 100, otherwise 'Low'."""
    return 'High' if gdp > 100 else 'Low'


df['GDP_Category'] = df['GDP (BILLIONS)'].apply(_categorize_gdp)
by_category = df.groupby('GDP_Category')
grouped_df = by_category.mean(numeric_only=True)
print(f"Grouped DataFrame by 'GDP_Category' level:\n{grouped_df}\n")

Grouped DataFrame by 'GDP_Category' level:
              GDP (BILLIONS)
GDP_Category                
Low                24.320636



In [10]:
# Simple exercise: find and display the row for a country based on user input
# (matched against either the COUNTRY name or the CODE).

# Undo the earlier set_index('NATION') only while 'NATION' is still the index, so that
# re-running this cell for another lookup does not keep adding stray 'index' /
# 'level_0' columns to the DataFrame.
if df.index.name == 'NATION':
    df = df.reset_index()
# Restore the original 'COUNTRY' column name (a no-op once it has been restored)
df = df.rename(columns={'NATION': 'COUNTRY'})

# Trim surrounding whitespace so that input such as " pak " still matches
user_input = input("Enter a COUNTRY name or CODE to find its data: ").strip()
query = user_input.lower()

# Case-insensitive exact match on either column
matches_country = df['COUNTRY'].str.lower() == query
matches_code = df['CODE'].str.lower() == query
result = df[matches_country | matches_code]

if not user_input:
    print("No input provided.\n")
elif not result.empty:
    print(f"Data for {user_input}:\n{result}\n")
else:
    print(f"No data found for {user_input}.\n")

Data for PAK:
      COUNTRY  GDP (BILLIONS) CODE GDP_Category
102  Pakistan       30.224259  PAK          Low



In [11]:
# Undo the change this notebook made to the .csv file (the added 'GDP_Category' column).
# By this point `df` has been sorted and had its first 20 GDP values replaced by the
# column mean, so writing `df` back would overwrite the real data. Re-read the file
# from disk instead and remove only the added column.
csv_path = "data/world_countries.csv"

# Keep the in-memory DataFrame free of the helper column as well
if 'GDP_Category' in df.columns:
    df = df.drop(columns=['GDP_Category'])

# round_trip parsing keeps the float values unchanged across the read/write cycle
csv_df = pd.read_csv(csv_path, float_precision="round_trip")
# errors='ignore' makes this safe to re-run when the column is already gone
csv_df = csv_df.drop(columns=['GDP_Category'], errors='ignore')
csv_df.to_csv(csv_path, index=False)