In [None]:
# Import Python libraries
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
# Render floats with two fixed decimal places so large values are not shown in scientific notation.
# This only changes how pandas displays numbers; the values stored in data frames are untouched.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the world population CSV from the Kaggle input directory into a data frame.
# Every later cell in this notebook reads from `df`.
df = pd.read_csv("/kaggle/input/world-population-dataset/world_population.csv")

# Display the data frame as the cell output
df

In [None]:
# Print a summary of the data frame: column names, types and counts
df.info()

In [None]:
# Keep only the numeric columns (integer, float, etc.)
df.select_dtypes(include=["number"])

In [None]:
# Keep only the columns stored with the 'object' dtype
df.select_dtypes(include=["object"])

In [None]:
# Summary statistics of the data frame, shown as the cell output
df.describe()

**Get the number of missing/null values for each column**

In [None]:
# Count the missing values per column
df.isna().sum()

Count the number of unique values in each column

In [None]:
# Number of distinct values in each column
df.nunique()

Sort by 2022 population

In [None]:
# The 10 rows with the largest 2022 population, ordered from highest to lowest
(
    df
    .sort_values(by="2022 Population", ascending=False)
    .head(10)
)

In [None]:
# Pairwise correlation between the numeric columns of the data frame
# (numeric_only=True leaves the non-numeric columns out of the calculation)
df.corr(numeric_only=True)

In [None]:
# Use seaborn and pyplot to draw the correlation matrix as an annotated heatmap.
#
# The figure size has to be configured BEFORE the heatmap is drawn: rcParams only
# applies to figures created afterwards. Setting it after sb.heatmap() meant the
# first run rendered at the default size and only a re-run picked up (10, 7).
plt.rcParams['figure.figsize'] = (10, 7)
sb.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

In [None]:
# Per-continent mean of every numeric column, ordered by the mean
# 'World Population Percentage' from highest to lowest
(
    df
    .groupby("Continent")
    .mean(numeric_only=True)
    .sort_values(by="World Population Percentage", ascending=False)
)

In [None]:
# Rows whose Continent is exactly 'Oceania'.
# An equality test is used instead of str.contains(): contains() interprets its
# argument as a regular expression, matches substrings, and returns NaN for
# missing values, which cannot be used as a boolean mask.
df[df['Continent'] == 'Oceania']

## A. Visualising mean population of continents over the years using a line chart

In [None]:
# (OPTION 1) Mean population per continent, selecting the population columns by position
# and ordering the continents by their 2022 mean, highest first.
# NOTE(review): assumes columns 5-12 of df are the yearly population columns - confirm against df.columns
df2_a = (
    df
    .groupby('Continent')[df.columns[5:13]]
    .mean()
    .sort_values(by="2022 Population", ascending=False)
)
df2_a

In [None]:
# List the column names of the data frame
df.columns

In [None]:
# (OPTION 2) Mean population per continent, naming the population columns explicitly
# in chronological order; continents are ordered by their 2022 mean, highest first
df2_b = (
    df
    .groupby('Continent')[[
        '1970 Population',
        '1980 Population',
        '1990 Population',
        '2000 Population',
        '2010 Population',
        '2015 Population',
        '2020 Population',
        '2022 Population',
    ]]
    .mean()
    .sort_values(by='2022 Population', ascending=False)
)
df2_b

In [None]:
# Swap rows and columns: the population-year columns become the index (x-axis)
# and each continent becomes a column, i.e. one plotted series with population on the y-axis
df3 = df2_b.T
df3

In [None]:
# Line graph of mean population of continents over the years.
# A title and axis labels are set so the figure can be read on its own, and
# plt.show() renders it without echoing the Axes object as cell output.
df3.plot(
    figsize=(15, 10),
    title="Mean population of continents over the years",
    xlabel="Year",
    ylabel="Mean population",
)
plt.show()

## B. Finding outliers in the original country populations data frame using a box plot

In [None]:
# Keep only the yearly population columns, with rows ordered by 2022 population (highest first)
df4 = (
    df[[
        '1970 Population',
        '1980 Population',
        '1990 Population',
        '2000 Population',
        '2010 Population',
        '2015 Population',
        '2020 Population',
        '2022 Population',
    ]]
    .sort_values(by='2022 Population', ascending=False)
)
df4

In [None]:
# Box plot with one box per population column, used to spot outliers.
# The title and y-axis label are applied to the axes that boxplot() just drew on;
# plt.show() renders the figure without echoing the Axes object as cell output.
df4.boxplot(figsize=(20, 10))
plt.title("Distribution of populations by year")
plt.ylabel("Population")
plt.show()