In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE: despite its name, `url` holds a local file path (a Kaggle input directory),
# not a web URL. The same variable is reused later in the notebook to reload the raw data.
url = '/kaggle/input/malaysia-population/population.csv'
population = pd.read_csv(url)

<h2>Analyze Population Dataframe</h2>
<p>Let's learn the various methods to analyze the given DataFrame</p><hr/>

In [None]:
# head(n) returns the first n rows of the DataFrame; n defaults to 5.
# It is a quick way to eyeball the data right after loading it.
population.head(n=5)

In [None]:
# tail(n) returns the last n rows of the DataFrame; n defaults to 5.
# It is a quick way to check how the dataset ends.
population.tail(n=5)

In [None]:
# info() prints a concise summary of a DataFrame, including:
# 1. the number of rows and the index range (first label to last label)
# 2. the total number of columns
# 3. the data type of each column
# 4. the non-null count of each column
# 5. a dtype summary (how many columns there are of each data type)
# 6. how much memory the DataFrame uses, in bytes
# Observations recorded by the author for this dataset (re-check against the output):
# - there are 17 entries and every column has 17 non-null values,
#   so the dataset does not have null values
# - there are 4 integer columns and 1 non-numeric column
population.info()

In [None]:
# memory_usage() returns the memory usage of each column in bytes.
# With default arguments the result also contains an entry for the index.
population.memory_usage()

In [None]:
# memory_usage() returns a Series with one byte count per column;
# sum() adds those values up to give the total memory used by the DataFrame.
# The total should be equivalent to the memory usage reported by info().
population.memory_usage().sum()

In [None]:
# To find the data type of a specific column, select the column
# and read its dtype attribute.
population['pop_18'].dtype

In [None]:
# The built-in type() function shows the class of an object;
# for `population` it reports the pandas DataFrame class.
type(population)

In [None]:
# The describe() method calculates summary statistics: count, mean, std, min,
# percentiles (25%, 50%, 75%) and max.
# Called with default arguments on a DataFrame of mixed data types it summarises
# the numeric columns only; pass include='all' to cover object columns as well.
population.describe()

In [None]:
# By default pandas shows floats with 6 decimal places (option display.precision).
# The display.float_format option of set_option() takes a callable that converts
# each float into the text to display; here every float is shown with 2 decimals.
# The option is global, so it stays in effect for the rest of the notebook.
def two_decimals(x):
    return '%.2f' % x

pd.set_option('display.float_format', two_decimals)
population.describe()

In [None]:
# describe() can also be called on a single column to get
# the summary statistics of that column only.
population['pop'].describe()

<h2>Selecting Rows and Columns</h2>
<p>Let's learn the various methods to grab data from a DataFrame</p><hr/>

In [None]:
# How to retrieve a specific column:
# pass the column name inside square brackets. The result is a Series.
population['state']

In [None]:
# How to retrieve more than one column:
# pass a list of column names inside square brackets. The result is a DataFrame.
population[['state', 'pop']]

In [None]:
# The rows only carry the default integer index (no meaningful labels yet),
# so we pull out a row by its position using iloc.
# Positions start at 0, so iloc[0] is the first row.
population.iloc[0]

In [None]:
# How to retrieve more than one row by position:
# pass a list of positions to iloc. Here we take positions 1, 3, 5, 7 and 9.
population.iloc[[1, 3, 5, 7, 9]]

<h2>Add, Drop, Rearrange Columns and Rows</h2>
<p>
Let's learn how to do the following items
    <ol>
        <li>Drop a column from a DataFrame</li>
        <li>Drop a row from a DataFrame</li>
        <li>Add a column to the DataFrame</li>
        <li>Add a row to the DataFrame</li>
        <li>Rearrange the columns in the DataFrame</li>
    </ol>
</p>
<hr/>

In [None]:
# Let us drop the idxs column.
# drop() removes rows by default (axis=0); axis=1 makes it remove a column instead.
# By default drop() returns a new DataFrame and leaves the original untouched;
# inplace=True removes the column from `population` itself.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because 'idxs' has already been removed.
population.drop('idxs', axis=1, inplace=True, errors='ignore')
population

In [None]:
# There is a row "Malaysia" that holds the total population of the country
# rather than a single state; it carries index label 0, so we drop label 0.
# axis=0 (the default) makes drop() remove rows.
# By default drop() returns a new DataFrame and leaves the original untouched;
# inplace=True removes the row from `population` itself.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because label 0 has already been removed.
# Alternative: delete the first row using the iloc selector
# data = population.iloc[1:,].copy()
population.drop([0], axis=0, inplace=True, errors='ignore')
population

In [None]:
# Derive the population below 18 years old and store it as a new column:
# under_18 = pop - (pop_18 + pop_60)
# NOTE(review): this assumes pop_18 and pop_60 are non-overlapping age groups
# (inferred from the column names) - confirm against the dataset description.
adults_and_elderly = population['pop_18'].add(population['pop_60'])
population['under_18'] = population['pop'].sub(adults_and_elderly)
population

In [None]:
# Let us learn how to add a new 'Total' row.
# Step 1: remove any 'Total' row left over from a previous run of this cell,
#         so that the sums below never include an old total.
population.drop('Total', axis=0, inplace=True, errors='ignore')
# Step 2: sum every numeric column and store the result under the label 'Total'.
column_totals = population.sum(numeric_only=True)
population.loc['Total'] = column_totals
# Step 3: the numeric sum has no value for 'state', so fill that cell in by hand.
population.loc['Total', 'state'] = 'Total'
population

In [None]:
# Rearrange the columns by selecting them in the desired order:
# state, under_18, pop_18, pop_60 and pop.
column_order = ['state', 'under_18', 'pop_18', 'pop_60', 'pop']
population = population[column_order]
population

In [None]:
# Verify the totals row by row: under_18 + pop_18 + pop_60 should equal pop.
# The result is a boolean Series with one True/False value per row.
age_group_sum = population['under_18'].add(population['pop_18']).add(population['pop_60'])
age_group_sum.eq(population['pop'])

<h2>Column Name and Index</h2>
<p>Let's learn how to change the column names and the row index. Then we shall try to retrieve rows and columns using column names and index labels.</p>
<hr/>

In [None]:
# Rename the columns to state, minor, adult, elderly and total.
# Assigning a list to .columns replaces all column names at once,
# so the list must follow the current column order.
new_column_names = ['state', 'minor', 'adult', 'elderly', 'total']
population.columns = new_column_names
population

In [None]:
# After adding the 'Total' row, the index mixes integer labels with the string 'Total'.
# reset_index() replaces it with a clean integer index starting at 0.
# - drop=True discards the old labels instead of inserting them as a new 'index' column
# - reset_index() returns a new DataFrame, so the result is assigned back to
#   `population` for the change to take effect
population = population.reset_index(drop=True)
population

In [None]:
# Let us convert the state column into the index of the DataFrame.
# inplace=True modifies `population` itself instead of returning a new DataFrame.
# NOTE: after this call 'state' is no longer a regular column, so running this
# cell a second time raises KeyError.
population.set_index('state', inplace=True)
population

In [None]:
# Every row is now labelled with its state name (the index),
# so loc can pull out a row by that label.
population.loc['Selangor']

In [None]:
# To pull out several rows at once, pass a list of state names to loc.
population.loc[['Selangor', 'Perak', 'Melaka']]

<h2>Conditional Selection</h2>
<p>Let's learn how to select rows in Pandas DataFrame conditionally</p>
<hr/>

In [None]:
# First let us recreate the population DataFrame from the original dataset:
# reload the file, drop the idxs column, derive under_18, reorder the columns
# and finally rename them to state, minor, adult, elderly and total.
population = (
    pd.read_csv(url)
    .drop('idxs', axis=1)
    .assign(under_18=lambda df: df['pop'] - (df['pop_18'] + df['pop_60']))
    .loc[:, ['state', 'under_18', 'pop_18', 'pop_60', 'pop']]
)
population.columns = ['state', 'minor', 'adult', 'elderly', 'total']
population

In [None]:
# Retrieve the rows that do not have Malaysia in the state column.
# The comparison builds a boolean mask (one True/False per row) and
# loc keeps only the rows where the mask is True.
not_malaysia = population['state'] != 'Malaysia'
population.loc[not_malaysia]

In [None]:
# Retrieve the columns state, minor and elderly for the rows where
# the elderly population is smaller than the minor population
# and the state is not Malaysia.
# Conditions are combined with & and each one is kept in its own mask.
more_minors_than_elderly = population['minor'] > population['elderly']
is_state = population['state'] != 'Malaysia'
population[more_minors_than_elderly & is_state][['state', 'minor', 'elderly']]

In [None]:
# Retrieve the rows where the state name starts with 'S'.
# String columns expose string methods through the .str accessor.
starts_with_s = population['state'].str.startswith('S')
population.loc[starts_with_s]

In [None]:
# Retrieve the rows where the state name starts with 'W'.
starts_with_w = population['state'].str.startswith('W')
population.loc[starts_with_w]

In [None]:
# Retrieve the rows where the state name starts with 'P' and ends with 'g'.
# The .str accessor of a string column provides methods such as contains(),
# and contains() accepts regular expressions:
# ^P => starts with P
# g$ => ends with g
# .+ => one or more characters of any kind
starts_p_ends_g = population['state'].str.contains('^P.+g$')
population.loc[starts_p_ends_g]

In [None]:
# Retrieve the rows where the state name starts with 'P', ends with 'g'
# and has an 'h' somewhere in between (with at least one character on each side of it).
p_h_g_pattern = population['state'].str.contains('^P.+h.+g$')
population.loc[p_h_g_pattern]

In [None]:
# describe() on a single column returns a Series,
# which means its statistics can be looked up by label, e.g. 'max' or 'min'.
type(population['total'].describe())

In [None]:
# The labels count, mean, std, min, 25%, 50%, 75% and max can be used as keys
# to retrieve a single statistic from the describe() result.
# `population_describe` is kept so that the next cell can reuse it.
population_describe = population['total'].describe()
population_describe['max']

In [None]:
# Retrieve the row that has the highest population.
# describe() gives us the maximum (and minimum) value of the total column;
# comparing the column against that value finds the matching row.
# NOTE(review): the original note quoted 6538000 as the maximum and 99600 as the
# minimum; those figures cannot be confirmed from the code, and at this point the
# data still contains the Malaysia row, so the maximum found here is that row.
is_largest = population['total'] == population_describe['max']
population.loc[is_largest]

In [None]:
# Unfortunately Malaysia is not a state; it is the country total.
# Let us delete the first row using the iloc selector: iloc[1:] keeps every row
# from position 1 onwards, and copy() makes the result an independent DataFrame.
population = population.iloc[1:,].copy()
population

In [None]:
# Recompute the summary statistics now that the Malaysia row has been removed,
# then retrieve the state that has the highest population.
population_describe = population['total'].describe()
is_largest = population['total'] == population_describe['max']
population.loc[is_largest]

In [None]:
# To know which state has the lowest population,
# compare the total column against the 'min' statistic.
is_smallest = population['total'] == population_describe['min']
population.loc[is_smallest]

<h2>Sorting</h2>
<p>Let's learn how to use various methods to sort the given Pandas DataFrame</p>
<hr/>

In [None]:
# The sort_values() method sorts a DataFrame along either axis (rows or columns).
# The typical use is to sort the rows by the values of one or more columns.
# With default arguments the order is ascending.
population.sort_values(by="total")

In [None]:
# Change the sort order: ascending=False sorts from the largest value to the smallest.
population.sort_values(by="total", ascending=False)

In [None]:
# Sort by multiple columns: rows are ordered by the first column, and ties are
# broken by the following columns. ascending=False applies to every column.
sort_columns = ["elderly", "adult", "minor"]
population.sort_values(by=sort_columns, ascending=False)

In [None]:
# Sort by multiple columns with different sort orders:
# `ascending` takes one flag per column, in the same order as `by`.
sort_columns = ["elderly", "adult", "minor"]
sort_ascending = [False, True, False]
population.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# pandas allows us to choose the sorting algorithm through the `kind` argument.
# The available algorithms include quicksort, mergesort and heapsort.
# The default when sorting on a single column is quicksort.
# To use a stable sorting algorithm instead, choose mergesort.
population.sort_values(by="total", ascending=False, kind="mergesort")

In [None]:
# Let us convert the state column into the index of the DataFrame.
# inplace=True modifies `population` itself instead of returning a new DataFrame.
# NOTE: after this call 'state' is no longer a regular column, so running this
# cell a second time raises KeyError.
population.set_index('state', inplace=True)
population

In [None]:
# Sort the rows by the index (state) in ascending order.
population.sort_index()

In [None]:
# Sort the rows by the index (state) in descending order.
population.sort_index(ascending=False)

<h2>Methods, Attributes and Universal Functions</h2>
<p>Let's learn how to use the methods, attributes and universal functions that already exist in a Pandas DataFrame</p>
<hr/>

In [None]:
# First let us recreate the population DataFrame from the original dataset:
# reload the file, drop the idxs column, derive under_18, reorder the columns
# and rename them to state, minor, adult, elderly and total.
population = (
    pd.read_csv(url)
    .drop('idxs', axis=1)
    .assign(under_18=lambda df: df['pop'] - (df['pop_18'] + df['pop_60']))
    .loc[:, ['state', 'under_18', 'pop_18', 'pop_60', 'pop']]
    .rename(columns={
        'under_18': 'minor',
        'pop_18': 'adult',
        'pop_60': 'elderly',
        'pop': 'total',
    })
)
population

In [None]:
# Let us remove the Malaysia row (the first row) before using methods such as
# sum and mean; otherwise the country total would be counted together with the states.
# iloc[1:] keeps every row from position 1 onwards; copy() makes it an independent DataFrame.
population = population.iloc[1:].copy()
population

In [None]:
# The sum() method returns the sum of the values in the total column.
population['total'].sum()

In [None]:
# The mean() method returns the mean of the values in the total column.
population['total'].mean()

In [None]:
# The std() method returns the standard deviation of the values in the total column.
population['total'].std()

In [None]:
# Retrieve the unique values of the state column.
population['state'].unique()

In [None]:
# Count how many unique values the state column has.
population['state'].nunique()

In [None]:
# Count how often each unique value appears in the state column.
population['state'].value_counts()

<h2>Visualization</h2>
<p>Data Visualization using Matplotlib and Seaborn Library</p>
<hr/>

In [None]:
# Plot a bar chart straight from the DataFrame.
# With state as the index, each state becomes one group on the x axis and
# every remaining column (minor, adult, elderly, total) becomes one bar in that group.
state_frame = population.set_index('state')
ax = state_frame.plot(kind="bar", figsize=(16, 6))
# Enlarge the tick labels on both axes.
for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    tick_label.set_fontsize(16)
# Title and axis labels are set on the Axes returned by plot().
ax.set_title("Malaysia Population By State", fontsize=22)
ax.set_xlabel("State", fontsize=20)
ax.set_ylabel("Population in Millions", fontsize=20)

In [None]:
# Plot a horizontal bar chart of the total population using Matplotlib directly.
fig = plt.figure()
# add_axes takes [left, bottom, width, height] as fractions of the figure size;
# a width and height of 2 make the axes twice as large as the figure itself.
ax = fig.add_axes([0,0,2,2])
ax.barh(population['state'], population['total'])
# Enlarge the tick labels on both axes.
for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    tick_label.set_fontsize(16)
ax.set_title("Malaysia Population By State", fontsize=22)
ax.set_xlabel("Population in Millions", fontsize=20)
ax.set_ylabel("State", fontsize=20)
ax.grid()

In [None]:
# Prepare a copy of the data for the pie chart: shorten two state names,
# then sort the rows alphabetically by state.
# NOTE(review): 14 and 16 are hard-coded index labels, presumably the rows of
# Kuala Lumpur and Putrajaya - verify against the data. If a label did not exist,
# .loc would add a new row instead of raising an error.
sorted_population = population.copy()
short_names = {14: 'Kuala Lumpur', 16: 'Putrajaya'}
for row_label, short_name in short_names.items():
    sorted_population.loc[row_label, 'state'] = short_name
sorted_population.sort_values('state', inplace=True)
sorted_population

In [None]:
# Plot a pie chart of the total population per state using Matplotlib.
value = sorted_population['total']
labels = sorted_population['state']
fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0,0,1,1])
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
# pie() returns the wedges, the label texts and the percentage texts.
# Note that `labels` is rebound here from the state names to the label Text objects.
wedges, labels, autopct = ax.pie(value, labels=labels, autopct='%1.2f%%')
for label_text in labels:
    label_text.set_fontsize(12)
ax.set_title("Malaysia Population By State", fontsize=22)
plt.show()

In [None]:
# Plot a horizontal bar chart of the total population per state using Seaborn.
# The figure and axes are created explicitly and handed to barplot via ax=.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Malaysia Population By State", fontsize=22)
sns.barplot(x='total', y='state', data=population, ax=ax)
# Enlarge the tick labels on both axes.
for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
    tick_label.set_fontsize(14)
ax.set_xlabel("Population in Millions", fontsize=20)
ax.set_ylabel("State", fontsize=20)
ax.grid()