# Quick Australian population EDA and Visualisation

## 1. Importing libraries and dataset

In [None]:
import os

import numpy as np 
import pandas as pd 
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cities CSV from the Kaggle input directory into the working frame
# that every later cell reads from (and, in places, modifies).
df_master = pd.read_csv('/kaggle/input/worldcities-australia/au.csv')

## 2. A brief EDA

In [None]:
# Print the frame summary: column names, dtypes and non-null counts.
df_master.info()

* The only column with missing data is "capital". This is not a concern for this analysis.

In [None]:
# Preview the first 10 rows of the frame.
df_master.head(10)

In [None]:
# Number of cities listed for each state / territory; the state or territory
# is identified by the admin_name column.
# There are 8 states / territories in Australia.
df_master['admin_name'].value_counts()

In [None]:
"""

Looking at the total population in each state / territory

"""

# Order the frame from most to least populous. The sort is done in place, so
# the new row order persists for the cells that follow.
df_master.sort_values(by='population', ascending=False, inplace=True)

# A single grouped sum replaces one boolean-mask scan (and a Python-level
# sum) per state. groupby returns the groups in sorted key order, which is
# the same order np.unique produced before.
state_populations = df_master.groupby('admin_name')['population'].sum()
state_territory_list = state_populations.index.to_numpy()
populations = state_populations.tolist()

for st, pop in zip(state_territory_list, populations):
    print(f"Population of all cities in {st} = {pop}\n")

# Horizontal bars, one per state / territory. String y-values are used as the
# tick labels directly, so no separate yticks call is needed.
plt.barh(state_territory_list, populations)
plt.title('Population of each state and territory')
plt.show()

In [None]:
"""

Top 10 cities with the largest population

"""



# Pick the ten most populous cities by position after an explicit sort.
# Label slicing with .loc[0:9] only returns these rows while the index labels
# happen to already be in population order, which is not guaranteed once the
# frame has been re-sorted without resetting its index.
top_cities = df_master.sort_values(by='population', ascending=False).head(10)

print(top_cities[['city', 'population']])

# Horizontal bars labelled by city name; plain formatting keeps the
# population axis out of scientific notation.
plt.barh(top_cities['city'], top_cities['population'])
plt.ticklabel_format(style='plain', axis='x')
plt.title('10 most populated cities')
plt.show()

In [None]:
def summary_stats(df, columns):
    """Print summary statistics for the selected columns of a DataFrame.

    For each column the mean, median, standard deviation and the
    0.05 / 0.25 / 0.50 / 0.75 / 0.95 quantiles are printed, followed by
    blank lines that separate it from the next column.

    Args:
        df: pandas DataFrame holding the data.
        columns: iterable of column labels in ``df`` to summarise.

    Returns:
        None. The statistics are written to standard output.
    """
    for col in columns:
        series = df[col]
        print(f'Column --> {col}')
        print(f'Mean --> {series.mean()}')
        print(f'Median --> {series.median()}')
        print(f'Standard Deviation --> {series.std()}')
        for q in (0.05, 0.25, 0.50, 0.75, 0.95):
            # Two decimals keep the labels as 0.05, 0.25, 0.50, 0.75, 0.95.
            print(f'Quantile {q:.2f} --> {series.quantile(q)}')
        print('\n\n')

In [None]:
"""

Some summary statistics for the population and population_proper columns

"""

summary_stats(df_master,['population','population_proper'])

In [None]:
"""

Assigning Id numbers for each row based on the quantile that it belongs to (population column)

quantile < 0.05 -> 1
0.05 <= quantile < 0.25 -> 2
0.25 <= quantile < 0.5 -> 3
0.5 <= quantile < 0.75 -> 4
0.75 <= quantile < 0.95 -> 5
quantile >= 0.95 -> 6

"""

# Population values at the five cut points that separate the six bands.
quantiles = df_master['population'].quantile([ 0.05 , 0.25 , 0.5 , 0.75 , 0.95 ])
population = df_master['population'].to_numpy()

# Vectorised replacement for the row-by-row loop. np.select takes the choice
# of the FIRST condition that holds, so testing the upper bounds in ascending
# order gives ids 1-5 exactly as an if/elif chain would. Rows that match no
# condition (population at or above the 0.95 quantile, or a missing
# population, which compares False everywhere) fall through to 6.
upper_bounds = [quantiles[q] for q in (0.05, 0.25, 0.5, 0.75, 0.95)]
df_master['quantile_id'] = np.select(
    [population < bound for bound in upper_bounds],
    [1, 2, 3, 4, 5],
    default=6,
)



In [None]:
# Preview the first rows again, now including the quantile_id column.
df_master.head()

In [None]:
# Number of cities that fall into each quantile_id band.
df_master["quantile_id"].value_counts()

In [None]:
"""

plots for quantile id = 1

"""

# Rows belonging to quantile id 1, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 1]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 1\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 1", loc="left")
plt.show()

In [None]:
"""

plots for quantile id = 2

"""

# Rows belonging to quantile id 2, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 2]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 2\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 2", loc="left")
plt.show()

In [None]:
"""

plots for quantile id = 3

"""

# Rows belonging to quantile id 3, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 3]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 3\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 3", loc="left")
plt.show()

In [None]:
"""

plots for quantile id = 4

"""

# Rows belonging to quantile id 4, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 4]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 4\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 4", loc="left")
plt.show()

In [None]:
"""

plots for quantile id = 5

"""

# Rows belonging to quantile id 5, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 5]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 5\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 5", loc="left")
plt.show()

In [None]:
"""

plots for quantile id = 6

"""

# Rows belonging to quantile id 6, selected once and reused by every plot below
quantile_subset = df_master[df_master['quantile_id'] == 6]

# Density chart
# `fill=True` is the current spelling of the deprecated `shade=True` argument.
sns.kdeplot(quantile_subset['population'], fill=True)
plt.title('quantile id = 6\n')
plt.show()

# Violin plot, with the individual cities overlaid as jittered red points
ax = sns.violinplot(x='quantile_id', y='population', data=quantile_subset)
ax = sns.stripplot(x='quantile_id', y='population', data=quantile_subset, color="red", jitter=0.2, size=6)
plt.title("quantile id = 6", loc="left")
# Plain (non-scientific) tick labels and a fixed 0 - 6,000,000 range on the
# population axis.
plt.ticklabel_format(style='plain', axis='y')
ax.set(ylim=(0, 6000000))
plt.show()

In [None]:
# Geospatial visualisation of the cities in the dataset.
# Two maps are drawn with identical settings apart from the colouring:
#   1. colour based on quantile id
#   2. colour based on state / territory (admin_name)
# In both maps the marker size is based on quantile id.
for colour_column in ("quantile_id", "admin_name"):
    fig = px.scatter_geo(
        df_master,
        lat=df_master['lat'],
        lon=df_master['lng'],
        color=colour_column,
        hover_name="city",
        hover_data=["population"],
        size="quantile_id",
        projection="natural earth",
    )
    fig.show()

### Thanks to Marya Alizadeh for providing the dataset 