In this notebook, we look at a dataset of international migration flows into Canada and perform various data visualization tasks.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np # linear algebra
import pandas as pd # data processing
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# The last 2 rows of the file (total data and data from unknown countries) are
# skipped. skipfooter is not supported by the default C parser, hence engine = 'python'.
canada_csv = '/kaggle/input/international-migration-flow-canada/Canada.csv'
df_can = pd.read_csv(canada_csv, engine = 'python', skipfooter = 2)
df_can.head(7)

In [None]:
# Size of the loaded frame as a (rows, columns) tuple
df_can.shape

In [None]:
# Remove, in a single call, the columns that the analysis does not use:
# FIELD44 through FIELD51 plus five descriptive/code columns.
unneeded_columns = (
    ["FIELD{}".format(number) for number in range(44, 52)]
    + ["Type", "Coverage", "AREA", "REG", "DEV"]
)
df_can.drop(columns = unneeded_columns, inplace = True)
df_can.head(7)

In [None]:
# Replace the dataset's column labels with more readable ones
readable_names = {'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'}
df_can.rename(columns = readable_names, inplace = True)
df_can.head(7)

In [None]:
# Make every column label a string so columns can be selected consistently
df_can.columns = [str(label) for label in df_can.columns]

# Index the rows by country name
df_can.set_index('Country', inplace = True)

# Add a per-country total. The frame still holds text columns (e.g. 'Continent',
# 'Region'), so the row-wise sum is restricted to numeric columns: since pandas 2.0
# reductions no longer drop non-numeric columns silently, and a mixed text/number
# sum raises a TypeError.
df_can['Total'] = df_can.sum(axis = 1, numeric_only = True)

df_can.head(7)

In [None]:
# Year column labels as strings ('1980' ... '2013'), reused by later cells
years = [str(year) for year in range(1980, 2014)]

# Let's do some data visualization.

## 1. Line plots

In [None]:
# Select the Haiti row restricted to the year columns (data for the line graph)
haiti = df_can.loc["Haiti", years]
haiti.head(5) # a pandas Series: one value per year label

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Convert the year labels to integers so the x-axis is numeric
haiti.index = haiti.index.map(int)
haiti.plot(kind = 'line')
# Country name capitalised in the displayed title
plt.title('Immigration from Haiti')
plt.ylabel('Number of immigrants')
plt.xlabel('Year')
plt.show()

The massive spike in 2010 is because of Canada's decision to accept immigrants from Haiti in the wake of the 2010 Haiti earthquake.

In [None]:
# Yearly immigration for India and China: one row per country, one column per year
countries_CI = ['India', 'China']
df_CI = df_can.loc[countries_CI, years]
df_CI

We can't use `df_CI.plot(kind = 'line')`. Recall that pandas plots the indices on the x-axis and the columns as individual lines on the y-axis. 

Since `df_CI` is a dataframe with *country* as the index and *years* as the columns, we must first transpose the dataframe using **transpose() method** to swap rows and columns.

In [None]:
# Build the transposed frame (years as rows, countries as columns) directly from
# df_can instead of flipping df_CI onto itself: re-running
# `df_CI = df_CI.transpose()` would swap rows and columns back again.
df_CI = df_can.loc[['India', 'China'], years].transpose()
df_CI

In [None]:
# Integer years on the x-axis; pandas draws one line per column
df_CI.index = df_CI.index.map(int)
df_CI.plot.line()
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.title('Immigration from India and China')
plt.show()

In [None]:
# Order the countries by total immigration, largest first, and keep the first five
df_can.sort_values('Total', ascending = False, inplace = True)
df_top5 = df_can.head(5)
df_top5

In [None]:
# Derive the plotting frame straight from df_can (already sorted by 'Total', so
# head(5) is the top five). Transposing df_top5 onto itself made the cell fail on a
# second run, because the year labels were no longer columns.
df_top5 = df_can.head(5)[years].transpose()
df_top5.index = df_top5.index.map(int)
df_top5.plot(kind = 'line', figsize = (14, 8))
plt.margins(x = 0)
plt.title('Top 5 countries with the most immigrants')
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.show()

## 2. Area plot

In [None]:
# Area chart of the same df_top5 frame (no `stacked` argument, so the pandas default applies)
df_top5.plot.area(figsize = (14, 8))
plt.margins(x = 0)
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.title('Top 5 countries with the most immigrants')
plt.show()

In [None]:
# Unstacked area plot: every country's area starts from zero instead of being
# stacked on top of the previous one
df_top5.plot(kind = 'area', stacked = False, figsize = (14, 8))
plt.margins(x = 0)
# Spelling of "immigrants" corrected in the title and the y-axis label
plt.title('Top 5 countries with the most immigrants')
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.show()

In [None]:
# Artist layer: configure the chart through the Axes object that pandas returns
ax = df_top5.plot.area(alpha = 0.35, figsize = (14, 8))
ax.set_ylabel('Number of immigrants')
ax.set_xlabel('Year')
ax.set_title('Top 5 countries with the most immigrants')

## 3. Histograms

In [None]:
# Purpose - to represent the frequency distribution of a variable.
# Here: how many countries fall into each range of 2013 immigrant counts.
df_can['2013'].plot(kind = 'hist')
plt.margins(x = 0)
plt.title('Immigrants to Canada in 2013')
# The x-axis carries the binned variable (immigrants per country) and the y-axis
# the frequency (number of countries); the two labels were previously swapped.
plt.xlabel('Number of immigrants')
plt.ylabel('Number of countries')
plt.show()

The bins are not aligned with the tick marks on the horizontal axis. *One* way to solve this problem is to borrow the histogram function from NumPy.

Now, *np.histogram()* does 3 things:
1. it partitions the spread of the data in the '2013' column into **10** bins of equal width, 
2. it computes the number of data points that fall into each bin, and 
3. it then returns the frequency of each bin which we call *count* and the bin edges that we call *bin_edges*.

In [None]:
# Compute the bin edges with NumPy and use them as x ticks so that the tick marks
# line up with the histogram bins
count, bin_edges = np.histogram(df_can['2013'])
df_can['2013'].plot(kind = 'hist', xticks = bin_edges)
plt.margins(x = 0)
plt.title('Immigrants to Canada in 2013')
# The x-axis carries the binned variable (immigrants per country) and the y-axis
# the frequency (number of countries); the two labels were previously swapped.
plt.xlabel('Number of immigrants')
plt.ylabel('Number of countries')
plt.show()

In [None]:
# List of all colors available in matplotlib
import matplotlib
# Print every named colour with its hex code. The loop variable is `hex_code`
# rather than `hex` so the built-in hex() is not shadowed for the rest of the notebook.
for name, hex_code in matplotlib.colors.cnames.items():
    print(name, hex_code)

In [None]:
# Yearly immigration for Denmark, Norway and Sweden: years as rows, countries as columns
nordic_countries = ['Denmark', 'Norway', 'Sweden']
df3 = df_can.loc[nordic_countries, years].transpose()

# 15 bin edges computed over the whole frame, reused as x ticks
count, bin_edges = np.histogram(df3, 15)

# Unstacked histogram: the three semi-transparent distributions overlap
country_colors = ['coral', 'darkslateblue', 'mediumseagreen']
df3.plot.hist(bins = 15, alpha = 0.6, figsize = (10, 6),
              xticks = bin_edges, color = country_colors)

plt.margins(x = 0)
plt.title('Immigration from Denmark, Norway and Sweden')
plt.xlabel('Number of immigrants')
plt.ylabel('Number of years')
plt.show()

In [None]:
# Same 15 bins as before; the x-axis limits are padded by 10 on either side
count, bin_edges = np.histogram(df3, 15)
xmin, xmax = bin_edges[0] - 10, bin_edges[-1] + 10

# Stacked histogram: the bars of the three countries are piled on top of each other
country_colors = ['coral', 'darkslateblue', 'mediumseagreen']
df3.plot.hist(bins = 15, stacked = True, figsize = (10, 6),
              xticks = bin_edges, xlim = (xmin, xmax), color = country_colors)

plt.margins(x = 0)
plt.title('Immigration from Denmark, Norway and Sweden')
plt.xlabel('Number of immigrants')
plt.ylabel('Number of years')
plt.show()

## 4. Bar charts

In [None]:
# Bar chart for Iceland: one bar per year, bar height = immigrants in that year
iceland = df_can.loc['Iceland', years]
iceland.index = iceland.index.map(int)
iceland.plot.bar(figsize = (12, 10), rot = 90)
plt.margins(x = 0)
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.title('Immigration from Iceland')

# Write each bar's value just above it, shifted slightly left of the tick position
tick_positions, _tick_labels = plt.xticks()
for position, immigrants in zip(tick_positions, iceland.values):
    plt.text(position - 0.33, immigrants + 0.05, str(immigrants))

# A text-less arrow annotation from (28, 20) to (32, 70), and a rotated caption next to it
arrow_style = dict(arrowstyle = '->', connectionstyle = 'arc3', color = 'blue', lw = 2)
plt.annotate('', xy = (32, 70), xytext = (28, 20), xycoords = 'data',
             arrowprops = arrow_style)
plt.annotate('2008-2011 Financial Crisis', xy = (28, 30), rotation = 76.5,
             va = 'bottom', ha = 'left')

plt.show()

In [None]:
# Horizontal bars for the first 15 totals (df_can was sorted by 'Total' earlier)
df_top15 = df_can['Total'].head(15)
df_top15.plot.barh(figsize = (12, 12), color = 'steelblue')
plt.title('Top 15 countries contributing to immigration')
plt.xlabel('Number of immigrants')

# Label each bar with its comma-separated total, in white, starting 47000 units
# to the left of the bar's end
for bar_position, total_immigrants in enumerate(df_top15):
    plt.annotate('{:,}'.format(int(total_immigrants)),
                 xy = (total_immigrants - 47000, bar_position - 0.10),
                 color = 'white')

plt.show()

## 5. Pie chart

In [None]:
# Immigration to Canada by continent.
# Rows are grouped by default, so groupby's deprecated `axis` argument is omitted.
# Only numeric columns are summed: the frame also carries text columns, which
# pandas >= 2.0 would otherwise try to aggregate as well.
df_continents = df_can.groupby('Continent').sum(numeric_only = True)
df_continents['Total'].plot(kind = 'pie')
plt.title('Immigration to Canada by continent')
plt.show()

Let's add more details to this simple pie chart.

In [None]:
# One colour per wedge
colors_list = ['gold', 'green', 'blue', 'yellow', 'pink', 'red']
# Fraction by which each wedge is pushed away from the centre (0 = not offset)
explode_list = [0.1, 0, 0, 0, 0.1, 0.1]

pie_options = dict(
    figsize = (15, 6),
    autopct = '%1.1f%%',      # print the percentage of each wedge
    startangle = 90,          # angle at which the first wedge starts
    shadow = True,
    labels = None,            # no labels on the wedges; a legend is added below
    pctdistance = 1.12,       # distance of the percentage text from the centre, as a ratio of the radius
    colors = colors_list,
    explode = explode_list,
)
df_continents['Total'].plot.pie(**pie_options)

plt.title('Immigration to Canada by continent', y = 1.12)
plt.axis('Equal') # equal axis scaling so the pie is drawn as a circle
plt.legend(labels = df_continents.index, loc = 'upper left')
plt.show()

## 6. Box plots

In [None]:
# Box plot of Japan's yearly immigration (years as rows, a single 'Japan' column)
japan = df_can.loc[['Japan'], years].transpose()
japan.plot.box()
plt.ylabel('Number of immigrants')
plt.title('Immigrants from Japan')
plt.show()

Let's create a boxplot to visualize the distribution of the top 15 countries contributing to immigration to Canada, grouped by the decades 1980s, 1990s, and 2000s.

In [None]:
# First 15 rows of df_can with all columns. df_can was sorted by 'Total' in
# descending order earlier, so these are the 15 largest contributors.
# Note: this rebinds df_top15, which previously held only the 'Total' column.
df_top15 = df_can.head(15)
df_top15

In [None]:
# Year labels (as strings) belonging to each decade; the end of each range is exclusive
years_80s = [str(year) for year in range(1980, 1990)]
years_90s = [str(year) for year in range(1990, 2000)]
years_00s = [str(year) for year in range(2000, 2010)]

In [None]:
# Per-country immigration totals within each decade (row-wise sums over that decade's columns)
df_80s = df_top15[years_80s].sum(axis = 1)
df_90s = df_top15[years_90s].sum(axis = 1)
df_00s = df_top15[years_00s].sum(axis = 1)

In [None]:
# One column per decade, one row per country, then one box per decade
decade_totals = {'1980s': df_80s, '1990s': df_90s, '2000s': df_00s}
combined_df = pd.DataFrame(decade_totals)
combined_df.plot.box(figsize = (10, 6))
plt.title('Distribution of top 15 countries sending immigrants - grouped by decade')
plt.show()

In [None]:
# Quick outlier check: rows whose 2000s total is above 200000
above_threshold = combined_df['2000s'] > 200000
combined_df[above_threshold]

## 7. Scatter plots

In [None]:
# Total immigration to Canada per year (column-wise sum over all countries),
# reshaped into a two-column frame: integer 'Year' and 'Total'
yearly_sum = df_can[years].sum(axis = 0)
total = pd.DataFrame(yearly_sum)
total.index = total.index.map(int)
total = total.reset_index()
total.columns = ['Year', 'Total']

total.plot.scatter(x = 'Year', y = 'Total', figsize = (10, 6))
plt.title('Immigration to Canada - Annual')
plt.xlabel('Year')
plt.ylabel('Number of people')
plt.show()

The scatter plot doesn't connect the data points together. Let's add a line of best fit.

In [None]:
# Scatter of the yearly totals with a straight line of best fit drawn on top
total.plot.scatter(x = 'Year', y = 'Total', figsize = (10, 6))
plt.title('Immigration to Canada - Annual')
plt.ylabel('Number of immigrants')
plt.xlabel('Year')

# Degree-1 polynomial fit; the coefficients come back as [slope, intercept]
X = total["Year"]
y = total["Total"]
fit = np.polyfit(X, y, deg = 1)
slope, intercept = fit
plt.plot(X, slope * X + intercept, color = 'red')
plt.annotate('y = {0:.0f}x + {1:.0f}'.format(slope, intercept), xy = (1989, 140000))
plt.show()
print('Number of immigrants = {0:.0f} * Year + {1:.0f}'.format(slope, intercept))

In [None]:
# Scatter plot of total immigration from Denmark, Norway and Sweden - uses the df3 dataset.
# Row-wise sum: one combined total per year.
new_df = pd.DataFrame(df3.sum(axis = 1))
new_df.reset_index(inplace = True)
new_df.columns = ['Year', 'Total']

# Make sure 'Year' is an integer column so it can be used as a numeric x-axis.
# (The bare `new_df.dtypes` expression that used to sit here was removed: in the
# middle of a cell its value is never displayed.)
new_df['Year'] = new_df['Year'].astype(int)

new_df.plot(kind = 'scatter', x = 'Year', y = 'Total', figsize = (10, 6))
plt.title('Immigration from Denmark, Norway and Sweden - Annual')
plt.xlabel('Year')
plt.ylabel('Number of immigrants')
plt.show()

## 8. Bubble plot

In [None]:
# Compare Argentina's immigration pattern with Brazil's.
# The bubble weights will be the 'normalized' value of each year's immigrant count,
# so start from a frame with one row per year and one column per country.
df_can_t = df_can[years].transpose()
df_can_t.index = df_can_t.index.map(int)
df_can_t.index.name = 'Year'
df_can_t.reset_index(inplace = True)
df_can_t.head(7)

In [None]:
# Min-max normalisation: (value - min) / (max - min) for each country's yearly counts
brazil_min, brazil_max = df_can_t['Brazil'].min(), df_can_t['Brazil'].max()
norm_brazil = (df_can_t['Brazil'] - brazil_min) / (brazil_max - brazil_min)

argentina_min, argentina_max = df_can_t['Argentina'].min(), df_can_t['Argentina'].max()
norm_argentina = (df_can_t['Argentina'] - argentina_min) / (argentina_max - argentina_min)

We'll pass in the weights using the 's' parameter. Given that normalized weights $\in [0, 1]$, they won't be visible on the plot. So:
* we multiply weights by 2000 to scale it up on the graph
* we add 10 to compensate for the minimum value (which has a 0 weight)

In [None]:
# Settings shared by both bubble layers
bubble_style = dict(kind = 'scatter', x = 'Year', figsize = (14, 8), alpha = 0.5)

# Brazil - bubble size is the normalised weight scaled by 2000 and offset by 10
ax0 = df_can_t.plot(y = 'Brazil', color = 'green', s = norm_brazil * 2000 + 10,
                    xlim = (1975, 2015), **bubble_style)

# Argentina - drawn on the same axes as Brazil
ax1 = df_can_t.plot(y = 'Argentina', color = 'blue', s = norm_argentina * 2000 + 10,
                    ax = ax0, **bubble_style)

ax0.set_title('Immigration from Brazil and Argentina')
ax0.set_ylabel('Number of immigrants')
ax0.legend(['Brazil', 'Argentina'], loc = 'upper left', fontsize = 'x-large')

## 9. Using subplots

In [None]:
# Example from the matplotlib documentation: two side-by-side axes sharing one y-axis.
# 400 samples of sin(x^2) on [0, 2*pi]
x = np.linspace(0, 2*np.pi, 400)
y = np.sin(x**2)

f, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, sharey = True)
# Left: the curve as a line, with a title
ax1.set_title('Sharing y axis')
ax1.plot(x, y)
# Right: the same data as individual points
ax2.scatter(x, y)

In [None]:
# One figure, two panels: horizontal box plot on the left, line plot on the right
fig, (ax0, ax1) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 6))

# Left panel
df_CI.plot.box(color = 'blue', vert = False, ax = ax0)
ax0.set_title('Box plot')
ax0.set_xlabel('Number of immigrants')

# Right panel
df_CI.plot.line(ax = ax1)
ax1.set_title('Line plot')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of immigrants')

fig.suptitle('Immigrants from India and China')
plt.show()

## 10. Waffle chart

In [None]:
# All columns for the Denmark, Norway and Sweden rows
waffle_countries = ['Denmark', 'Norway', 'Sweden']
df_dsn = df_can.loc[waffle_countries]
df_dsn

In [None]:
# Step 1 - share of each country in the combined total of the selected countries
total_values = sum(df_dsn['Total'])
category_ratios = [float(country_total) / total_values for country_total in df_dsn['Total']]
for country_name, proportion in zip(df_dsn.index.values, category_ratios):
    print(country_name + ': ' + str(proportion))

In [None]:
# Step 2 - size of the waffle chart, in tiles: 40 columns by 10 rows
width, height = 40, 10
total_num_tiles = height * width

In [None]:
# Step 3 - number of tiles for each category: its share of the tiles, rounded
tiles_per_category = [round(total_num_tiles * proportion) for proportion in category_ratios]
for country_name, tiles in zip(df_dsn.index.values, tiles_per_category):
    print(country_name + ': ' + str(tiles))

In [None]:
# Step 4 - build the matrix behind the waffle chart and fill it tile by tile
waffle_chart = np.zeros((height, width))

# 1-based number of the category currently being written (0 = none started yet)
category_index = 0

# Tiles are visited column by column, top to bottom within each column
for tile_index in range(1, width * height + 1):
    col, row = divmod(tile_index - 1, height)
    # Once the running tile count exceeds the tiles allotted to the categories
    # seen so far, move on to the next category
    if tile_index > sum(tiles_per_category[:category_index]):
        category_index += 1
    # The stored class value is an integer that increases with the category
    waffle_chart[row, col] = category_index

In [None]:
# Step 5 - map the waffle chart matrix into a visual
import matplotlib.patches as mpatches

# plt.matshow opens its own, appropriately sized figure, so no plt.figure() call is
# made beforehand (it only left an empty extra figure in the cell output).
colormap = plt.cm.coolwarm
waffle_image = plt.matshow(waffle_chart, cmap = colormap)
plt.colorbar()

# White grid lines on the minor ticks separate the tiles; major ticks are hidden
ax = plt.gca()
ax.set_xticks(np.arange(-0.5, (width), 1), minor = True)
ax.set_yticks(np.arange(-0.5, (height), 1), minor = True)
ax.grid(which = 'minor', color = 'w', linestyle = '-', linewidth = 2)
plt.xticks([])
plt.yticks([])

# Build one legend patch per category. Step 4 stores the value i + 1 in the tiles of
# the i-th category, so passing that value through the image's own normalisation
# yields exactly the colour the tiles were drawn with. (Colouring the legend by
# cumulative share instead produced colours that did not match the tiles.)
# Iterating with .items() also avoids positional `series[i]` lookups on a
# label-indexed Series, which recent pandas deprecates.
legend_handles = []
for i, (category, category_total) in enumerate(df_dsn['Total'].items()):
    label_str = category + '(' + str(category_total) + ')'
    color_val = colormap(float(waffle_image.norm(i + 1)))
    legend_handles.append(mpatches.Patch(color = color_val, label = label_str))

plt.legend(handles = legend_handles, loc = 'lower center',
           ncol = len(df_dsn.index.values), bbox_to_anchor = (0., -0.2, 0.95, .1))

## 11. Word cloud

Using only countries with single-word names, we are going to show how much each one contributes to total immigration through a word cloud.

In [None]:
from PIL import Image #converts images into arrays
!pip install wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
total_immigration = df_can['Total'].sum()

# Each single-word country name is repeated in proportion to that country's share
# of total immigration; a share of 100% would give max_words repetitions.
max_words = 90
name_runs = []

for country in df_can.index.values:
    # Names containing a space (more than one word) are left out
    if ' ' in country:
        continue
    share = df_can.loc[country, 'Total'] / float(total_immigration)
    repeat_num_times = int(share * max_words)
    name_runs.append((country + ' ') * repeat_num_times)

word_string = ''.join(name_runs)

In [None]:
# Build the word cloud from the repeated country names, on a white background
wc = WordCloud(background_color = 'white')
wc.generate(word_string)

# Show it on a 14 x 18 inch figure with the axes hidden
fig = plt.figure()
fig.set_figheight(18)
fig.set_figwidth(14)
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.show()

**Disclaimer:** This notebook is based heavily on IBM's 'Data visualization' course in their Data Science Professional Certificate.