## Assignment

Import the necessary libraries.

In [None]:
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library

Read the data

In [None]:
# Load the 'Canada by Citizenship' sheet from the hosted workbook, skipping the
# first 20 rows and the last 2 rows of the sheet
# (presumably non-data header/footer notes -- confirm against the workbook).
df_can = pd.read_excel(
    io='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DV0101EN/labs/Data_Files/Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)

Drop unnecessary columns

In [None]:
# Remove the columns that are not used in the rest of the notebook.
# The frame is reassigned (instead of using inplace=True) and errors='ignore'
# is passed so the cell is idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
df_can = df_can.drop(
    columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'],
    errors='ignore',
)
df_can.head(2)

Rename the columns so their titles are more descriptive.

In [None]:
# Replace the source column labels with friendlier names, then show the
# resulting column index as the cell output.
df_can.rename(
    {'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    axis='columns',
    inplace=True,
)
df_can.columns

Add a 'Total' column

In [None]:
# Row-wise sum of every numeric column, stored as 'Total'.
# Any existing 'Total' column is excluded first: otherwise re-running this cell
# would fold the previous total into the new sum and inflate it on every run.
# NOTE(review): select_dtypes picks up *every* numeric column, so this assumes
# only the per-year counts are numeric at this point -- confirm.
numeric_cols = (
    df_can
    .drop(columns='Total', errors='ignore')
    .select_dtypes(include=['number'])
)
df_can['Total'] = numeric_cols.sum(axis=1)


**Question 1:** Let's compare the number of immigrants from India and China from 1980 to 2013.


Step 1: Get the data set for China and India, and display dataframe.

In [None]:
def _select_country(frame, keyword):
    """Return the rows of ``frame`` whose 'Country' value contains ``keyword``.

    The match is a case-insensitive substring test on the 'Country' column
    only. Searching that single column (instead of stringifying and scanning
    every cell of every row) is vectorised and cannot accidentally match text
    that appears in an unrelated column.
    """
    mask = frame['Country'].astype(str).str.contains(keyword, case=False, na=False)
    return frame[mask]


# Tag each subset with the label used as the group key below.
df_india = _select_country(df_can, 'India').assign(Country_Filtered='India')
df_china = _select_country(df_can, 'China').assign(Country_Filtered='China')

# Stack both subsets and collapse them to one row per label by summing the
# numeric columns. Because the selection is a substring match, every row whose
# country name contains the keyword (e.g. separately listed regions of China)
# is merged into the same label -- this is the author's stated intent.
df_combined = (
    pd.concat([df_india, df_china])
    .groupby("Country_Filtered", as_index=False)
    .sum(numeric_only=True)
)
display(df_combined)


Step 2: Plot graph. We will explicitly specify line plot by passing in `kind` parameter to `plot()`.

In [None]:
import matplotlib.pyplot as plt

# Year columns: everything between the first column (Country_Filtered, at
# position 0) and the last column (Total, at position -1).
tahun = df_combined.columns[1:-1]

# Index by country label, keep only the year columns and transpose so that the
# years run along the x-axis and each country becomes one line.
yearly_by_country = df_combined.set_index("Country_Filtered")[tahun].T
ax = yearly_by_country.plot(
    kind='line',
    figsize=(10, 5),
    color=['blue', 'red'],
)

# Title, axis labels, legend and grid are set on the Axes returned by pandas.
ax.set_title("Jumlah Data Imigrant per Tahun China dan India")
ax.set_xlabel("Tahun")
ax.set_ylabel("Jumlah")
ax.legend(title="Negara")
ax.grid(True)

# Render the figure.
plt.show()


**Question 2:** Compare the trend of top 5 countries that contributed the most to immigration to Canada.

Step 1: Get the data set for top 5 countries

In [None]:
# Rank all rows by 'Total' from largest to smallest and keep the first five.
ranked_by_total = df_can.sort_values("Total", ascending=False)
df_top5 = ranked_by_total.iloc[:5]

# Show the result.
display(df_top5)


Step 2: Plot graph

In [None]:
# Recompute the five rows with the largest 'Total' so this cell is self-contained.
df_top5 = df_can.sort_values("Total", ascending=False).iloc[:5]

# Year columns, selected by position: from column index 4 up to (but not
# including) the last column.
# NOTE(review): assumes the first four columns are descriptive text columns and
# the last column is 'Total' -- confirm against df_can.columns.
tahun = df_can.columns[4:-1]

# Start a new figure for the trend lines.
plt.figure(figsize=(12, 6))

# Draw one line per country: pair each country name with its row of yearly values.
for country, counts in zip(df_top5["Country"], df_top5[tahun].to_numpy()):
    plt.plot(tahun, counts, label=country)

# Title, axis labels, legend and grid.
plt.title("Top 5 Negara dengan Jumlah Terbanyak (Berdasarkan Total)")
plt.xlabel("Tahun")
plt.ylabel("Jumlah")
plt.legend(title="Negara")
plt.grid(True)

# Render the figure.
plt.show()


**Question 3**: Create an unstacked area plot of the 5 countries that contributed the least to immigration to Canada **from** 1980 to 2013. Use a transparency value of 0.55.

In [None]:
# Question 3 asks for the five countries that contributed the LEAST, so rank by
# 'Total' ascending and keep the first five rows. The result is stored under
# its own name (df_least5) instead of overwriting df_top5, which keeps this
# cell re-runnable and leaves the top-5 frame intact.
# Index by country, keep the year columns and transpose so that years form the
# x-axis and each country becomes one area.
df_least5 = (
    df_can
    .sort_values(by="Total", ascending=True)
    .head(5)
    .set_index("Country")[tahun]
    .T
)

# pandas area plots are stacked by default; the question asks for an UNSTACKED
# plot, so stacked=False is passed explicitly, with transparency 0.55.
# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# made (it would only leave an empty extra figure behind).
ax = df_least5.plot(kind="area", stacked=False, alpha=0.55, figsize=(12, 6))

# Title, axis labels, legend and grid.
ax.set_title("Immigration Trend of the 5 Countries with the Least Immigration")
ax.set_xlabel("Years")
ax.set_ylabel("Number of Immigrants")
ax.legend(title="Country")
ax.grid(True)

# Render the figure.
plt.show()

**Question 4**: Display the immigration distribution for Greece, Albania, and Bulgaria for the years 1980 - 2013. Use an overlapping plot with 15 bins and a transparency value of 0.35.

In [None]:
negara_tambahan = ["Greece", "Albania", "Bulgaria"]

# One row per selected country, restricted to the year columns.
df_extra = df_can[df_can["Country"].isin(negara_tambahan)].set_index("Country")[tahun]

# Compute ONE set of 15 bins spanning the values of all selected countries.
# Passing bins=15 to each plt.hist call separately would give every country its
# own bin edges, so the overlapping bars would not line up or be comparable.
bin_edges = np.histogram_bin_edges(df_extra.to_numpy(), bins=15)

plt.figure(figsize=(10, 6))

# No transpose is needed: each row (one country's yearly values) is treated as
# a single distribution and drawn as its own semi-transparent histogram.
for negara in negara_tambahan:
    plt.hist(
        df_extra.loc[negara],
        bins=bin_edges,
        alpha=0.35,
        label=negara,
    )

plt.title("Distribusi Imigran Negara Greece, Albania, Bulgaria 1980–2013")
plt.xlabel("Jumlah Imigran")
plt.ylabel("angka dalam tahun")
plt.legend()
plt.grid(True)
plt.show()

**Question 5:** Create a *horizontal* bar plot showing the *total* number of immigrants to Canada from the top 15 countries, for the period 1980 - 2013. Label each country with the total immigrant count.

Step 1: Get the data pertaining to the top 15 countries.

In [None]:
# Rank all rows by 'Total' from largest to smallest and keep the first fifteen.
ranked_desc = df_can.sort_values("Total", ascending=False)
df_top15 = ranked_desc.iloc[:15]
display(df_top15)


Step 2: Plot data:
   1. Use `kind='barh'` to generate a bar chart with horizontal bars.
   2. Make sure to choose a good size for the plot and to label your axes and to give the plot a title.
<!--    3. Loop through the countries and annotate the immigrant population using the anotate function of the scripting interface. -->

In [None]:
# Create the figure together with its single Axes.
fig_bar, ax_bar = plt.subplots(figsize=(12, 8))

# Horizontal bars: one per country, with the bar length given by 'Total'.
bars = ax_bar.barh(df_top15["Country"], df_top15["Total"], color="skyblue")

# Write each bar's total just to the right of the bar, vertically centred on it.
for bar in bars:
    total = bar.get_width()
    ax_bar.text(
        total + 5000,                          # small offset to the right of the bar end
        bar.get_y() + bar.get_height() / 2,    # vertical middle of the bar
        f"{int(total):,}",                     # thousands-separated integer
        va="center", ha="left", fontsize=10, color="black",
    )

# Axis labels and title.
ax_bar.set_xlabel("Total Immigrants (1980–2013)")
ax_bar.set_ylabel("Country")
ax_bar.set_title("Top 15 Countries - Total Immigration to Canada (1980–2013)")

# Flip the y-axis so the first (largest) entry is drawn at the top.
ax_bar.invert_yaxis()
# Dashed grid lines along the x-axis to help read off the values.
ax_bar.grid(axis="x", linestyle="--", alpha=0.7)

# Render the figure.
plt.show()

# Thank you for completing this lab!

<hr>

Copyright &copy; 2019 [Cognitive Class](https://cognitiveclass.ai/?utm_source=bducopyrightlink&utm_medium=dswb&utm_campaign=bdu). This notebook and its source code are released under the terms of the [MIT License](https://bigdatauniversity.com/mit-license/).