In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSVfile I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the complete path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from collections import namedtuple
from typing import List


In [None]:
def import_data (data_path : str):
    '''Read the CSV file found at ``data_path`` and return it as a pandas dataframe.'''
    loaded_frame = pd.read_csv (data_path)
    return loaded_frame


def find_match_by_column (column : str, in_column : List[str]):
    '''Build a row filter that keeps rows whose ``column`` value is one of ``in_column``.

    The returned function takes a dataframe and returns the matching rows.
    '''
    def apply_isin_to_df (df):
        '''Return the rows of ``df`` whose value in the chosen column is among the accepted values.'''
        is_wanted = df[column].isin (in_column)
        return df.loc[is_wanted]
    return apply_isin_to_df


def find_partial_match_by_column (column : str, in_column : str):
    '''Return the rows where a partial value is found in a column.

    column: name of the text column to search.
    in_column: pattern to look for. It is handed to ``Series.str.contains``
    with its default settings, so it is interpreted as a regular expression
    (for example ``"A|B"`` matches either alternative).

    Returns a function that takes a dataframe and returns the matching rows.
    '''

    def apply_contains_to_df (df):
        '''Return the rows of ``df`` whose ``column`` text contains the pattern.'''
        # na = False makes missing values count as "no match". Without it the
        # mask contains NaN for missing entries and boolean indexing raises
        # a ValueError instead of filtering.
        matches = df[column].str.contains (in_column, na = False)
        return df[matches]
    return apply_contains_to_df


def find_unique_values (column : str, count : bool = False):
    '''Determine the unique values in a column.

    If count is falsy (the default), the returned function gives the distinct
    entries of the selected column.

    If count is truthy, the returned function gives the number of distinct
    entries in the selected column.

    Returns a function that takes a dataframe and performs the lookup.
    '''
    def select_df_to_find_unique (df):
        '''Return the distinct values of ``column`` in ``df``, or how many there are.'''
        selected = df.loc[:, column]
        # Truthiness test rather than `count is False`: the identity test sent
        # falsy flags such as 0 or numpy.False_ down the counting branch.
        if count:
            return selected.nunique ()
        return selected.unique ()
    return select_df_to_find_unique


def get_yearly_data (start : int, stop : int):
    '''Build a column selector for the positional range ``start`` (inclusive) to ``stop`` (exclusive).

    The notebook uses it to keep only the columns with annual immigration numbers.
    '''
    wanted_columns = slice (start, stop)

    def select_df_to_slice (df):
        '''Return every row of ``df`` restricted to the selected column positions.'''
        return df.iloc[:, wanted_columns]
    return select_df_to_slice


def get_total_annual_immigration_numbers (df):
    '''Return the annual total immigration numbers.

    Every column of ``df`` is summed; the result is a two-column dataframe with
    the original column labels under "Year" and the sums under "Annual_immigration".
    '''
    column_totals = df.sum (axis = 0)
    totals_table = column_totals.to_frame ().reset_index ()

    # After reset_index the former column labels sit under "index" and the sums under 0.
    readable_names = {"index": "Year", 0: "Annual_immigration"}

    return totals_table.rename (columns = readable_names)


def create_pipeline (*list_of_functions : list):
    '''Chain the given single-argument functions into one callable.

    The returned function passes its argument to the first function, feeds each
    result to the next one, and returns the output of the last.
    '''
    def pipeline (input : pd.DataFrame):
        '''Run ``input`` through every step in order and return the final result.'''
        current = input

        for step in list_of_functions:
            current = step (current)

        return current
    return pipeline


###   Shrink dataframe
def stack_df_with_country_and_annual_data (df):
    '''Convert the wide dataframe (one column per year) into a long dataframe with
    one column for the year and one column for the immigration numbers.

    Only the column at position 0 and the columns at positions 4 to 37 are kept;
    the kept data must include a "Country" column, which becomes the row key.
    '''
    # Position 0 plus positions 4..37 of the input dataframe.
    kept_positions = np.r_[0, 4: 38]

    by_country = df.iloc[:, kept_positions].set_index ("Country")
    long_format = by_country.stack ().to_frame ().reset_index ()

    # stack () leaves the former column labels under "level_1" and the values under 0.
    return long_format.rename (columns = {"level_1": "Year", 0: "Immigrants_numbers"})


def line_plot (df):
    '''Show a plotly line chart of "Immigrants_numbers" against "Year", one line per "Country".'''
    axis_labels = {"Immigrants_numbers": "Number of Immigrants"}

    figure = px.line (
        df,
        x = "Year",
        y = "Immigrants_numbers",
        color = "Country",
        labels = axis_labels,
    )

    # The legend is switched off in the layout below.
    figure.update_layout (
        title_text = "Change in immigration numbers by country",
        title_font_size = 30,
        showlegend = False,
    )
    figure.show ()


def scatter_plot (data):
    '''Show a scatter plot of "Annual_immigration" against "Year" for ``data``.'''
    plt.figure (figsize = (15, 6))

    axes = sns.scatterplot (data = data, x = "Year", y = "Annual_immigration")

    # Tilt the tick labels on the x axis.
    plt.xticks (rotation = 55)
    axes.set (xlabel = "Year", ylabel = "Total annual immigration")

    plt.show ()


def get_top_n_by_year (column : str = "Country", top_n : int = 10):
    '''Get the top N countries with immigrants to Canada for every year column.

    The default is to find the top 10 values per column. The columns at
    positions 4 to 37 of the dataframe are the ones that get ranked.

    Returns a function that takes a dataframe and returns one series holding
    the ``column`` entries of the top rows of each ranked column, concatenated.
    '''
    def apply_top_n_to_df (df):
        '''Collect the ``column`` values of the ``top_n`` largest rows for each ranked column.'''
        ranked_columns = df.columns[4: 38]

        leaders_per_year = [
            df.nlargest (top_n, year).loc[:, column]
            for year in ranked_columns
        ]

        return pd.concat (leaders_per_year)
    return apply_top_n_to_df



def get_occurance_in_top_10 (top_10_list):
    '''Count the number of times each country appears in the
    top 10 countries of origin for immigrants.'''
    appearances = top_10_list.value_counts ()
    return appearances


def top_10_df_conversion (data):
    '''Convert a pandas series of per-country counts (countries in the index)
    into a dataframe with the columns "Country" and "Occurance".

    The index and the value column are named explicitly instead of renaming
    whatever labels the series happens to carry: ``value_counts`` labels its
    result differently across pandas versions (pandas 2 names the values
    "count" and keeps the original name on the index), and a rename keyed on
    "index"/"Country" then produces the wrong column names.
    '''
    df = (
        data
        .rename_axis ("Country")            # the index holds the country names
        .reset_index (name = "Occurance")   # the values hold the counts
    )
    return df


def bar_chart (data):
    '''Plot frequency of countries in top 10 lists.

    ``data`` supplies the "Country" column for the x axis and the "Occurance"
    column for the bar heights.
    '''
    plt.figure (figsize = (15, 6))

    axes = sns.barplot (data = data, x = "Country", y = "Occurance")

    # Tilt the tick labels, then re-apply the country names right-aligned.
    plt.xticks (rotation = 55)
    axes.set_xticklabels (labels = data.Country, ha = "right")
    axes.set (xlabel = None, ylabel = "Frequency in top 10")

    plt.show ()


def create_storage_tuple (*parameters):
    '''Create a namedtuple class for returning a collection of labelled variables.

    Namedtuples are preferred over regular tuples for function returns
    because they allow for better organisation when multiple variables
    are returned. This function expedites the process of creating one:
    each positional argument becomes a field name of a class called 'Storage'.
    '''
    field_names = parameters
    storage_class = namedtuple ('Storage', field_names)
    return storage_class


def set_train_test_data (degree : int = 1, percentage : float = 0.75):
    '''Create training and test sets.

    Degree: for polynomial feature engineering. Indicates the degree of the polynomial
    transformation of the X variable. By default, degree = 1, which is the same as
    applying no transformation.

    Percentage: applies to the train/test split. Indicates the fraction of the data to
    include in the training set. By default, 75 % of the data is taken for the
    training set.

    Returns a function that takes a dataframe with "Year" and "Annual_immigration"
    columns and returns a namedtuple with the fields X_train, X_test, y_train, y_test.
    '''
    def create_train_test_data (df):
        '''Expand the year feature and split the rows, in their given order, into train and test.'''
        # Years become a single-column integer matrix, as fit_transform expects 2-D input.
        years = df.Year.astype (int).to_numpy ().reshape ((-1, 1))
        targets = df.Annual_immigration.to_numpy ()

        expander = PolynomialFeatures (degree = degree, include_bias = False)
        features = expander.fit_transform (years)

        # No shuffling: the first `cutoff` rows train the model, the rest test it.
        cutoff = math.ceil ((len (features) * percentage))

        Split = create_storage_tuple ("X_train", "X_test", "y_train", "y_test")

        return Split (
            features[: cutoff],
            features[cutoff:],
            targets[: cutoff],
            targets[cutoff:],
        )
    return create_train_test_data


def create_model ():
    '''Return a new, unfitted linear regression model.'''
    untrained_model = LinearRegression ()
    return untrained_model


def train_LR_model (data):
    '''Create a linear regression model and fit it on the training part of ``data``.

    ``data`` must expose X_train, y_train, X_test and y_test. The result is a
    namedtuple holding the fitted model together with the untouched test data.
    '''
    fitted = create_model ()
    fitted.fit (data.X_train, data.y_train)

    Trained = create_storage_tuple ("model", "X_test", "y_test")

    return Trained (fitted, data.X_test, data.y_test)


def evaluate_model (fitted_model_and_data):
    '''Using the test data and the trained model, return evaluation metrics:
    r2, mean squared error, root mean squared error, and the model's own score.

    ``fitted_model_and_data`` must expose model, X_test and y_test. The result
    is a namedtuple with the fields r2, mse, rmse and score.
    '''
    estimator = fitted_model_and_data.model
    features = fitted_model_and_data.X_test
    actual = fitted_model_and_data.y_test

    predicted = estimator.predict (features)

    squared_error = mean_squared_error (actual, predicted)

    # Insertion order of this dict fixes the field order of the returned namedtuple.
    metrics = {
        "r2": r2_score (actual, predicted),
        "mse": squared_error,
        "rmse": np.sqrt (squared_error),
        "score": estimator.score (features, actual),
    }

    return create_storage_tuple (*metrics) (**metrics)


## Show raw data

In [None]:
def show_raw_data ():
    '''Display the first five rows and the column labels of the raw immigration dataset.'''
    immigration = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    first_rows = immigration.head(5)
    display (first_rows)

    print ("\n")

    column_labels = immigration.columns
    display (column_labels)


show_raw_data ()


## Examine continent data

In [None]:
def unique_continents ():
    '''Print the continents present in the dataset and the countries filed under
    "Northern America" and "Latin America and the Caribbean".'''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    count_continents = find_unique_values (column = "Continent", count = True)
    list_continents = find_unique_values (column = "Continent", count = False)

    print (f"There are {count_continents (raw_data)} continent in the dataset.")
    print (f"The continents include: {list_continents (raw_data)}")

    print (".............................................................................................")

    select_northern_america = find_match_by_column (column = "Continent", in_column = ["Northern America"])
    select_latin_america = find_match_by_column (column = "Continent", in_column = ["Latin America and the Caribbean"])

    northern_countries = select_northern_america (raw_data).Country.to_list ()
    latin_countries = select_latin_america (raw_data).Country.to_list ()

    print (f"Northern America contains the countries: {northern_countries}")
    print (f"Latin America and the Caribbean contains the countries: {latin_countries}")

    print (".............................................................................................")


unique_continents ()

The dataset contains six continents. Asia, Europe, Africa, and Oceania match the commonly known continents; however, "Northern America" and "Latin America and the Caribbean" are not commonly used continent names.

The category "Northern America" contains only two countries: Canada and the U.S.A. All countries south of the U.S., such as Mexico, are categorised as "Latin America and the Caribbean."

## Examine country data

In [None]:
def explore_countries ():
    '''Display the dataset rows whose country name refers to China, to Taiwan,
    or to a handful of protectorate-style region names.'''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    def show_rows_matching (pattern):
        '''Display the rows of the raw data whose "Country" contains ``pattern``.'''
        matching_rows = create_pipeline (
            find_partial_match_by_column (column = "Country", in_column = pattern),
            display
        )
        matching_rows (raw_data)

    print ("\n")
    print ("Examination of references to different regions of China.")
    print ("\n")

    show_rows_matching ("China")

    print ("\n")
    print ("Is Taiwan referenced separately from China?")
    print ("\n")

    show_rows_matching ("Taiwan")

    print ("..............................................................................")

    print ("\n")
    print ("Examining protectorate regions.")
    print ("\n")

    # Joined with "|" so that a country name containing any one of these words matches.
    name_fragments = ["Puerto", "Rico", "Virgin", "Falklands", "Islands"]

    show_rows_matching ("|".join (name_fragments))


explore_countries ()


## Examine immigration by country

In [None]:
def immigration_over_time ():
    '''Load the raw data, reshape it to one row per country and year, and show the line chart.'''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    plot_steps = (
        stack_df_with_country_and_annual_data,
        line_plot,
    )

    create_pipeline (*plot_steps) (raw_data)


immigration_over_time ()

## Examine total immigration over time

In [None]:
def annual_immigration ():
    '''Load the raw data, total the immigration numbers per year, and show them as a scatter plot.'''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    # Column positions 4 to 37 are the ones passed on to the yearly totals.
    keep_year_columns = get_yearly_data (start = 4, stop = 38)

    plot_steps = (
        keep_year_columns,
        get_total_annual_immigration_numbers,
        scatter_plot,
    )

    create_pipeline (*plot_steps) (raw_data)


annual_immigration ()


## Examine top N countries for immigration to Canada

In [None]:
def determine_frequency_in_top_n ():
    '''Load the raw data, count how often each country ranks in the yearly top 10,
    and show the counts as a bar chart.'''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    pick_yearly_top_10 = get_top_n_by_year (column = "Country", top_n = 10)

    chart_steps = (
        pick_yearly_top_10,
        get_occurance_in_top_10,
        top_10_df_conversion,
        bar_chart,
    )

    create_pipeline (*chart_steps) (raw_data)


determine_frequency_in_top_n ()


## Perform linear regression on annual immigration data

In [None]:
def linear_regression ():
    '''Fit a linear regression on degree-9 polynomial features of the year against
    the annual immigration totals and return the evaluation metrics.

    The first 75 % of the rows are used for training and the rest for testing.
    '''
    raw_data = import_data ("/kaggle/input/immigration-to-canada/canadian_immegration_data.csv")

    modelling_steps = (
        get_yearly_data (start = 4, stop = 38),
        get_total_annual_immigration_numbers,
        set_train_test_data (degree = 9, percentage = 0.75),
        train_LR_model,
        evaluate_model,
    )

    run_all_steps = create_pipeline (*modelling_steps)

    return run_all_steps (raw_data)


linear_regression ()
