In [None]:
#CELL 1
#This cell imports the census data into a pandas dataframe called "data"

import pandas as pd
import zipfile
import numpy as np

#importing the data
import_zip = "data/98-401-X2021006_Ontario_eng_CSV.zip" #input zip file name here
import_csv = "98-401-X2021006_English_CSV_data_Ontario.csv" #input csv file name here

#the csv is read in chunks of 1000 rows; with "chunksize" set, pd.read_csv returns an iterator of dataframes
#(NOT a dataframe), so the chunks are concatenated into the single dataframe "data" used by the cells below
#the "with" statement closes the zip archive and the csv member once all chunks have been read
with zipfile.ZipFile(import_zip) as zf, zf.open(import_csv) as csv_file:
    data = pd.concat(
        pd.read_csv(csv_file, dtype = str, encoding='latin1', chunksize=1000),
        ignore_index=True, #gives "data" one continuous 0..n-1 row index across all chunks
    )

#optionally, uncomment the line below to see the dataframe 
#print(data) 

In [None]:
#CELL 2
#This cell creates a new dataframe "df" which only contains data about user selected ALT_GEO_CODE's and CHARACTERISTIC_NAME/ID's.
#For the purpose of the linguistic diversity project, the selected CHARACTERISTIC_NAME/ID's include ALL languages (either mother tongues, most spoken languages, etc)

#input list of desired CHARACTERISTIC_ID's (can use the loadtxt function below, or any other function)
characteristic_ids = []
#characteristic_ids = np.loadtxt("{}.txt".format(""), unpack=True)

#input list of desired ALT_GEO_CODE's (can use the loadtxt function below, or any other function)
alt_ids = []
#alt_ids= np.loadtxt("{}.txt".format(""), unpack=True) 

#copy only the columns needed below and convert them to numeric data types
#a new dataframe is built (instead of writing into "data" through an alias) so that "data" keeps its
#original string columns and this cell gives the same result every time it is re-run
df = data[["ALT_GEO_CODE","CHARACTERISTIC_ID","CHARACTERISTIC_NAME","C1_COUNT_TOTAL"]].astype(
    {"C1_COUNT_TOTAL": float, "CHARACTERISTIC_ID": int, "ALT_GEO_CODE": int}
)

#limit df to containing only desired CHARACTERISTIC_ID's and desired ALT_GEO_CODE's
wanted_rows = df['CHARACTERISTIC_ID'].isin(characteristic_ids) & df['ALT_GEO_CODE'].isin(alt_ids)
df = df[wanted_rows]

#limit df to containing only the ALT_GEO_CODE, CHARACTERISTIC_NAME AND C1_COUNT_TOTAL columns
df = df[["ALT_GEO_CODE","CHARACTERISTIC_NAME","C1_COUNT_TOTAL"]]

#reformat df, using the ALT_GEO_CODE as the index and the CHARACTERISTIC_NAME's as the columns
#note: pivot raises a ValueError if the same CHARACTERISTIC_NAME occurs more than once for one ALT_GEO_CODE
#(e.g. if the selected CHARACTERISTIC_ID's share a name), so the selected ID's must have distinct names
df = pd.pivot(df, index='ALT_GEO_CODE', columns='CHARACTERISTIC_NAME', values='C1_COUNT_TOTAL')

#optionally, uncomment the lines below to print "df" and save it as a csv file 
#print(df)
#df.to_csv("") #input desired csv file name


In [None]:
#CELL 3
#This cell creates a list of all languages to be kept and deleted, assuming only languages which have at least 10 speakers in at least 10 areas are desired to be kept

min_speakers=10 #minimum number of speakers a language needs in an area for that area to be counted
min_areas=10 #minimum number of counted areas a language needs in order to be kept

#for every language (column of df), counts the areas (rows of df) in which it has at least "min_speakers" speakers
#missing values (NaN) compare as False, so they are never counted
areas_per_language = (df >= min_speakers).sum(axis=0)
has_enough_areas = (areas_per_language >= min_areas).to_numpy()

#languages counted in fewer than "min_areas" areas go to the "delete" list (names left exactly as they appear in df)
language_delete_list = list(df.columns[~has_enough_areas])
#print("deleted", language_delete_list) #optionally, uncomment to see languages being deleted

#all other languages go to the "keep" list (with leading/trailing whitespace removed from their names)
kept_columns = df.columns[has_enough_areas]
language_keep_list = [name.strip() for name in kept_columns]

#rename df columns for kept languages to remove leading/trailing whitespace
df = df.rename(columns={name: name.strip() for name in kept_columns})


In [None]:
#CELL 4
#Assuming the "df" created in cell 2 contains ALL languages as its characteristics (either mother tongues, most spoken languages, etc),
#this cell creates a dataframe "df2" which limits the characteristics to a user-selected list of a desired few languages 
#and adds an "other" column to account for all languages NOT included in the user's selected list
#Note: user can first use the code in CELL 5 below to find list of languages with highest number of speakers

#for every ALT_GEO_CODE (row of df), sums the number of people speaking the languages which are NOT part of
#"language_keep_list"; missing values (NaN) are skipped, so a row without any such speakers gets a total of 0
#this must run BEFORE the columns are dropped below, otherwise there is nothing left to sum
other_columns = [name for name in df.columns if name not in language_keep_list] #columns to be placed in the "other" category
other_list_total = df[other_columns].sum(axis=1).tolist()

#drops columns corresponding to languages with not enough speakers (raises a KeyError if they were already dropped,
#i.e. if this cell is re-run without first re-running cells 2 and 3)
df = df.drop(columns=language_delete_list)
    
#creates a new dataframe "df2" which contains as columns only the languages in "language_keep_list" + an other category
#.copy() makes df2 independent of df, so inserting the new column below cannot affect df
df2 = df[language_keep_list].copy()
df2.insert(len(language_keep_list), 'Other', other_list_total) 
 
#optionally, uncomment the lines below to print "df2" and save it to a csv file
#print(df2)
#df2.to_csv("") #input desired csv file name  
    

In [None]:
#CELL 5
#This cell creates a dataframe "language_total" containing a count of the total number of people speaking the different languages in "df" (created in cell 2) or in "df2" (created in cell 4).
#Note that "df" (created cell 2) should only be used here if it contains languages (either mother tongues, most spoken languages, etc) and NOT other data as its characteristics 

#input df or df2 (in this one place only), depending on whether interested in all languages present in df, or in the user-selected languages + other category present in df2
language_source = df

#sums each column (i.e. language) of the selected dataframe; missing values (NaN) are skipped by the sum
column_totals = language_source.sum(axis=0)
no_per_language = column_totals.to_numpy(dtype=float) #total number of speakers, in column order
language_list = list(column_totals.index) #language names, in the same order as "no_per_language"

#creates a new dataframe "language_total", containing the total number of people speaking each language in df or df2,
#sorted from the most spoken to the least spoken language
language_total = pd.DataFrame({"Total": no_per_language, "Language": language_list})
language_total = language_total.sort_values(by=['Total'], ascending=False)

#optionally, uncomment the lines below to print the "language_total" dataframe and save it to a csv file
#print(language_total)
#language_total.to_csv("") #input desired csv file name


#optionally, uncomment the lines below to display a bar plot showing the number of people speaking each language in df or df2
#import matplotlib.pyplot as plt
#plt.figure(figsize=(20, 15))
#plt.bar(language_total["Language"][:], language_total["Total"][:])
#plt.xticks(rotation=90, fontsize=10)
#plt.xlabel("Language", fontsize=20)
#plt.ylabel("No. of People", fontsize=20)
#plt.title("", fontsize=30) #input desired plot title

In [None]:
#CELL 6
#Assuming the "df" created in cell 2 contains ALL languages as its characteristics (either mother tongues, most spoken languages, etc),
#this cell creates a dataframe "no_languages" containing the number of languages spoken per ALT_GEO_CODE region

#for every region (row of df), counts how many languages (columns of df) have a number of speakers which is
#neither missing (NaN) nor 0; assumes df contains ALL languages as columns!
#the explicit notna() is needed because NaN != 0 evaluates to True
is_spoken = df.notna() & (df != 0)
num_per_region = is_spoken.sum(axis=1).astype(int)

#creates new dataframe "no_languages", containing the number of languages spoken in each region
no_languages = num_per_region.to_frame(name="No_Languages")
no_languages = no_languages.rename_axis("ALT_GEO_CODE") #names the index without touching the index object of df

#optionally, uncomment the lines below to print the "no_languages" dataframe and save it to a csv file
#print(no_languages)
#no_languages.to_csv("") #input desired csv file name

In [None]:
#CELL 7
#This cell defines the function used to create a user-selected number of randomly placed points in the polygon regions (corresponding to the ALT_GEO_CODEs) for each desired language

#imports libraries
from shapely.geometry import Point
import random
import geopandas

#defines function which creates a user-selected "number" of random points inside a given polygon region
def gen_dot(polygon, number):
    """Return `number` random points lying inside `polygon`.

    Candidate points are drawn uniformly from the polygon's bounding box and
    kept only if the polygon contains them (rejection sampling).

    Parameters
    ----------
    polygon : geometry object exposing `bounds` as (min_x, min_y, max_x, max_y),
        `contains(point)` and `is_empty` (presumably a shapely geometry, given
        the `Point` import in this cell).
    number : how many points to generate; nothing is generated if it is not
        greater than 0.

    Returns
    -------
    list of [x, y] coordinate pairs, one per generated point.

    Raises
    ------
    ValueError
        If points are requested for an empty geometry, which can never
        contain a point (the loop below would otherwise never end).
    """
    points = []
    if not number > 0:
        return points
    if polygon.is_empty:
        raise ValueError("cannot generate random points inside an empty geometry")

    #the bounding box does not change between draws, so it is read only once
    min_x, min_y, max_x, max_y = polygon.bounds

    #NOTE: a geometry with no area (e.g. a line) never contains a drawn point, so this loop would not end for it
    while len(points) < number:
        pnt = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
        if polygon.contains(pnt):
            points.append([pnt.x, pnt.y])
    return points

In [None]:
#CELL 8
#This cell creates randomly placed points in the polygon regions (corresponding to the ALT_GEO_CODEs) for each language which has at least 10 speakers

#reads-in shapefile with desired polygons in which to generate the random points
shapefile = geopandas.read_file("") #input shapefile name here

#specifies the coordinate reference system of the shapefile
shapefile_crs = None #input the CRS of the shapefile (e.g. "EPSG:4326"); leave as None to keep the CRS read from the shapefile
if shapefile_crs is not None:
    #declares the CRS without transforming any coordinates, replacing a CRS read from the file if there is one
    shapefile = shapefile.set_crs(shapefile_crs, allow_override=True)

#input df or df2 (in this one place only), depending on whether interested in all languages present in df, or in the user-selected languages + other category present in df2
points_source = df2
#input desired threshold number of speakers for the languages at or above which a point will be generated
point_threshold = 10

#maps each DAUID of the shapefile to its polygon, keeping the first polygon if a DAUID occurs more than once
#uses the shapefile's geometry column; modify the "DAUID" column name according to shapefile
polygon_by_dauid = {}
for dauid, polygon in zip(shapefile["DAUID"], shapefile.geometry):
    polygon_by_dauid.setdefault(dauid, polygon)

#initializes empty lists
dauid_list=[]
x_list=[]
y_list=[]
language_list=[]
speaker_no_list=[]

#loops through all polygons and all languages; creates 1 randomly placed point in each polygon for each language which has at least "point_threshold" speakers in that given polygon
#saves the ALT_GEO_CODEs and coordinates of the randomly generated points, as well as their correponding languages and number of speakers, in separate lists
#missing values (NaN) compare as False against the threshold, so no point is generated for them
#optionally, call random.seed() with a fixed number before this loop to get the same points on every run
for alt_code, speakers_per_language in points_source.iterrows():
    for col, speaker_no in speakers_per_language.items():
        if speaker_no >= point_threshold:
            #the DAUIDs of the shapefile are compared as strings; a KeyError here means the ALT_GEO_CODE has no polygon in the shapefile
            point = gen_dot(polygon_by_dauid[str(alt_code)], 1)[0]
            dauid_list.append(alt_code)
            x_list.append(point[0])
            y_list.append(point[1])
            language_list.append(col)
            speaker_no_list.append(speaker_no)

#creates a dataframe of the ALT_GEO_CODES, the coordinates of the random points placed within the polygons corresponding to them, the languages corresponding to the random points, and the latters' respective number of speakers
rand_points = pd.DataFrame(
    {"DAUID": dauid_list, "x": x_list, "y": y_list, "Language": language_list, "Speaker_No": speaker_no_list}
)


#transforms the dataframe into a geodataframe using geopandas
#the points are generated in the coordinates of the shapefile, so they are given the CRS of the shapefile
gdf = geopandas.GeoDataFrame(
    rand_points,
    geometry=geopandas.points_from_xy(rand_points.x, rand_points.y),
    crs=shapefile.crs,
)

#optionally,  uncomment the lines below to print "gdf" and save it as a geojson file
#print(gdf)
#gdf.to_file("", driver='GeoJSON')