In [None]:
# !pip3 install matplotlib
# !pip3 install tabula-py
# !pip3 install PyPDF2
# !pip3 install pdfminer.six
# !pip3 install PyMuPDF
# !pip3 install camelot-py
# !pip3 install --upgrade pip
# !pip3 install geopandas
# !pip3 install plotly
# !pip3 install pandas
# !pip3 install pdfplumber
# !pip3 install scipy
# !pip3 install --upgrade pandas

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpt
import os
import PyPDF2
import tabula
import tabulate
import pdfminer
import geopandas as gpd
import fiona
from glob import glob
import plotly.express as px
from shapely.geometry import Point
import pdfplumber
import scipy


In [None]:

def save_to_csv(df, output_folder, file_name):
    """Write a DataFrame to ``output_folder/file_name`` as CSV and print the path.

    The destination folder, including any missing parent folders, is created
    when it does not already exist. The row index is not written to the file.

    Args:
        df (pandas.DataFrame): Data to be written out.
        output_folder (str): Directory the CSV file is written into.
        file_name (str): Name of the CSV file inside ``output_folder``.
    """
    # exist_ok=True replaces a separate exists() check, so a folder that is
    # created between the check and the call can no longer raise.
    os.makedirs(output_folder, exist_ok=True)

    output_file_path = os.path.join(output_folder, file_name)

    df.to_csv(output_file_path, index=False)
    print(f"DataFrame saved to: {output_file_path}")


In [None]:
def convert_data(file_path):
    """Read a delimited text file and re-save it as CSV in the analysis folder.

    The file is parsed with ``pd.read_csv`` and written through ``save_to_csv``
    to ``<cwd>/Data Sets/Analysis Data/<name>.csv``, where ``<name>`` is the
    part of the input file name before its first ``.``. When the table has a
    column whose name ends in ``lat`` and another whose name ends in ``lon``,
    those two columns are replaced by a single ``geometry`` column of points
    before saving.

    Args:
        file_path (str): Path of the file to open and convert.
    """
    df = pd.read_csv(file_path)
    output_folder_path = os.getcwd()+"/Data Sets/Analysis Data/"
    # basename() also understands the platform's own separator, so Windows
    # style paths resolve to the bare file name as well.
    output_file_name = os.path.basename(file_path).split('.')[0] + ".csv"

    # Find the coordinate columns by suffix. Testing the actual column names
    # (rather than a substring of the Index repr) avoids false positives such
    # as "platform_code" or "route_long_name". If several columns share a
    # suffix, the last one wins.
    lat = None
    lon = None
    for col in df.columns:
        col_name = str(col)
        if col_name.endswith('lat'):
            lat = col
        elif col_name.endswith('lon'):
            lon = col

    if lat is not None and lon is not None:
        # points_from_xy expects x (longitude) first and y (latitude) second.
        geometry = gpd.points_from_xy(df[lon], df[lat])
        final_df = gpd.GeoDataFrame(df, geometry=geometry).drop(columns=[lat, lon])
    else:
        final_df = df

    # save_to_csv creates the output folder when it is missing.
    save_to_csv(final_df, output_folder_path, output_file_name)
    print("{} saved in {}".format(output_file_name, output_folder_path))

In [None]:
# Convert the GTFS agency table. To pick a different file interactively, use
# the input() prompt below instead of the fixed path.
# data_path = input("include relative path to the dataset: (should work for windows/mac: )")
data_path = f"{os.getcwd()}/Data Sets/CT_GTFS/agency.txt"
convert_data(data_path)


In [None]:
# community_path = os.getcwd() + "/Community Profiles/"
# ward_path = os.getcwd() + "/ward_profiles/"

# if os.path.isdir(ward_path):
#     try:
#         # List all files in the specified folder
#         files = os.listdir(ward_path)

#         # Extract file names
#         file_names = [file for file in files if os.path.isfile(os.path.join(ward_path, file)) and "ward" in file]

#         # print("File names in the folder:")
#         # for file_name in file_names:
#         #     print(file_name)

#     except FileNotFoundError:
#         print(f"The specified folder '{ward_path}' does not exist.")
#     except PermissionError:
#         print(f"Permission denied to access '{ward_path}'.")

# # data = camelot.read_pdf(ward_path+file_names[0])

In [None]:
# save_to_csv(combined_df, os.getcwd()+"/Data Sets/Analysis Data", "Ward1.csv")

In [None]:
# Location of the compiled community profiles workbook, relative to the working directory.
community_data_path = f"{os.getcwd()}/Data Sets/Analysis Data/Community Profiles Compiled.xlsx"

In [None]:
# Load the compiled community profiles workbook into a DataFrame.
comm_df = pd.read_excel(community_data_path)

In [None]:
# Preview the first rows to check the column names the index cells rely on.
comm_df.head()
# Leave "Community Name" as a regular column (no set_index): the index cells
# further down read it with comm_df['Community Name'].

In [None]:

# First demand index: each indicator expressed as a percentage, computed
# column-wise for all communities at once instead of row by row.
population = comm_df['Population in private households']
employed_labour_force = comm_df["Employed labour force aged 15 years and over in private households"]

# where() keeps non-zero values and substitutes NaN for zeros, so a community
# with no residents or no employed labour force gets NaN indices instead of a
# division-by-zero failure or an infinite value.
population_denominator = population.where(population != 0)
labour_force_denominator = employed_labour_force.where(employed_labour_force != 0)

# Public transit count scaled by population / employed labour force.
transit_scaled = (population / labour_force_denominator) * comm_df['Public transit']

first_index_df = pd.DataFrame({
    "Community Name": comm_df['Community Name'],
    "Low Income Index": (comm_df['Population in private households to whom low income concepts are applicable (Number in low income)'] / population_denominator) * 100,
    "Seniors Index": (comm_df['65 to 84 years'] / population_denominator) * 100,
    "Public Transit Index": (transit_scaled / population_denominator) * 100,
    # NOTE(review): multiplying by 100 assumes the source column holds a 0-1
    # fraction rather than a percentage — TODO confirm against the workbook.
    "Rent Index": comm_df["Per cent households with income spending 30% or more total income on shelter (Renter)"] * 100,
}).reset_index(drop=True)  # fresh 0..n-1 index, independent of comm_df's index



In [None]:
# Summary statistics of the four percentage indices, as a quick range check.
first_index_df.describe()

In [None]:
# Folder that receives the exported index tables.
output_folder_path = f"{os.getcwd()}/Data Sets/Analysis Data"

In [80]:
# Z-score method, done manually: mean and standard deviation of every
# indicator across all communities. Each indicator series is built once and
# then summarised.
pop_series = comm_df['Population in private households']
pop_mean = pop_series.mean()
pop_std = pop_series.std()

medinc_series = comm_df["Median household income of private households"]
medinc_mean = medinc_series.mean()
medinc_std = medinc_series.std()

low_series = comm_df['Population in private households to whom low income concepts are applicable (Number in low income)']
low_mean = low_series.mean()
low_std = low_series.std()

# Public transit count weighted by Employed / employed labour force.
trans_series = (
    comm_df['Employed']
    / comm_df["Employed labour force aged 15 years and over in private households"]
) * comm_df['Public transit']
trans_mean = trans_series.mean()
trans_std = trans_series.std()

# Renter shelter-cost share multiplied by the number of renter households.
rent_series = (
    comm_df['Per cent households with income spending 30% or more total income on shelter (Renter)']
    * comm_df['Private households with total income greater than zero (Renter)']
)
rent_mean = rent_series.mean()
rent_std = rent_series.std()

seniors_series = comm_df['65 to 84 years']
seniors_mean = seniors_series.mean()
seniors_std = seniors_series.std()

In [87]:
# Composite demand score per community: the sum of the indicator z-scores, with
# median household income entering with a negative sign. Every indicator is
# standardised for all communities at once, using the means and standard
# deviations computed in the statistics cell above.

# Public transit count weighted by Employed / employed labour force.
# NOTE(review): this term weights by 'Employed', whereas the first demand index
# weights by population in private households — confirm the difference is intended.
trans_values = (
    comm_df['Employed']
    / comm_df["Employed labour force aged 15 years and over in private households"]
) * comm_df['Public transit']

# Renter shelter-cost share multiplied by the number of renter households.
rent_values = (
    comm_df['Per cent households with income spending 30% or more total income on shelter (Renter)']
    * comm_df['Private households with total income greater than zero (Renter)']
)

z_med_score = (comm_df['Median household income of private households'] - medinc_mean) / medinc_std
z_trans_score = (trans_values - trans_mean) / trans_std
z_rent_score = (rent_values - rent_mean) / rent_std
z_low_score = (comm_df['Population in private households to whom low income concepts are applicable (Number in low income)'] - low_mean) / low_std
z_seniors_score = (comm_df['65 to 84 years'] - seniors_mean) / seniors_std

z_result = z_trans_score + z_rent_score + z_low_score - z_med_score + z_seniors_score
# Variant without median income:
# z_result = z_trans_score + z_rent_score + z_low_score + z_seniors_score

Z_index_df = pd.DataFrame({
    "Community Name": comm_df['Community Name'],
    "Z Score": z_result,
}).reset_index(drop=True)  # fresh 0..n-1 index, independent of comm_df's index

In [82]:
# Preview the composite z-scores for the first few communities.
Z_index_df.head()

Unnamed: 0,Community Name,Z Score
0,Abbeydale,-0.32606
1,Acadia,4.381997
2,Albert Park/Radisson Heights,2.750797
3,Altadore,-0.860095
4,Applewood Park,0.651714


In [88]:
# Largest and smallest composite score across all communities.
composite_scores = Z_index_df['Z Score']
z_max, z_min = composite_scores.max(), composite_scores.min()

In [89]:
# Show the score range, one value per line (maximum first).
print(z_max, z_min, sep="\n")

28.006507636218434
-10.348038955884837


In [90]:
# Min-max rescaling of the composite score: the community at z_min maps to 0
# and the community at z_max maps to 1. Computed for the whole column at once.
# The rescaled column keeps the "Z Score" label so the exported CSV header is
# unchanged.
z_ss_index_df = pd.DataFrame({
    "Community Name": Z_index_df["Community Name"],
    "Z Score": (Z_index_df['Z Score'] - z_min) / (z_max - z_min),
}).reset_index(drop=True)  # fresh 0..n-1 index, independent of Z_index_df's index

In [91]:
# Preview the rescaled scores for the first few communities.
z_ss_index_df.head()

Unnamed: 0,Community Name,Z Score
0,Abbeydale,0.272777
1,Acadia,0.403855
2,Albert Park/Radisson Heights,0.364817
3,Altadore,0.223933
4,Applewood Park,0.298807


In [None]:
# Export the percentage-based demand index.
save_to_csv(
    df=first_index_df,
    output_folder=output_folder_path,
    file_name="First Demand Index.csv",
)

In [92]:
# Export the rescaled composite score (the variant that includes median income).
save_to_csv(
    df=z_ss_index_df,
    output_folder=output_folder_path,
    file_name='Z Standardized Demand Index (with Median Income).csv',
)

DataFrame saved to: /Users/sulemanbasit/Project/Calgary-Transit-Economic-Gap-Analysis/Data Sets/Analysis Data/Z Standardized Demand Index (with Median Income).csv
