# Project 1

I will be working with data from NYC Open Data to visualize data on the death rates of female New Yorkers in 2021.

More information on this data can be found at the link below:

https://data.cityofnewyork.us/Health/New-York-City-Leading-Causes-of-Death/jb7j-dtam/data_preview

### Import, Clean, and Read Data

In [None]:
import pandas as pd

# load the full leading-causes-of-death table
df_NYC = pd.read_csv("NYC_Leading_Causes_of_Death.csv")

# keep only the rows recorded for women in 2021
df_women_2021_deaths = df_NYC.loc[
    (df_NYC["Sex"] == "Female") & (df_NYC["Year"] == 2021)
]

# last expression in the cell, so the notebook displays the filtered table
df_women_2021_deaths

Unnamed: 0,Year,Leading Cause,Sex,Race Ethnicity,Deaths,Death Rate,Age Adjusted Death Rate
1,2021,Alzheimer's Disease (G30),Female,Not Stated/Unknown,7,,
2,2021,"Diseases of Heart (I00-I09, I11, I13, I20-I51)",Female,Not Stated/Unknown,113,,
11,2021,Cerebrovascular Disease (Stroke: I60-I69),Female,Not Stated/Unknown,17,,
14,2021,Malignant Neoplasms (Cancer: C00-C97),Female,Non-Hispanic Black,1621,161.9,118.6
17,2021,All Other Causes,Female,Other Race/ Ethnicity,71,,
...,...,...,...,...,...,...,...
128,2021,Mental and Behavioral Disorders due to Acciden...,Female,Non-Hispanic White,189,13.8,13.1
129,2021,All Other Causes,Female,Non-Hispanic White,2670,195.4,116.8
130,2021,"Diseases of Heart (I00-I09, I11, I13, I20-I51)",Female,Non-Hispanic Black,2356,235.4,162.7
134,2021,Covid-19,Female,Not Stated/Unknown,33,,


### Calculate Mean, Median, and Mode with Pandas

In [None]:
import pandas as pd

# make a copy of the filtered rows so the conversions below do not touch df_NYC
df_women_2021_deaths = df_NYC.loc[
    (df_NYC["Sex"] == "Female") & (df_NYC["Year"] == 2021)
].copy()

# convert 'Deaths' and 'Death Rate' to numeric; errors="coerce" turns empty or
# unparseable cells into NaN, which the statistics below skip
for column in ["Deaths", "Death Rate"]:
    df_women_2021_deaths[column] = pd.to_numeric(
        df_women_2021_deaths[column], errors="coerce"
    )

# calculate and print mean
mean_value = df_women_2021_deaths[["Deaths", "Death Rate"]].mean()
print("Mean:\n", mean_value)

# calculate and print median
median_value = df_women_2021_deaths[["Deaths", "Death Rate"]].median()
print("\nMedian:\n", median_value)

# calculate and print mode
# NOTE(review): .mode() can return several rows per column (ties, or every value
# when nothing repeats); .iloc[0] keeps only the first of them
mode_value = df_women_2021_deaths[["Deaths", "Death Rate"]].mode().iloc[0]
print("\nMode:\n", mode_value)

Mean:
 Deaths        443.637681
Death Rate     60.422727
dtype: float64

Median:
 Deaths        169.00
Death Rate     23.95
dtype: float64

Mode:
 Deaths        7.0
Death Rate    6.0
Name: 0, dtype: float64


### Calculate Mean, Median, and Mode with Python

In [None]:
# import csv
import csv

filename = "NYC_Leading_Causes_of_Death.csv"

# separate lists, one per variable
deaths = []
death_rates = []

# open the csv as utf-8 so special characters are decoded correctly
with open(filename, newline="", encoding="utf-8") as f:
    # each row is read as a dictionary keyed by the header names
    reader = csv.DictReader(f)
    for row in reader:
        # skip everything that is not a 2021 record for women
        if row["Sex"] != "Female" or row["Year"] != "2021":
            continue

        # ignore blank cells and drop thousands separators before converting
        raw_deaths = row["Deaths"]
        if raw_deaths:
            deaths.append(float(raw_deaths.replace(",", "")))

        raw_rate = row["Death Rate"]
        if raw_rate:
            death_rates.append(float(raw_rate.replace(",", "")))


# calculate mean
def compute_mean(data):
    """Return the arithmetic mean of the values in *data*, or None if it is empty.

    *data* is walked once, so any iterable of numbers works.
    """
    running_total = 0
    seen = 0
    # enumerate from 1 so `seen` ends up holding the number of items consumed
    for seen, value in enumerate(data, start=1):
        running_total += value

    if seen == 0:
        return None
    return running_total / seen


# run the hand-written mean on each variable, then report both results
mean_deaths, mean_rate = compute_mean(deaths), compute_mean(death_rates)

print(f"Mean Female NYC Deaths, 2021:{mean_deaths}")
print(f"Mean Female NYC Death Rates, 2021:{mean_rate}")


# calculate median
def compute_median(data):
    """Return the median of *data*, or None if it is empty.

    The input is sorted into a new list (the caller's sequence is left
    untouched). For an even number of values the two middle values are
    averaged.
    """
    size = len(data)
    if not size:
        return None

    ordered = sorted(data)
    half, is_odd = divmod(size, 2)
    if is_odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2


# run the hand-written median on each variable, then report both results
median_deaths, median_rate = compute_median(deaths), compute_median(death_rates)

print(f"Median Female NYC Deaths, 2021:{median_deaths}")
print(f"Median Female NYC Death Rates, 2021:{median_rate}")


# calculate mode; this one is a little unusual because when every data point is
# different (nothing repeats) there is no meaningful mode to report
def compute_mode(data):
    """Return the most frequent value(s) in *data*.

    Returns:
        None if *data* is empty or no value occurs more than once;
        the single most frequent value if there is exactly one;
        a list of the tied values (in order of first appearance) otherwise.
    """
    counts = {}
    for x in data:
        counts[x] = counts.get(x, 0) + 1

    # empty input: max() below would raise ValueError, so return None the same
    # way compute_mean and compute_median do for empty data
    if not counts:
        return None

    max_count = max(counts.values())

    # every value appeared exactly once, so nothing qualifies as a mode
    if max_count == 1:
        return None

    modes = [value for value, count in counts.items() if count == max_count]
    return modes if len(modes) > 1 else modes[0]


# run the hand-written mode on each variable; a result of None means no value
# repeats, and a list means several values tie for most frequent
mode_deaths = compute_mode(deaths)
mode_rate = compute_mode(death_rates)

# print each result once, in the same format as the mean and median lines
print(f"Mode Female NYC Deaths, 2021:{mode_deaths}")
print(f"Mode Female NYC Death Rates, 2021:{mode_rate}")

Mean Female NYC Deaths, 2021:443.6376811594203
Mean Female NYC Death Rates, 2021:60.42272727272728
Median Female NYC Deaths, 2021:169.0
Median Female NYC Death Rates, 2021:23.95
Mode Female NYC Deaths, 2021: [7.0, 15.0]
Mode Female NYC Death Rates, 2021: None
Mode Female NYC Deaths, 2021:[7.0, 15.0]
Mode Female NYC Death Rates, 2021:None


### Data Visualization

In [None]:
import pandas as pd

# read csv file
df_NYC = pd.read_csv("NYC_Leading_Causes_of_Death.csv")

# clean column names
df_NYC.columns = df_NYC.columns.str.strip()

# filter for women in 2021 and make a copy
df_women_2021 = df_NYC[(df_NYC["Sex"] == "Female") & (df_NYC["Year"] == 2021)].copy()

# convert Deaths to numeric; empty or unparseable cells become NaN
df_women_2021["Deaths"] = pd.to_numeric(df_women_2021["Deaths"], errors="coerce")

# The same cause appears on several rows (one per Race Ethnicity group), so
# ranking raw rows lists a cause more than once. Add the deaths up per cause
# first so each cause gets exactly one bar. NaN values are skipped by sum().
deaths_by_cause = df_women_2021.groupby("Leading Cause", as_index=False)["Deaths"].sum()

# select the top 10 causes by total deaths
top10 = deaths_by_cause.sort_values("Deaths", ascending=False).head(10)

# make a bar chart with asterisks
print("Top 10 Causes of Death for Women in NYC, 2021 (1 '*' = 500 deaths)\n")

for _, row in top10.iterrows():
    # one star per full 500 deaths (floor division drops the remainder)
    bar_length = int(row["Deaths"] // 500)
    bar = "*" * bar_length
    # cause name is cut to 40 characters and left-aligned in a 50-wide column
    print(f"{row['Leading Cause'][:40]:<50} | {bar} {int(row['Deaths'])}")

Top 10 Causes of Death for Women in NYC, 2021 (1 '*' = 500 deaths)

Diseases of Heart (I00-I09, I11, I13, I2           | ******* 3601
All Other Causes                                   | ***** 2670
Malignant Neoplasms (Cancer: C00-C97)              | **** 2427
Diseases of Heart (I00-I09, I11, I13, I2           | **** 2356
All Other Causes                                   | *** 1993
Malignant Neoplasms (Cancer: C00-C97)              | *** 1621
All Other Causes                                   | *** 1589
Diseases of Heart (I00-I09, I11, I13, I2           | ** 1409
Malignant Neoplasms (Cancer: C00-C97)              | ** 1157
Covid-19                                           | ** 1152
