# CA average income
> Explain what this notebook does specifically and link to the data source. 

---

#### Import Python tools and Jupyter config

In [97]:
import json
import pandas as pd
import jupyter_black

In [3]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

---

## Subhead, i.e. "Fetch"

#### Read data from XyXy source (live data where possible over local downloads)

In [125]:
excel_file = "https://www.irs.gov/pub/irs-soi/18zp05ca.xlsx"

In [159]:
src = (
    (
        pd.read_excel(excel_file, skiprows=5)[["Unnamed: 0", "Unnamed: 1", -18, -19]]
        .rename(
            columns={
                "Unnamed: 0": "zip",
                "Unnamed: 1": "category",
                -18: "number_of_returns",
                -19: "total_income_amount",
            }
        )
        .dropna(thresh=2)
    )
    .query("category.isna()")
    .drop(["category"], axis=1)
)

In [160]:
src.head(20)

Unnamed: 0,zip,number_of_returns,total_income_amount
8,90001,22030,698009
16,90002,20590,645762
24,90003,27900,830588
32,90004,27950,2220715
40,90005,16480,959985
48,90006,23220,787550
56,90007,12680,426480
64,90008,15050,880903
72,90010,2740,511606
80,90011,39030,1167324


In [161]:
src[["total_income_amount", "number_of_returns"]] = src[
    ["total_income_amount", "number_of_returns"]
].astype(int)

In [162]:
src["avg_income"] = (src.total_income_amount / src.number_of_returns).round() * 1000

In [163]:
df = src[["zip", "avg_income"]].copy()

In [164]:
df.head()

Unnamed: 0,zip,avg_income
8,90001,32000.0
16,90002,31000.0
24,90003,30000.0
32,90004,79000.0
40,90005,58000.0


In [None]:
df["number_of_returns"] = (
    df["number_of_returns"].str.replace(",", "").astype(int)
) * 1000

In [None]:
df["total_income_amount"] = (
    df["total_income_amount"].str.replace(",", "").astype(int)
) * 1000

In [81]:
df["avg_income"] = (df.total_income_amount / df.number_of_returns).round()

In [82]:
df

Unnamed: 0,zip_code,number_of_returns,total_income_amount,avg_income
8,90001,22030000,698009000,32.0
16,90002,20590000,645762000,31.0
24,90003,27900000,830588000,30.0
32,90004,27950000,2220715000,79.0
40,90005,16480000,959985000,58.0
...,...,...,...,...
11784,96137,1200000,75596000,63.0
11792,96146,720000,90393000,126.0
11800,96150,8900000,536581000,60.0
11808,96161,6180000,777668000,126.0


In [None]:
# # Load the spreadsheet data into a DataFrame
# file_path = 'Total_Income_California_IRS.csv'  # Update with your actual fil2e path
# df = pd.read_csv(file_path)
# df2 = df.iloc[::8, :].copy()
# df2['total_income_amount'] = df2['total_income_amount'].str.replace(',', '')
# df2['number_of_returns']= df2['number_of_returns'].str.replace(',', '')
# df2['total_income_amount'] = pd.to_numeric(df2['total_income_amount'], errors='coerce')
# df2['number_of_returns'] = pd.to_numeric(df2['number_of_returns'], errors='coerce')
# df2['Average'] = df2.total_income_amount/df2.number_of_returns
# print(df2)
# df2.to_csv("Average_Income_By_ZIP_California.csv", index=False)

---

## Process

#### Clean dates, standardize categories, etc. 

---

## Aggregate

#### Groupby state, etc.

In [167]:
import requests
import pandas as pd

dates = ['2024-01-01', '2024-01-02', '2024-01-03'] 

store_dfs = []

for date in dates:
    params = {
        "validdate": yesterday,
        "var": "maxt", # max temperature
        "thresh": "climper",
        "period": "1_DAY",
        "map_display": "dfn",  # departure from norm. can also use "all"
        "showthrdx": "true",  # false
        "showcoop": "true",  # false
        "domain": "conus",  # us shows alaska, pr and hawaii
    }
    
    response = requests.get(
        "https://sercc.oasis.unc.edu/climpermap_json.php",
        params=params,
        headers=headers,
    )
    
    json_data = response.json()["data"]
    
    src = (
        pd.DataFrame.from_dict(json_data, orient="index")
        .assign(date=date)[
            [
                "id",
                "city",
                "state",
                "value",
                "rank",
                "ranktext",
                "dfnlabel",
                "dfn",
                "lat",
                "lon",
                "date",
            ]
        ]
        .rename(
            columns={
                "id": "station_id",
                "value": "himax",
                "dfn": "diff_normal",
                "dfnlabel": "diff_normal_period",
            }
        )
    ).reset_index(drop=True)

    store_dfs.append(src)

all_dates_df = pd.concat(store_dfs).reset_index(drop=True)

NameError: name 'yesterday' is not defined

---

## Charts

#### Save the chart

In [None]:
chart.save("visuals/chart.png")
Image(filename="visuals/chart.png")

#### Make sure the chart is visible on Github

In [None]:
Image(filename="visuals/chart.png")

---

## Metadata

#### Data provenance, column descriptions, etc.

In [None]:
import json
import pandas as pd
import jupyter_black

In [114]:
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "content-type": "application/json; charset=utf-8",
    "origin": "https://www.4coffshore.com",
    "referer": "https://www.4coffshore.com/transmission/interconnectors.aspx",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

response = requests.post(
    "https://www.4coffshore.com/transmission/interconnectors.aspx/getallinterconnectors",
    headers=headers,
)

In [116]:
connector_list = json.loads(response.json()["d"])
connector_df = pd.DataFrame(connector_list)

In [117]:
connector_df.to_json(
    "data/processed/connectors_4coffshore.json", indent=4, orient="records"
)
connector_df.to_csv("data/processed/connectors_4coffshore.csv", index=False)

In [168]:
response.json()



---

## Exports

#### XyXy subset in CSV format to `processed`

#### JSON, GeoJSON, etc., to `processed`