In [None]:
# Config
import os

# Basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Monitoring
from tqdm.notebook import tqdm

# IO
from os.path import join, exists, basename, dirname, splitext, expanduser
from glob import glob

# Parallel processing
from joblib import Parallel, delayed

import pdfplumber
import geopandas as gpd
from astra.plot.utils import latexify

from matplotlib.colors import LinearSegmentedColormap

# Custom colormap: "white" is placed first, followed by 256 evenly spaced samples of "Reds".
_reds_samples = plt.get_cmap("Reds")(np.linspace(0, 1, 256))
reds_fixed = LinearSegmentedColormap.from_list("RedsFixed", ["white", *_reds_samples])

## Run from here to generate the CSV


In [None]:
# Collect the table extracted from each page of the report; pages for which
# extract_table() returns a falsy result are skipped.
with pdfplumber.open("UPPCB-report-brick-kilns-March-2023.pdf") as report:
    page_tables = (page.extract_table() for page in tqdm(report.pages))
    tables = [extracted for extracted in page_tables if extracted]
len(tables)

In [None]:
import pandas as pd

def process_table(table, start):
    """Turn one extracted page table into a DataFrame and check its serial numbers.

    The first two rows of ``table`` are ignored, row 2 supplies the column
    names and every row from index 3 onwards becomes a data row.

    Parameters
    ----------
    table : list of list
        Rows of one page table.
    start : int
        Serial number the first data row is required to carry.

    Returns
    -------
    tuple
        ``(frame, last_serial)`` where ``last_serial`` is the "Sl No" of the
        final data row converted to ``int``.

    Raises
    ------
    AssertionError
        If the first column is not "Sl No" or the first serial number is not
        ``start``.
    """
    header, body = table[2], table[3:]
    page_df = pd.DataFrame(body, columns=header)
    assert page_df.columns[0] == "Sl No"

    serials = page_df["Sl No"]
    first_serial = serials.iloc[0]
    assert int(first_serial) == start, f"Expected {start}, got {first_serial}"
    return page_df, int(serials.iloc[-1])

# Process the pages in order: each page must start one past the serial number
# on which the previous page ended.
end = 0
df_list = []
for page_table in tqdm(tables):
    page_df, end = process_table(page_table, end + 1)
    df_list.append(page_df)

In [None]:
# Stack the per-page frames into a single table. ignore_index=True gives every
# row its own label; without it each page restarts at 0 and the index repeats.
df = pd.concat(df_list, ignore_index=True)
df.head(2)

In [None]:
# Sum of the per-row daily production capacity, expressed in millions.
# Thousands separators are stripped so the strings can be cast to int.
daily_capacity = df['Production\nCapacity\n(per day)'].str.replace(",", "").astype(int)
daily_capacity.sum() / 1000000

In [None]:
# Strip commas, then keep only the text after the last "." and, within that,
# after the last "/". Show the 20 most frequent resulting values.
establishment_year = df['Year of\nEstablishment'].map(
    lambda raw: raw.replace(",", "").split(".")[-1].split("/")[-1]
)
establishment_year.value_counts().head(20)

In [None]:
# Row counts per district, sorted by district name; show the last 20.
# Uses the already-concatenated `df` instead of rebuilding it from df_list.
df.District.value_counts().sort_index().tail(20)

In [None]:
# Rows per district as a two-column frame. The columns are named explicitly
# ("District", "count") because a bare reset_index() labels them differently
# on pandas < 2.0, and later cells rely on exactly these names.
count_df = (
    df.District
    .value_counts()
    .rename_axis("District")
    .reset_index(name="count")
)

In [None]:
# Total number of rows that carry a District value (value_counts skips
# missing values by default). Reuses `df` rather than re-concatenating df_list.
df.District.value_counts().sum()

In [None]:
# Drop rows whose district name is missing or empty.
print(len(count_df))
has_name = count_df.District.notna() & (count_df.District != "")
# Reset the index after filtering so row labels are 0..n-1 again.
count_df = count_df[has_name].reset_index(drop=True)
print(len(count_df))

# Add missing districts with a zero count. The rows are appended with concat
# instead of `count_df.loc[len(count_df)] = ...`: after a filter, len(count_df)
# can equal an existing row label, which would overwrite a real district
# instead of adding a new row.
missing_districts = pd.DataFrame(
    [[name, 0] for name in ("Banda", "Lalitpur", "Jhansi")],
    columns=count_df.columns,
)
count_df = pd.concat([count_df, missing_districts], ignore_index=True)
print(len(count_df))

# Capitalize every word of the name (str.capitalize also lower-cases the rest
# of each word, so no separate whole-string pass is needed).
count_df["District"] = count_df.District.apply(
    lambda x: " ".join(word.capitalize() for word in x.split())
)

print(sorted(count_df.District.tolist(), key=lambda x: x.lower()))

District names are corrected to match the spellings listed on the Uttar Pradesh government website: https://up.gov.in/en/page/districts


In [None]:
# Spelling found in the extracted table -> spelling used from here on.
district_mapping = {
    "Ambedkarnagar": "Ambedkar Nagar",
    "Badaun": "Budaun",
    "Bijnore": "Bijnor",
    "Bulandshar": "Bulandshahar",
    "Forozabad": "Firozabad",
    "Kushinagar": "Kushi Nagar",
    "Lakhimpur": "Lakhimpur Kheri",
    "Muzaffarnagar": "Muzaffar Nagar",
    "Plibhit": "Pilibhit",
}
count_df["District"] = count_df["District"].replace(district_mapping)
print(sorted(count_df["District"].tolist()))

In [None]:
# Rows that share a district name are summed into one row; the result is
# ordered by district name and District becomes a regular column again.
count_df = (
    count_df
    .groupby("District")
    .sum()
    .sort_index()
    .reset_index()
)
count_df.head(2)

In [None]:
# Persist the per-district counts; the row index is not written to the file.
count_df.to_csv(path_or_buf="../data/brick_kilns_district_counts.csv", index=False)

## Run from here to align the shapefile district names with the CSV and compare counts


In [None]:
# Load the per-district counts from the CSV and show the number of rows.
count_df = pd.read_csv(filepath_or_buffer="../data/brick_kilns_district_counts.csv")
count_df.shape[0]

In [None]:
# Sum of the `count` column over all rows.
count_df["count"].sum()

In [None]:
# Read the district GeoJSON and show the number of rows it contains.
up_districts = gpd.read_file("../regions/shapes/uttar_pradesh_district.geojson")
up_districts.shape[0]

In [None]:
# Names present on one side only: first shapefile-only, then counts-only.
shape_names = set(up_districts.Name)
survey_names = set(count_df.District)
print(shape_names - survey_names)
print(survey_names - shape_names)

In [None]:
# Shapefile spelling -> spelling used in the counts table.
mapping = {
    "Allahabad": "Prayagraj",
    "Muzaffarnagar": "Muzaffar Nagar",
    "Shrawasti": "Shravasti",
    "Sant Ravi Das Nagar(bhadohi)": "Bhadohi",
    "Bulandshahr": "Bulandshahar",
    "Kushinagar": "Kushi Nagar",
    "Faizabad": "Ayodhya",
    "Mahamaya Nagar": "Hathras",
    "Rae Bareli": "Raebareli",
}
up_districts["Name"] = up_districts["Name"].replace(mapping)

# Re-run the mismatch check after renaming: shapefile-only, then counts-only.
shape_names = set(up_districts["Name"])
survey_names = set(count_df["District"])
print(shape_names - survey_names)
print(survey_names - shape_names)

In [None]:
# Attach the survey counts to the district rows. The join is inner, so only
# names present in both tables are kept. The duplicate key column is removed
# and `count` is renamed to `survey_count`.
joined_counts = pd.merge(up_districts, count_df, left_on="Name", right_on="District", how="inner")
merged_df = joined_counts.drop(columns=["District"]).rename(columns={"count": "survey_count"})

In [None]:
latexify(width=5.9/3, height=1.5, font_size=6)
fig, ax = plt.subplots()

# The colour scale is capped at the 95th percentile of the survey counts.
p95 = merged_df.survey_count.quantile(0.95)
print(f"{p95=}")

map_style = dict(cmap=reds_fixed, vmin=0, vmax=p95, edgecolor="black", linewidth=0.1)
plot = merged_df.plot(column="survey_count", ax=ax, **map_style)
plot.set_axis_off()

cbar = fig.colorbar(plot.collections[0], ax=plot)
cbar.set_label("Count")
plt.tight_layout()

# Save a PDF and a 300-dpi PNG of the same figure.
for target, save_kwargs in (
    ("../figures/brick_kilns_survey_counts.pdf", {}),
    ("../figures/brick_kilns_survey_counts.png", {"dpi": 300}),
):
    fig.savefig(target, **save_kwargs)

In [None]:
# Read our label file and reproject it to EPSG:4326.
labels_raw = gpd.read_file("../final_data/labels/uttar_pradesh.geojson")
our_labels = labels_raw.to_crs(4326)
print(len(our_labels))
our_labels.head(2)

In [None]:
# Count our labels per district via a spatial join.
district_label_pairs = gpd.sjoin(merged_df, our_labels, predicate="intersects", how="left")
our_counts_df = (
    district_label_pairs
    # The left join keeps districts without any intersecting label, with a
    # missing index_right. drop_duplicates treats missing values as equal and
    # would keep one of those rows, reporting that empty district with a
    # count of 1, so they are removed first.
    .dropna(subset=["index_right"])
    # A label that intersects several districts is counted once (first match kept).
    .drop_duplicates("index_right")
    .Name_left.value_counts()
    # Name the output columns explicitly so they do not depend on the pandas version.
    .rename_axis("Name_left")
    .reset_index(name="count")
)
print(len(our_counts_df), our_counts_df['count'].sum())
our_counts_df.head(2)

In [None]:
# Districts in merged_df with no row in our_counts_df get an explicit count of 0.
districts_without_labels = set(merged_df.Name) - set(our_counts_df.Name_left)
for district in districts_without_labels:
    next_label = len(our_counts_df)
    our_counts_df.loc[next_label] = [district, 0]
print(len(our_counts_df), our_counts_df['count'].sum())
our_counts_df.head(2)

In [None]:
latexify(width=5.9/3, height=1.5, font_size=6)
fig, ax = plt.subplots()

# Same colour-scale cap (p95, computed in the survey-count map cell) is used here.
print(f"{p95=}")

districts_with_our_counts = merged_df.merge(our_counts_df, left_on="Name", right_on="Name_left")
map_style = dict(cmap=reds_fixed, vmin=0, vmax=p95, edgecolor="black", linewidth=0.1)
plot = districts_with_our_counts.plot(column="count", ax=ax, **map_style)
plot.set_axis_off()

cbar = fig.colorbar(plot.collections[0], ax=plot)
cbar.set_label("Count")
plt.tight_layout()

# Save a PDF and a 300-dpi PNG of the same figure.
for target, save_kwargs in (
    ("../figures/brick_kilns_our_counts.pdf", {}),
    ("../figures/brick_kilns_our_counts.png", {"dpi": 300}),
):
    fig.savefig(target, **save_kwargs)

In [None]:
import seaborn as sns

latexify(width=5.9/3, height=1.5, font_size=6)
plt.figure()

# Order the districts by survey count, then look up our count for each
# district in that same order so the two lists line up element by element.
sorted_survey_counts = merged_df.sort_values(by="survey_count")
survey_counts = sorted_survey_counts["survey_count"].tolist()
our_count_by_name = our_counts_df.set_index("Name_left")["count"]
our_counts = our_count_by_name.loc[sorted_survey_counts["Name"]].tolist()

# Scatter of the paired counts plus a dashed y=x reference line that extends
# up to the largest survey count.
diagonal_end = max(survey_counts)
plt.plot(survey_counts, our_counts, 'o', markersize=1)
plt.plot([0, diagonal_end], [0, diagonal_end], 'k--', linewidth=0.5, label="y=x")
plt.xlabel("Survey count")
plt.ylabel("Our count")
sns.despine()
plt.legend()
plt.tight_layout()

for target, save_kwargs in (
    ("../figures/brick_kilns_comparison.pdf", {}),
    ("../figures/brick_kilns_comparison.png", {"dpi": 300}),
):
    plt.savefig(target, **save_kwargs)

In [None]:
# Absolute per-district difference between the two counts, with its mean,
# median and (population) standard deviation.
errors = np.abs(np.asarray(survey_counts) - np.asarray(our_counts))
mean_error, median_error, std_error = errors.mean(), np.median(errors), errors.std()
print(f"{mean_error=}, {std_error=}, {median_error=}")

In [None]:
# Correlation-coefficient matrix of the two count lists.
np.corrcoef(x=survey_counts, y=our_counts)

In [None]:
# Same comparison as a labelled pandas correlation table.
paired_counts = pd.DataFrame({"survey_count": survey_counts, "our_count": our_counts})
paired_counts.corr()