In [None]:
import warnings

# Ignore every warning category from here on. Note that this also hides
# diagnostics such as pandas' SettingWithCopyWarning in the cells below.
warnings.filterwarnings("ignore")

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd

In [None]:
# All input files live in one directory; change DATA_DIR to relocate them all.
DATA_DIR = Path("../data")

# Workbook read into `medical_data` (living standard, elderly housing, nurses).
medical_file = DATA_DIR / "medical_data.xlsx"
# Workbook read into `statistics` (population columns per COM code).
excel_file = DATA_DIR / "base-ic-evol-struct-pop-2019.xlsx"
# Workbook read into `surface` (`superf_choro` per `codgeo` and year `an`).
surface_file = DATA_DIR / "insee_rp_hist_1968.xlsx"
# GeoJSON read into `gdf` with geopandas.
geojson_file = DATA_DIR / "communes.geojson"

In [None]:
# Load the GeoJSON data for each commune with geopandas.
gdf = gpd.read_file(geojson_file)

# Preview the first rows (head() defaults to 5).
gdf.head()

In [None]:
# First sheet of the population workbook; the 5 leading rows are skipped.
statistics = pd.read_excel(excel_file, sheet_name=0, skiprows=5)

# Keep only the rows whose region code (REG) is 44, the region this notebook
# refers to as Grand Est.
in_region_44 = statistics["REG"] == 44
grand_est_stats = statistics[in_region_44]

grand_est_stats.head()

In [None]:
# Commune code, commune label and the population columns used downstream.
elder_population_columns = [
    "COM",
    "LIBCOM",
    "P19_POP",
    "P19_POP5564",
    "P19_POP6579",
    "P19_POP80P",
]

# One output row per COM code: keep the first label, sum every P19_* column.
aggregations = {"LIBCOM": "first"}
aggregations.update(
    {
        name: "sum"
        for name in elder_population_columns
        if name.startswith("P19_")
    }
)

grouped_by_com = grand_est_stats[elder_population_columns].groupby("COM")
grand_est_stats = grouped_by_com.agg(aggregations).reset_index()

grand_est_stats.head()

In [None]:
# First sheet of the medical workbook; the 3 leading rows are skipped.
medical_data = pd.read_excel(medical_file, sheet_name=0, skiprows=3)
numerical_columns = [
    "Médiane du niveau de vie 2021",
    "Hébergement des personnes âgées (en nombre) 2021",
    "Infirmier (en nombre) 2021",
]

# Column by column: coerce to numbers (unparseable values become NaN), then
# discard the rows where that column is NaN before moving to the next column.
for name in numerical_columns:
    as_number = pd.to_numeric(medical_data[name], errors="coerce")
    medical_data = medical_data.assign(**{name: as_number}).dropna(
        subset=[name]
    )

medical_data.head()

In [None]:
# First sheet of the surface workbook; the 4 leading rows are skipped.
surface = pd.read_excel(surface_file, sheet_name=0, skiprows=4)
numerical_columns = ["an", "superf_choro"]

# Column by column: coerce to numbers, then discard rows that failed to parse.
for name in numerical_columns:
    parsed = pd.to_numeric(surface[name], errors="coerce")
    surface = surface.assign(**{name: parsed}).dropna(subset=[name])

# For every `codgeo`, keep only the row with the largest `an` value.
latest_row_labels = surface.groupby("codgeo")["an"].idxmax()
surface = surface.loc[latest_row_labels]
surface.head()

In [None]:
# Inner joins: a COM code survives only if it is present in all three tables.
population_and_medical = grand_est_stats.merge(
    medical_data, how="inner", left_on="COM", right_on="Code"
)
df = population_and_medical.merge(
    surface, how="inner", left_on="COM", right_on="codgeo"
)
df.head()

### Question 1: Based on available public data, estimate the addressable market (the number of potential clients) in each city, and the density of potential clients

In [None]:
# Weight of the long-run target population: the 55-64 and 65-79 cohorts
# (P19_POP5564, P19_POP6579) are multiplied by this factor in the cell that
# builds `target_population`, whereas the 80+ cohort (P19_POP80P) is taken
# in full.

LR_POPULATION_COEF = 0.2

In [None]:
# Remove the duplicated join keys / labels and the year column left by the merges.
df = df.drop(columns=["Code", "Libellé", "codgeo", "libgeo", "an"])

# Create key features columns: target_population, density and solvency

# Target population = the 80+ cohort plus a discounted share of the 55-79
# cohorts. The expression is parenthesised so the weighted term belongs to the
# assignment; a line that merely starts with `+` is a separate statement whose
# result is discarded.
df["target_population"] = df["P19_POP80P"] + LR_POPULATION_COEF * (
    df["P19_POP6579"] + df["P19_POP5564"]
)

# Target population per unit of `superf_choro`.
df["density"] = df["target_population"] / df["superf_choro"]

# Median standard of living, used as the solvency proxy.
df["solvency"] = pd.to_numeric(
    df["Médiane du niveau de vie 2021"], errors="coerce"
)

df.head(5)

In [None]:
# Attach the feature table to the GeoJSON rows: `code` on the geo side is
# matched with `COM` on the feature side (default inner join).
df = gdf.merge(df, left_on="code", right_on="COM")

# Show the five rows with the largest target population.
largest_first = df.sort_values(by="target_population", ascending=False)
largest_first.head(5)

In [None]:
def plot_choropleth(frame, column, cmap, title):
    """Draw and show one choropleth map.

    Parameters
    ----------
    frame : object with a geopandas-style ``plot`` method
        Data to draw (here the merged geo frame).
    column : str
        Name of the column that drives the fill colour.
    cmap : str
        Matplotlib colormap name.
    title : str
        Title displayed above the map.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    frame.plot(
        column=column,
        cmap=cmap,
        linewidth=0.8,
        ax=ax,
        edgecolor="0.8",
        legend=True,
    )
    ax.set_title(title, fontsize=16)
    ax.set_axis_off()
    plt.show()


# log1p-transformed copy of the target population, mapped alongside the raw
# values; the column stays in `df` afterwards.
df["target_population_log"] = np.log1p(df["target_population"])

# (column, colormap, title) for each map, in display order.
feature_maps = [
    ("target_population", "Greens", "Target Population"),
    ("target_population_log", "Greens", "Log-transformed Target Population"),
    ("density", "Reds", "Density"),
    ("solvency", "Blues", "Solvency"),
]

for map_column, map_cmap, map_title in feature_maps:
    plot_choropleth(df, map_column, map_cmap, map_title)

Strasbourg, Reims, Metz, Mulhouse and Nancy, the largest cities in Grand Est, are clear outliers, and they make further analysis difficult. To keep such outliers from dominating the analysis, we set aside the 15 communes with the largest target population and return to them later.

In [None]:
# Number of communes with the largest target population that are set aside.
N_TOP_EXCLUDED = 15

df = df.sort_values(by="target_population", ascending=False)

# Explicit copy: later cells add columns to this frame, and writing into a
# bare `iloc` slice of `df` is chained assignment (SettingWithCopyWarning).
df_without_top_15 = df.iloc[N_TOP_EXCLUDED:].copy()

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
df_without_top_15.plot(
    column="target_population",
    cmap="Greens",
    linewidth=0.8,
    ax=ax,
    edgecolor="0.8",
    legend=True,
)

plt.title("Target Population in Cities", fontsize=16)
ax.set_axis_off()
plt.show()

In [None]:
import seaborn as sns

# Feature column -> label used in the subplot title.
BOXPLOT_FEATURES = {
    "target_population": "Target Population",
    "density": "Density",
    "solvency": "Solvency",
}


def plot_feature_boxplots(frame, suptitle):
    """Draw one row of boxplots, one per entry of ``BOXPLOT_FEATURES``.

    Parameters
    ----------
    frame : DataFrame-like
        Must contain every column named in ``BOXPLOT_FEATURES``.
    suptitle : str
        Figure-level title, so that figures drawn from different frames can
        be told apart.
    """
    fig, axes = plt.subplots(1, len(BOXPLOT_FEATURES), figsize=(18, 6))
    for ax, (column, label) in zip(axes, BOXPLOT_FEATURES.items()):
        sns.boxplot(data=frame[column], ax=ax)
        ax.set_title(f"Boxplot - {label}")
    fig.suptitle(suptitle, fontsize=16)
    plt.tight_layout()
    plt.show()


# Boxplots for target_population, density and solvency
plot_feature_boxplots(df, "All communes")

# Boxplots for target_population, density and solvency without top 15 cities
plot_feature_boxplots(df_without_top_15, "Without the top 15 communes")

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
columns_to_scale = ["target_population", "density", "solvency"]

# Each feature is min-max scaled on its own (the scaler is refitted per
# column) and stored next to the raw column as "<feature>_norm".
for feature in columns_to_scale:
    scaled_values = scaler.fit_transform(df_without_top_15[[feature]])
    df_without_top_15[f"{feature}_norm"] = scaled_values

df_without_top_15.head()

In [None]:
# Relative importance of each feature in the score; the weights sum to 1.
weights = {"target_population": 0.7, "density": 0.2, "solvency": 0.1}

# Weighted sum of the min-max normalised features ("<feature>_norm"). The raw
# columns are in different units and magnitudes, so weighting them directly
# would let the largest-valued column swamp the others whatever its weight.
df_without_top_15["addressable_market_score"] = sum(
    df_without_top_15[f"{feature}_norm"] * weight
    for feature, weight in weights.items()
)

# Rescale the score itself with a fresh min-max scaler.
scaler = MinMaxScaler()

df_without_top_15[["addressable_market_score"]] = scaler.fit_transform(
    df_without_top_15[["addressable_market_score"]]
)

# Built after the rescale so the ranking carries the final score values.
addressable_market_ranking = df_without_top_15.sort_values(
    by="addressable_market_score", ascending=False
)

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
df_without_top_15.plot(
    column="addressable_market_score",
    cmap="Greens",
    linewidth=0.8,
    ax=ax,
    edgecolor="0.8",
    legend=True,
)

ax.set_title("Addressable Market Score", fontsize=16)
ax.set_axis_off()
plt.show()

In [None]:
# Write both frames to CSV without the index column.
exports = (
    (df, "../data/df_question_1.csv"),
    (df_without_top_15, "../data/df_without_top_15_question_1.csv"),
)
for frame, destination in exports:
    frame.to_csv(destination, index=False)