Let's download the data from INSEE.

In [1]:
import urllib.request
import os

# Source archive on the INSEE website and the name it is saved under locally
url = "https://www.insee.fr/fr/statistiques/fichier/4648335/bdf2017fe_csv.zip"
filename = "bdf2017fe_csv.zip"

# Local directory that receives the downloaded archive
dest_dir = 'already_downloaded_data'

# Create the directory if needed; exist_ok avoids the check-then-create race
# and makes re-running this cell safe
os.makedirs(dest_dir, exist_ok=True)

# Full path of the archive on disk
dest_path = os.path.join(dest_dir, filename)

# Download the archive; urlretrieve returns (local path, response headers)
urllib.request.urlretrieve(url, dest_path)


('already_downloaded_data\\bdf2017fe_csv.zip',
 <http.client.HTTPMessage at 0x1cc699dcdc0>)

In [2]:
import zipfile

# Unpack every member of the downloaded archive into dest_dir
with zipfile.ZipFile(dest_path) as archive:
    archive.extractall(path=dest_dir)



In [3]:
# Name of the CSV table to load, looked up inside dest_dir
# (presumably one of the files extracted from the archive — verify)
csv_filename = "tf206.csv"
# Full path to that table
csv_filepath = os.path.join(dest_dir, csv_filename)

In [4]:
import pandas as pd

# Load the semicolon-separated table. NOMENCLATURE2 and DECUC are read as
# strings because later cells compare them against string codes such as '01'
# and 'TOT'; letting pandas infer the dtype could turn '01' into the integer 1
# and make those comparisons silently match nothing.
df = pd.read_csv(
    csv_filepath,
    sep=";",
    dtype={'NOMENCLATURE2': str, 'DECUC': str},
)

In [5]:
# The broad categories are the two-digit codes '01' through '12'
valid_nomenclature2_values = [f"{code:02d}" for code in range(1, 13)]

# Keep only the rows whose NOMENCLATURE2 code is one of those broad categories
is_broad_category = df['NOMENCLATURE2'].isin(valid_nomenclature2_values)
df_filtered = df[is_broad_category]

In [None]:
import matplotlib.pyplot as plt

# Bar color for each NOMENCLATURE2 code
colors = {
    '01': 'red',
    '02': 'blue',
    '03': 'green',
    '04': 'purple',
    '05': 'orange',
    '06': 'yellow',
    '07': 'gray',
    '08': 'cyan',
    '09': 'magenta',
    '10': 'lime',
    '11': 'pink',
    '12': 'teal',
    'nan': 'lightgray'  # Fallback color for any code without an explicit entry
}

# Display name for each NOMENCLATURE2 code
# NOTE(review): several of these labels do not look like household-consumption
# category titles — verify them against the INSEE nomenclature for this table.
names = {
    '01': 'PRODUITS ALIMENTAIRES ET BOISSONS NON-ALCOOLISEES',
    '02': 'BOISSONS ALCOOLISEES',
    '03': 'TABACS',
    '04': 'HABILLEMENT ET CHAUSSURES',
    '05': 'EQUIPEMENTS DU FOYER',
    '06': 'SANTÉ',
    '07': 'TRANSPORTS ET COMMUNICATIONS',
    '08': 'LOISIRS, CULTURE ET ENSEIGNEMENT',
    '09': 'SERVICES DIVERS',
    '10': 'ADMINISTRATION GENERALE ET TERRITORIALE',
    '11': 'PRODUCTION ET DISTRIBUTION D ELECTRICITE, DE GAZ ET DE CHALEUR',
    '12': 'INDUSTRIES AGRICOLES ET ALIMENTAIRES'
}

# Sum COEFF_BUDG for every (DECUC, NOMENCLATURE2) pair, then pivot the codes
# into columns. The column is selected before summing so that no other column
# of the table is aggregated.
grouped = (
    df_filtered
    .groupby(['DECUC', 'NOMENCLATURE2'])['COEFF_BUDG']
    .sum()
    .unstack()
)

# Define the order of DECUC values
decuc_order = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'TOT']

# Reorder the rows of the grouped DataFrame using decuc_order
grouped = grouped.reindex(decuc_order)

# Resolve one color per column while the columns are still codes: the colors
# dictionary is keyed by code, so this must happen before the renaming below
colors_mapped = [colors.get(code, colors['nan']) for code in grouped.columns]

# Replace the column codes with their display names; a code without a name
# keeps its code as label
grouped = grouped.rename(columns=names)

# Create a stacked bar chart using the grouped data and the per-code colors
ax = grouped.plot(kind='bar', stacked=True, color=colors_mapped)

# Set the x-axis label
ax.set_xlabel('DECUC')

# Set the y-axis label
ax.set_ylabel('COEFF_BUDG')

# Set the title of the chart
ax.set_title('Budget coefficients by DECUC and NOMENCLATURE2')

# The columns already carry their display names, so the legend labels can be
# used as they are (looking them up in `names` again would raise KeyError)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)


# Show the chart
plt.show()
