In [6]:
import os
import json5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# data_dir = './data/reduced_mixed_3/reduced_mixed_3'

data_dir = './dump_data'
all_files = os.listdir(data_dir)
print(f"All files: {all_files}")


# Verify the content and structure of the files
data_entries = []
file_count = 0

for file_name in all_files:
    if file_name.startswith('sdgIndicatorData__') and file_name.endswith('.json5'):
        file_path = os.path.join(data_dir, file_name)
        file_count += 1
        with open(file_path, 'r') as f:
            try:
                data = json5.load(f)
                print(f"Loaded data from {file_name}: {data.keys()}")  # Print the keys to verify structure
                if 'data' in data:
                    data_entries.extend(data['data'])
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

print(f"Total files processed: {file_count}")
print(f"Total data entries loaded: {len(data_entries)}")
if len(data_entries) > 0:
    print(json5.dumps(data_entries[0], indent=2))
else:
    print("No data entries loaded.")



All files: ['indicatorDataMeta.json5', 'sdgIndicatorData__1.1.1.json5', 'sdgIndicatorData__1.2.1.json5', 'sdgIndicatorData__1.2.2.json5']
Loaded data from sdgIndicatorData__1.1.1.json5: dict_keys(['size', 'totalElements', 'totalPages', 'pageNumber', 'data'])
Loaded data from sdgIndicatorData__1.2.1.json5: dict_keys(['size', 'totalElements', 'totalPages', 'pageNumber', 'data'])
Loaded data from sdgIndicatorData__1.2.2.json5: dict_keys(['size', 'totalElements', 'totalPages', 'pageNumber', 'data'])
Total files processed: 3
Total data entries loaded: 9
{
  goal: [
    "1",
  ],
  target: [
    "1.1",
  ],
  indicator: [
    "1.1.1",
  ],
  series: "SI_POV_DAY1",
  seriesDescription: "Proportion of population below international poverty line (%)",
  seriesCount: "8748",
  geoAreaCode: "32",
  geoAreaName: "Argentina",
  timePeriodStart: 2020,
  value: "1.6",
  valueType: "Float",
  time_detail: null,
  timeCoverage: null,
  upperBound: null,
  lowerBound: null,
  basePeriod: null,
  source:

In [7]:
# Convert to DataFrame
df = pd.DataFrame(data_entries)

print(df)


# Convert 'value' column to numeric
df['value'] = pd.to_numeric(df['value'], errors='coerce')

# Display the DataFrame
print(df.head())


  goal target indicator         series  \
0  [1]  [1.1]   [1.1.1]    SI_POV_DAY1   
1  [1]  [1.1]   [1.1.1]    SI_POV_EMP1   
2  [1]  [1.1]   [1.1.1]    SI_POV_EMP1   
3  [1]  [1.2]   [1.2.1]    SI_POV_NAHC   
4  [1]  [1.2]   [1.2.1]    SI_POV_NAHC   
5  [1]  [1.2]   [1.2.1]    SI_POV_NAHC   
6  [1]  [1.2]   [1.2.2]    SD_MDP_MUHC   
7  [1]  [1.2]   [1.2.2]    SD_MDP_MUHC   
8  [1]  [1.2]   [1.2.2]  SD_MDP_ANDIHH   

                                   seriesDescription seriesCount geoAreaCode  \
0  Proportion of population below international p...        8748          32   
1  Employed population below international povert...       32396          50   
2  Employed population below international povert...       32396         716   
3  Proportion of population living below the nati...         912           4   
4  Proportion of population living below the nati...         912         450   
5  Proportion of population living below the nati...         912         716   
6  Proportion of po

In [8]:

# Extract the 'Sex' value from 'dimensions'
df['dim_sex'] = df['dimensions'].apply(lambda x: x.get('Sex') if isinstance(x, dict) else None)

# Extract the 'Age' value from 'dimensions'
df['dim_age'] = df['dimensions'].apply(lambda x: x.get('Age') if isinstance(x, dict) else None)

# Extract the 'Reporting Type' value from 'dimensions'
df['dim_Reporting Type']=  df['dimensions'].apply(lambda x: x.get('Reporting Type') if isinstance(x, dict) else None)

# Extract the 'Location' value from 'dimensions'
df['dim_location']=  df['dimensions'].apply(lambda x: x.get('Location') if isinstance(x, dict) else None)

# Extract the 'Nature' value from 'attributes'
df['att_nature']=  df['attributes'].apply(lambda x: x.get('nature') if isinstance(x, dict) else None)

# Extract the 'Units' value from 'attributes'
df['att_nature']=  df['attributes'].apply(lambda x: x.get('Units') if isinstance(x, dict) else None)

# Extract the 'Observations' value from 'attributes'
df['att_nature']=  df['attributes'].apply(lambda x: x.get('Observations') if isinstance(x, dict) else None)


In [9]:


df = df.drop(columns=['dimensions','attributes','dim_Reporting Type'])

# Flatten nested columns
df['goal'] = df['goal'].apply(lambda x: x[0] if isinstance(x, list) else x)
df['target'] = df['target'].apply(lambda x: x[0] if isinstance(x, list) else x)
df['indicator'] = df['indicator'].apply(lambda x: x[0] if isinstance(x, list) else x)
df['key'] = df['key'].apply(lambda x: x[0] if isinstance(x, list) else x)

# Encode categorical variables
categorical_columns = ['goal', 'target', 'indicator', 'series', 'geoAreaName', 'valueType']
df[categorical_columns] = df[categorical_columns].astype('category')


print(df)
df = pd.get_dummies(df, columns=categorical_columns)

# Handle missing values
df = df.fillna(0)


  goal target indicator         series  \
0    1    1.1     1.1.1    SI_POV_DAY1   
1    1    1.1     1.1.1    SI_POV_EMP1   
2    1    1.1     1.1.1    SI_POV_EMP1   
3    1    1.2     1.2.1    SI_POV_NAHC   
4    1    1.2     1.2.1    SI_POV_NAHC   
5    1    1.2     1.2.1    SI_POV_NAHC   
6    1    1.2     1.2.2    SD_MDP_MUHC   
7    1    1.2     1.2.2    SD_MDP_MUHC   
8    1    1.2     1.2.2  SD_MDP_ANDIHH   

                                   seriesDescription seriesCount geoAreaCode  \
0  Proportion of population below international p...        8748          32   
1  Employed population below international povert...       32396          50   
2  Employed population below international povert...       32396         716   
3  Proportion of population living below the nati...         912           4   
4  Proportion of population living below the nati...         912         450   
5  Proportion of population living below the nati...         912         716   
6  Proportion of po

In [10]:

# # Define the target and features
target = 'value'
features = df.columns.difference([target])

# # Convert target to numeric
df[target] = pd.to_numeric(df[target], errors='coerce')

print(df)

# # Split the data
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)


print(X_train)
# # # Train the model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)


# print(X_test)
# Predict and evaluate
y_pred = model.predict(X_test)

print(y_pred)
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)

                                   seriesDescription seriesCount geoAreaCode  \
0  Proportion of population below international p...        8748          32   
1  Employed population below international povert...       32396          50   
2  Employed population below international povert...       32396         716   
3  Proportion of population living below the nati...         912           4   
4  Proportion of population living below the nati...         912         450   
5  Proportion of population living below the nati...         912         716   
6  Proportion of population living in multidimens...        2380           4   
7  Proportion of population living in multidimens...        2380         428   
8  Average share of weighted deprivations of tota...          48         275   

   timePeriodStart  value  time_detail  timeCoverage  upperBound  lowerBound  \
0             2020    1.6            0             0           0           0   
1             2020    5.1            0 

ValueError: could not convert string to float: 'ALLAGE'

In [5]:

# # Assuming your data is in a pandas DataFrame called df
# df = pd.DataFrame(data_entries)

print(df)

# Preprocess data: handle missing values, encode categorical variables, etc.
# For simplicity, let's assume all preprocessing is done, and df is ready.

# df = df.drop(columns=['seriesDescription', 'source', 'geoInfoUrl', 'footnotes'])


# # Define features and label
# X = df.drop(columns=['value'])  # Features
# y = df['value'].astype(float)   # Label

# # # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # print(X_train)
# # print("\n\n\n",X_test)
# # # Train a Decision Tree Regressor
# model = DecisionTreeRegressor(random_state=42)
# model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')

# # Analyze feature importance
# feature_importances = model.feature_importances_
# feature_names = X.columns
# feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
# print(feature_importance_df.sort_values(by='importance', ascending=False))


    goal target indicator         series  \
0    [1]  [1.1]   [1.1.1]    SI_POV_DAY1   
1    [1]  [1.1]   [1.1.1]    SI_POV_EMP1   
2    [1]  [1.1]   [1.1.1]    SI_POV_EMP1   
3    [1]  [1.2]   [1.2.1]    SI_POV_NAHC   
4    [1]  [1.2]   [1.2.1]    SI_POV_NAHC   
..   ...    ...       ...            ...   
691  [9]  [9.b]   [9.b.1]    NV_IND_TECH   
692  [9]  [9.b]   [9.b.1]    NV_IND_TECH   
693  [9]  [9.c]   [9.c.1]  IT_MOB_2GNTWK   
694  [9]  [9.c]   [9.c.1]  IT_MOB_3GNTWK   
695  [9]  [9.c]   [9.c.1]  IT_MOB_4GNTWK   

                                     seriesDescription seriesCount  \
0    Proportion of population below international p...        8748   
1    Employed population below international povert...       32396   
2    Employed population below international povert...       32396   
3    Proportion of population living below the nati...         912   
4    Proportion of population living below the nati...         912   
..                                                 