# Showcase 2 - Data Dictionaries

In this notebook, we will be working with data downloaded from the Saudi portal, specifically in CSV format. Our goal is to create a data dictionary using OpenAI's capabilities. The data dictionary will provide detailed descriptions of the data fields based on the content of the CSV files. By leveraging OpenAI, we can generate comprehensive and accurate descriptions, enhancing our understanding of the dataset and facilitating further analysis.


In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import concurrent.futures
import pandas as pd
import re
load_dotenv()

# Client used for the chat-completion requests below. The endpoint comes from
# BASE_URL; the key is OR_API_KEY when set, otherwise OPENAI_API_KEY.
openai = OpenAI(
    base_url=os.getenv("BASE_URL"),
    api_key=os.getenv("OR_API_KEY") or os.getenv("OPENAI_API_KEY"),
)


In [2]:
def list_files(base_dir, exts='.csv'):
    """Recursively collect files under ``base_dir`` with a given extension.

    Any directory whose path contains a component named ``data_dictionary``
    is skipped, so previously generated dictionaries are not picked up as
    input data.

    Args:
        base_dir: Root directory to walk.
        exts: Either a single suffix such as ``'.csv'`` or an iterable of
            suffixes such as ``['.csv', '.json']``. Matching is
            case-sensitive.

    Returns:
        A list of file paths (``root`` joined with the file name) in the
        order ``os.walk`` yields them.
    """
    # A bare string must count as ONE suffix. Iterating over it would compare
    # against each character ('.', 'c', 's', 'v') and match files such as
    # 'sheet.xls' or 'notes.docs'.
    suffixes = (exts,) if isinstance(exts, str) else tuple(exts)

    files_list = []
    for root, _dirs, files in os.walk(base_dir):
        # Skip directories that contain 'data_dictionary' in their path
        if 'data_dictionary' in root.split(os.sep):
            continue
        # str.endswith accepts a tuple and returns False for an empty one.
        files_list.extend(
            os.path.join(root, file) for file in files if file.endswith(suffixes)
        )
    return files_list

# Folder that holds the downloaded datasets; gather every CSV found below it.
base_directory = "opendata"
file_paths = list_files(base_directory, exts=".csv")
# Uncomment to inspect the discovered paths:
# print(file_paths)


### Loading the CSVs

We load each CSV and take its top rows as a preview, so that both a sample of the values and the corresponding data types are visible.

This will allow us to create data dictionaries for the corresponding datasets using OpenAI.


In [3]:
def _read_csv_with_fallback(file_path):
    """Read a CSV into a DataFrame, trying several encodings in turn.

    Returns the DataFrame produced by the first encoding that parses, or
    ``None`` when the file is empty or no attempt succeeds.
    """
    for encoding in ('utf-8', 'ISO-8859-1', 'cp1256'):
        try:
            return pd.read_csv(file_path, encoding=encoding, low_memory=False)
        except pd.errors.EmptyDataError:
            # Nothing to parse; another encoding cannot change that.
            return None
        except (UnicodeDecodeError, pd.errors.ParserError):
            continue  # Try the next encoding if there's an error
    return None


def get_data_previews(file_paths, verbose=None, max_rows=35):
    """Build a plain-text preview of the first rows of each data file.

    Args:
        file_paths: Iterable of paths to ``.csv`` or ``.json`` files. Files
            with any other extension are skipped.
        verbose: When truthy, print a message for every skipped file.
        max_rows: Number of leading rows included in each preview.

    Returns:
        A dict mapping each successfully read file path to the string
        rendering (column names plus up to ``max_rows`` rows, without the
        index) of its DataFrame. Files that cannot be read are omitted.
    """
    previews = {}
    for file_path in file_paths:
        # Determine the file extension
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            df = _read_csv_with_fallback(file_path)
            if df is None:
                # Skip the file rather than reuse the DataFrame left over
                # from the previous iteration.
                if verbose: print(f"Could not parse CSV file {file_path}")
                continue
            # Remove completely empty rows
            df = df.dropna(how='all')
        elif file_extension == '.json':
            # Load the JSON file into a DataFrame
            try:
                df = pd.read_json(file_path)
            except ValueError as e:
                if verbose: print(f"Error reading JSON file {file_path}: {e}")
                continue
        else:
            if verbose: print(f"Unsupported file extension: {file_extension}")
            continue

        # Render the column names and the first `max_rows` rows as text
        previews[file_path] = df.head(max_rows).to_string(index=False)

    return previews



# Build the text preview of every discovered file, keyed by its file path.
previews = get_data_previews(file_paths)



### Define Data Dictionary Creation

Creating the data dictionary is the step where we use `openai` to review a sample of a dataset (CSV) and determine, for each column, the data type and description needed to work with that dataset.

So we accomplish the creation of a data dictionary with the following approach:

- Set the system to be an expert database administrator whose sole job is to create a CSV of a data dictionary and the types so that we can understand a dataset effectively
- Set the assistant as a guide for the system to create a data dictionary effectively based on a single set of instructions with certain limitations on the topic at hand based on the file name
- Set the user message to handle the input of the CSV snippet taken from the original datasets taken from the open data platform.

Below is the definition of the `create_data_dictionary` function:


In [4]:

def create_data_dictionary(preview, csv_name):
    """Ask the chat model for a CSV-formatted data dictionary of a preview.

    Args:
        preview: Text rendering of the first rows of a dataset. Only the
            first 4020 characters are sent to the model.
        csv_name: File name of the dataset, passed to the model as context.

    Returns:
        The model's reply with Markdown code fences ("```csv" / "```")
        removed and surrounding whitespace stripped. An empty string is
        returned when the reply carries no text content.
    """
    # Forward the optional attribution headers only when they are configured.
    # An unset environment variable yields None, which is not a usable header
    # value.
    optional_headers = {
        "HTTP-Referer": os.getenv("REFERRER"),
        "X-Title": os.getenv("TITLE"),
    }
    extra_headers = {
        name: value for name, value in optional_headers.items() if value is not None
    }

    response = openai.chat.completions.create(
        model=os.getenv("OR_MODEL"),
        messages=[
            {
            "role": "system",
            "content": "You are an expert in database administration and are able to quickly create a data dictionary from a sample of the first 10 values from a dataset. You will only respond with a data dictionary where you will create the result into 3 columns: variable name (the name of the corresponding variable. If it's not given, provide it), type (describe the actual data type according to the sample provided), description (provide a description of the variable for better understanding of the dataset). You will only respond in valid CSV files that do not include | operators or anything else. Only ensure that you create a comma-separated table as a data dictionary"
            },
            {
            "role": "assistant",
            "content": f"You will help the system understand that it must provide a variable name, data type and description of the variable in a csv format to be able to create a data dictionary. Here is the name of the file for full context: {csv_name}"
            },
            {
            "role": "user",
            # Slicing already clamps to the string length.
            "content": "Here is the data sample:" + preview[:4020]
            }
        ],
        extra_headers=extra_headers,
        max_completion_tokens=400,
        temperature=0
    )
    # The message content may be None (no text in the reply); treat that as
    # an empty answer instead of letting re.sub raise TypeError.
    content = response.choices[0].message.content or ""
    return re.sub(r'```csv|```', '', content).strip()


#### Save Data Dictionary and Process CSVs

`save_data_dictionary` - Simple function to save OpenAI's hard work to a CSV file.

`process_files` - Because there are vast amounts of messy datasets, this function does the following:

- Get all of the CSVs
- Get a preview of each CSV
- Based on the preview, pass it to OpenAI to create the data dictionary
- Save each data dictionary as a CSV
- Return the data dictionaries as a dictionary keyed by file path


In [5]:
def save_data_dictionary(data_dict, output_path):
    """Write the data dictionary text to ``output_path`` as UTF-8.

    An explicit encoding is used so that non-ASCII text is written the same
    way on every platform instead of depending on the locale default.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(data_dict)


def process_files(base_dir, max_workers=None):
    """Create (or load previously saved) data dictionaries for all CSVs.

    For every CSV found under ``base_dir`` a dictionary file named
    ``<stem>_data_dictionary.csv`` is kept in a ``data_dictionary`` folder
    next to the source file. If that file already exists it is read back;
    otherwise a new dictionary is generated and saved.

    Args:
        base_dir: Root directory searched for CSV files.
        max_workers: Thread-pool size, passed to ``ThreadPoolExecutor``.

    Returns:
        A dict mapping each source file path to its data dictionary text.
        Files whose processing raised an exception are reported on stdout
        and left out of the result.
    """
    csv_file_paths = list_files(base_dir)
    csv_previews = get_data_previews(csv_file_paths)

    def process_single_file(file_path, preview):
        """Return the data dictionary text for one file, creating it if needed."""
        dir_name = os.path.dirname(file_path)
        file_name = os.path.basename(file_path)

        # Define the output directory and file path
        output_dir = os.path.join(dir_name, 'data_dictionary')
        output_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}_data_dictionary.csv")

        # Reuse an existing data dictionary instead of calling the model again
        if os.path.exists(output_path):
            with open(output_path, 'r', encoding='utf-8') as f:
                return f.read()

        data_dict = create_data_dictionary(preview, file_name)
        os.makedirs(output_dir, exist_ok=True)
        save_data_dictionary(data_dict, output_path)
        return data_dict

    data_dictionaries = {}
    # Use ThreadPoolExecutor to parallelize the processing of CSV files
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_path = {
            executor.submit(process_single_file, file_path, preview): file_path
            for file_path, preview in csv_previews.items()
        }
        for future in concurrent.futures.as_completed(future_to_path):
            file_path = future_to_path[future]
            try:
                data_dictionaries[file_path] = future.result()
            except Exception as exc:
                # Keep processing the remaining files, but make the failure
                # visible instead of dropping it silently.
                print(f"Failed to create data dictionary for {file_path}: {exc}")

    return data_dictionaries


### Runtime


In [6]:

# Run the whole pipeline on the data folder. `results` maps each source file
# path to the text of its data dictionary.
base_directory = 'opendata'
results = process_files(base_directory)
