In [None]:
# import shutil

# # Directory to be zipped
# directory_path = '../data/anzu/anzu_LatestPublication'

# # Path of the ZIP file to be created
# zip_file_path = '../data/anzu/anzu_LatestPublication'

# # Create the ZIP file
# shutil.make_archive(zip_file_path, 'zip', directory_path)


In [None]:
import os
import zipfile

# Source archive and the folder its contents are unpacked into
zip_file_path = "../data/anzu/anzu_LatestPublication.zip"
extract_dir = "../data/anzu/anzu_LatestPublication"

# Make sure the destination folder exists, then unpack the whole archive into it
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extract_dir)

In [None]:
import glob

import pandas as pd

# Collect every .xls workbook found directly inside the extraction folder
excel_files = glob.glob(f"{extract_dir}/*.xls")

# Map each workbook path to a dict of {sheet name: DataFrame}; sheet_name=None
# loads all sheets and dtype=str keeps every cell as text
data = {
    workbook: pd.read_excel(workbook, sheet_name=None, dtype=str)
    for workbook in excel_files
}

In [None]:
import pandas as pd

# One [file path, sheet name, sheet DataFrame] entry per sheet of every workbook
flattened_data = [
    [workbook_path, sheet_name, sheet_df]
    for workbook_path, sheets in data.items()
    for sheet_name, sheet_df in sheets.items()
]

# Turn the entries into a table with one row per sheet
flat_df = pd.DataFrame(flattened_data, columns=["Filename", "TabName", "DataFrame"])

In [None]:
# Row count of each stored sheet, then display the table
flat_df["df_length"] = list(map(len, flat_df["DataFrame"]))
flat_df

In [None]:
import csv


def txt_to_csv(file, col_nr=5):
    """Rewrite a comma-separated text file so every row has the header's width.

    Rows with as many fields as the header are copied unchanged. Rows with more
    fields are assumed to contain unquoted commas inside the text column at
    index ``col_nr``: the surplus fields are joined back together with "," so
    the row ends up with exactly as many fields as the header. Rows with fewer
    fields than the header are reported on stdout and dropped.

    Args:
        file: Path of the input text file.
        col_nr: Zero-based index of the column that absorbs the surplus fields.
            It is clamped to the last header column when it is too large.

    Returns:
        Path of the cleaned file: the input path with a trailing ".txt"
        replaced by "_clean.csv" (or "_clean.csv" appended when the input does
        not end in ".txt").
    """
    # Only swap a trailing ".txt". str.replace would also rewrite ".txt" inside
    # directory names and, when the suffix is missing, return the input path
    # itself, which would then be truncated by the "w" open below.
    stem = file[: -len(".txt")] if file.endswith(".txt") else file
    new_filename = f"{stem}_clean.csv"

    # newline="" on both handles is what the csv module expects, so newlines
    # embedded in quoted fields pass through untouched.
    with open(file, newline="") as in_file, open(new_filename, "w", newline="") as out_file:
        reader = csv.reader(in_file, delimiter=",", quotechar='"')
        writer = csv.writer(out_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)

        header = next(reader, None)
        if header is None:
            # Empty input: leave an empty output instead of leaking StopIteration.
            return new_filename
        header_cols = len(header)
        writer.writerow(header)

        # Number of columns that must remain after the merged text column so
        # the repaired row is exactly header_cols wide. For the default
        # col_nr=5 and a 10-column header this is 4, i.e. col_nr - 1.
        merge_at = max(0, min(col_nr, header_cols - 1))
        trailing = max(0, header_cols - merge_at - 1)

        # start=1 keeps the reported index counting the header as line 0.
        for i, record in enumerate(reader, start=1):
            cols = len(record)
            if cols == header_cols:
                writer.writerow(record)
            elif cols > header_cols:
                # Assuming extra commas are in the middle text field only
                tail_start = cols - trailing
                new_line = record[:merge_at] + [",".join(record[merge_at:tail_start])] + record[tail_start:]
                writer.writerow(new_line)
            else:
                print(f"Error: Line {i} has fewer columns than expected. Ignoring lines.")
    return new_filename


# Read every .txt export as CSV, repairing files that pandas cannot parse
df_rows = []
txt_files = glob.glob(f"{extract_dir}/*.txt")
for file in txt_files:
    try:
        file_data = pd.read_csv(file, dtype=str)
    except pd.errors.ParserError:
        # Rewrite the file with txt_to_csv so every row matches the header
        # width, then read the cleaned copy instead
        print(f"Could not read {file}")
        clean_file = txt_to_csv(file)
        file_data = pd.read_csv(clean_file, dtype=str)
    # Derive the table name from the file name without its extension.
    # str.strip(".txt") would strip the characters '.', 't' and 'x' from both
    # ends (e.g. "text.txt" -> "e") rather than removing the suffix.
    file_name = os.path.splitext(os.path.basename(file))[0]
    df_rows.append([file, file_name, file_data])

# Same layout as the Excel table: one row per file with its DataFrame and row count
txt_df = pd.DataFrame(df_rows, columns=["Filename", "TabName", "DataFrame"])
txt_df["df_length"] = [len(df) for df in txt_df["DataFrame"]]

In [None]:
# Stack the Excel-sheet table and the text-file table into one frame;
# ignore_index gives every row a unique label instead of repeating 0..n twice
file_df = pd.concat([flat_df, txt_df], ignore_index=True)
# regex=False removes the literal text "dbo." on every pandas version; with the
# regex default of pandas < 2.0 the "." would match any character
file_df["TabName"] = file_df["TabName"].str.replace("dbo.", "", regex=False)
file_df

In [None]:
# For each TabName, check whether all of its tables share the same columns by comparing their sorted column lists
def sort_columns(df):
    """Return the column labels of ``df`` as a new list in ascending sorted order."""
    return sorted(df.columns)


# Store each table's sorted column names and a "<TabName>_<columns>" signature
file_df["columns"] = list(map(sort_columns, file_df["DataFrame"]))
file_df["columns_string"] = file_df["TabName"] + "_" + file_df["columns"].astype(str)

file_df["columns_string"].value_counts()

# Print the distinct signatures of every TabName that occurs with more than one column set
for tab in file_df["TabName"].unique():
    signatures = file_df.loc[file_df["TabName"] == tab, "columns_string"].unique()
    if len(signatures) > 1:
        print(signatures)
        print("\n")

In [None]:
# Per TabName, retain only the table with the largest row count:
# order by row count descending, then keep the first row seen for each name
file_df = file_df.sort_values(by="df_length", ascending=False).drop_duplicates(
    subset="TabName", keep="first"
)
file_df

In [None]:
# Write each remaining table to "<TabName>.csv" inside the cleaned-data folder
path = "../data/anzu/anzu_cleaned"
# exist_ok=True creates the folder only when needed, without a separate
# exists() check, matching how the extraction folder is created
os.makedirs(path, exist_ok=True)
for filename, df in zip(file_df["TabName"], file_df["DataFrame"]):
    df.to_csv(f"{path}/{filename}.csv", index=False)
    print(f"Written {filename}.csv")