In [112]:
import dask.dataframe as dd
import pandas as pd
import time
import yaml
import os

In [103]:
# Location of the raw CSV, relative to the notebook's working directory.
file_path = 'us_chronic_disease.csv'

# Initial, untimed read of the whole file into memory.
# NOTE(review): the same read is repeated (and timed) in the next cell;
# presumably this first pass also warms the OS file cache - confirm before removing.
df_pandas = pd.read_csv(
    filepath_or_buffer=file_path,
    low_memory=False,
)

In [104]:
# Time a full pandas read of the CSV.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() follows the system wall clock and can jump if that clock is
# adjusted while the cell is running.
start_time = time.perf_counter()
df_pandas = pd.read_csv(file_path, low_memory=False)
end_time = time.perf_counter()

# Elapsed seconds for the pandas load; kept in a variable so it can be
# compared with the Dask timing.
time_pandas = end_time - start_time
print("Execution time for loading file with Pandas:", time_pandas, "seconds")

Execution time for loading file with Pandas: 3.1925880908966064 seconds


In [105]:
# Time a full Dask read of the same CSV.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() follows the system wall clock and can jump if that clock is
# adjusted while the cell is running.
start_time = time.perf_counter()

# dd.read_csv is lazy; the file is only actually parsed when .compute() runs,
# so both calls must sit inside the timed region.
# NOTE(review): dtype='object' reads every column without type inference,
# while the pandas cell lets pandas infer dtypes - the two timings are
# therefore not measuring exactly the same work.
df_dask = dd.read_csv(file_path, dtype='object')
df_dask.compute()  # result is intentionally discarded; only the duration matters
end_time = time.perf_counter()

# Elapsed seconds for the Dask load.
time_dask = end_time - start_time

print("Execution time for loading file with Dask:", time_dask, "seconds")

Execution time for loading file with Dask: 2.5730199813842773 seconds


In [106]:
# Normalise the header: trim surrounding whitespace, then turn every character
# that is not an ASCII letter, digit or underscore into an underscore.
# Both steps are idempotent, so re-running this cell is harmless.
df_pandas.columns = (
    df_pandas.columns
    .str.strip()
    .str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)
)

# Show the resulting header on its own line under a heading.
print("Cleaned Column Names:", df_pandas.columns, sep="\n")

Cleaned Column Names:
Index(['YearStart', 'YearEnd', 'LocationAbbr', 'LocationDesc', 'DataSource',
       'Topic', 'Question', 'Response', 'DataValueUnit', 'DataValueType',
       'DataValue', 'DataValueAlt', 'DataValueFootnoteSymbol',
       'DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit',
       'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
       'Stratification2', 'StratificationCategory3', 'Stratification3',
       'GeoLocation', 'ResponseID', 'LocationID', 'TopicID', 'QuestionID',
       'DataValueTypeID', 'StratificationCategoryID1', 'StratificationID1',
       'StratificationCategoryID2', 'StratificationID2',
       'StratificationCategoryID3', 'StratificationID3'],
      dtype='object')


In [107]:
# Persist the current (cleaned) column names as a minimal YAML schema with a
# single top-level key, 'columns', holding the names in DataFrame order.
yaml_file_path = 'schema.yaml'

columns = list(df_pandas.columns)
schema = dict(columns=columns)

# default_flow_style=False writes the list in block style, one name per line.
with open(yaml_file_path, mode='w') as yaml_file:
    yaml.dump(schema, stream=yaml_file, default_flow_style=False)

print(f"Schema written to {yaml_file_path}")

Schema written to schema.yaml


In [108]:
# Validate the DataFrame header against the schema stored on disk.
# NOTE(review): when the schema file was just generated from this same
# DataFrame (as in the cell above), this check cannot fail; it becomes
# meaningful only when schema.yaml is maintained independently.
with open(yaml_file_path, 'r') as yaml_file:
    schema_loaded = yaml.safe_load(yaml_file)

columns_in_file = list(df_pandas.columns)

# yaml.safe_load returns None for an empty document, and a hand-edited schema
# may lack the 'columns' key or leave it null. Treat all of those as "no
# columns declared" so the checks below report a mismatch instead of raising.
if isinstance(schema_loaded, dict):
    columns_in_schema = list(schema_loaded.get('columns') or [])
else:
    columns_in_schema = []

# Check 1: same number of columns.
if len(columns_in_file) == len(columns_in_schema):
    print("Number of columns match!")
else:
    print(f"Mismatch in number of columns: {len(columns_in_file)} in file vs {len(columns_in_schema)} in schema")

# Check 2: same names in the same order (list equality is order-sensitive).
if columns_in_file == columns_in_schema:
    print("Column names match!")
else:
    print("Mismatch in column names:")
    print("In file:", columns_in_file)
    print("In schema:", columns_in_schema)

    # Pinpoint the differing names so the reader does not have to diff the
    # two full lists by eye.
    only_in_file = [name for name in columns_in_file if name not in columns_in_schema]
    only_in_schema = [name for name in columns_in_schema if name not in columns_in_file]
    print("Only in file:", only_in_file)
    print("Only in schema:", only_in_schema)
    if not only_in_file and not only_in_schema:
        # Same set of names, so the lists differ by order or by duplicates.
        print("Same names on both sides; the difference is in ordering or duplicated columns.")

Number of columns match!
Column names match!


In [109]:
# Export the cleaned frame as a gzip-compressed, pipe-delimited text file.
output_file_path = 'us_chronic_disease_pipe_separated.csv.gz'

df_pandas.to_csv(
    output_file_path,
    index=False,         # do not write the row index as an extra column
    sep='|',
    compression='gzip',
)

print(f"File written to {output_file_path}")

File written to us_chronic_disease_pipe_separated.csv.gz


In [110]:
# Dimensions of the loaded DataFrame: (rows, columns).
num_rows, num_columns = df_pandas.shape

# Size in bytes of the original input CSV on disk (not the gzip export).
file_size = os.path.getsize(file_path)

# Emit the summary as one print call, one item per line.
print(
    "Summary of the file:",
    f"Total number of rows: {num_rows}",
    f"Total number of columns: {num_columns}",
    f"File size: {file_size}",
    sep="\n",
)

Summary of the file:
Total number of rows: 1185676
Total number of columns: 34
File size: 359317765
