In [2]:
import pandas as pd

# Load the original datasets
original_menu_item_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\MenuItem.csv')
original_dish_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\Dish.csv')
original_menu_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\Menu.csv')
original_menu_page_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\MenuPage.csv')

# Load the cleaned datasets
cleaned_menu_item_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-MenuItem.csv')
cleaned_dish_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-Dish.csv')
cleaned_menu_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-Menu.csv')
cleaned_menu_page_df = pd.read_csv(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-MenuPage.csv')



In [3]:
# Function to check primary key integrity
def check_primary_key_integrity(df, primary_key):
    return df[primary_key].is_unique

# Function to check foreign key integrity
def check_foreign_key_integrity(df, foreign_key, referenced_df, referenced_key):
    return df[foreign_key].isin(referenced_df[referenced_key]).all()

# Function to check for missing values in critical fields
def check_missing_values(df, critical_fields):
    return df[critical_fields].isnull().sum()

# Function to check data types
def check_data_types(df, expected_types):
    actual_types = df.dtypes
    return all(actual_types[col] == expected_types[col] for col in expected_types)


In [4]:
# Expected data types
expected_dish_types = {
    'id': 'int64',
    'name': 'object',
    'first_appeared': 'int64',
    'last_appeared': 'int64'
}
expected_menu_item_types = {
    'menu_item_id': 'int64',
    'menu_page_id': 'int64',
    'price': 'float64',
    'high_price': 'float64',
    'dish_id': 'int64',
    'date': 'float64'
}


In [11]:
# Integrity checks before cleaning
integrity_checks_before = {
    'MenuItem Primary Key': check_primary_key_integrity(original_menu_item_df, 'id'),
    'Dish Primary Key': check_primary_key_integrity(original_dish_df, 'id'),
    'MenuPage Primary Key': check_primary_key_integrity(original_menu_page_df, 'id'),
    'Menu Primary Key': check_primary_key_integrity(original_menu_df, 'id'),
    'MenuItem Foreign Key dish_id': check_foreign_key_integrity(original_menu_item_df, 'dish_id', original_dish_df, 'id'),
    'MenuItem Foreign Key menu_page_id': check_foreign_key_integrity(original_menu_item_df, 'menu_page_id', original_menu_page_df, 'id'),
    'MenuPage Foreign Key menu_id': check_foreign_key_integrity(original_menu_page_df, 'menu_id', original_menu_df, 'id'),
    'MenuItem Missing Values': check_missing_values(original_menu_item_df, ['price', 'high_price', 'dish_id', 'menu_page_id']),
    'Dish Missing Values': check_missing_values(original_dish_df, ['name', 'first_appeared', 'last_appeared']),
    'Data Types Dish': check_data_types(original_dish_df, expected_dish_types),
   # 'Data Types MenuItem': check_data_types(original_menu_item_df, expected_menu_item_types)
}


In [12]:
# Integrity checks after cleaning
integrity_checks_after = {
    'MenuItem Primary Key': check_primary_key_integrity(cleaned_menu_item_df, 'id'),
    'Dish Primary Key': check_primary_key_integrity(cleaned_dish_df, 'id'),
    'MenuPage Primary Key': check_primary_key_integrity(cleaned_menu_page_df, 'id'),
    'Menu Primary Key': check_primary_key_integrity(cleaned_menu_df, 'id'),
    'MenuItem Foreign Key dish_id': check_foreign_key_integrity(cleaned_menu_item_df, 'dish_id', cleaned_dish_df, 'id'),
    'MenuItem Foreign Key menu_page_id': check_foreign_key_integrity(cleaned_menu_item_df, 'menu_page_id', cleaned_menu_page_df, 'id'),
    'MenuPage Foreign Key menu_id': check_foreign_key_integrity(cleaned_menu_page_df, 'menu_id', cleaned_menu_df, 'id'),
    'MenuItem Missing Values': check_missing_values(cleaned_menu_item_df, ['price', 'high_price', 'dish_id', 'menu_page_id']),
    'Dish Missing Values': check_missing_values(cleaned_dish_df, ['name', 'first_appeared', 'last_appeared']),
    'Data Types Dish': check_data_types(cleaned_dish_df, expected_dish_types),
   # 'Data Types MenuItem': check_data_types(cleaned_menu_item_df, expected_menu_item_types)
}

In [13]:
# Prepare integrity check results for HTML table
integrity_results = {
    'Constraint': [],
    'Before Cleaning': [],
    'After Cleaning': []
}

for key in integrity_checks_before:
    integrity_results['Constraint'].append(key)
    integrity_results['Before Cleaning'].append(integrity_checks_before[key])
    integrity_results['After Cleaning'].append(integrity_checks_after[key])

# Convert results to DataFrame
integrity_results_df = pd.DataFrame(integrity_results)

# Generate HTML table
html_table = integrity_results_df.to_html(index=False)

# Save HTML table to a file
with open(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\integrity_violations_summary.html', 'w') as file:
    file.write(html_table)

# Display the HTML table
from IPython.core.display import display, HTML
display(HTML(html_table))

print("Integrity constraints violation summary saved as HTML table.")

  from IPython.core.display import display, HTML


Constraint,Before Cleaning,After Cleaning
MenuItem Primary Key,True,True
Dish Primary Key,True,True
MenuPage Primary Key,True,True
Menu Primary Key,True,True
MenuItem Foreign Key dish_id,False,True
MenuItem Foreign Key menu_page_id,True,True
MenuPage Foreign Key menu_id,False,True
MenuItem Missing Values,price 445916 high_price 1240821 dish_id 241 menu_page_id 0 dtype: int64,price 0 high_price 0 dish_id 0 menu_page_id 0 dtype: int64
Dish Missing Values,name 0 first_appeared 0 last_appeared 0 dtype: int64,name 0 first_appeared 0 last_appeared 0 dtype: int64
Data Types Dish,True,True


Integrity constraints violation summary saved as HTML table.


In [20]:
import pandas as pd

# Function to summarize datasets
def summarize_datasets(original_path, cleaned_path, dataset_name):
    # Load the original and cleaned datasets
    original_df = pd.read_csv(original_path)
    cleaned_df = pd.read_csv(cleaned_path)

    # Ensure both DataFrames have the same set of columns
    all_columns = sorted(set(original_df.columns).union(set(cleaned_df.columns)))

    # Reindex the DataFrames to have the same columns
    original_df = original_df.reindex(columns=all_columns)
    cleaned_df = cleaned_df.reindex(columns=all_columns)

    # Fill missing columns with NaN
    original_df.fillna(value=pd.NA, inplace=True)
    cleaned_df.fillna(value=pd.NA, inplace=True)

    # Summarize rows and columns in original vs cleaned version
    original_summary = {
        'Rows': original_df.shape[0],
        'Columns': original_df.shape[1]
    }

    cleaned_summary = {
        'Rows': cleaned_df.shape[0],
        'Columns': cleaned_df.shape[1]
    }

    # Summary of missing values for each field
    original_missing_values = original_df.isnull().sum()
    cleaned_missing_values = cleaned_df.isnull().sum()

    # Summary of data types for each field
    original_data_types = original_df.dtypes
    cleaned_data_types = cleaned_df.dtypes

    # Prepare data for HTML table
    summary_data = {
        'Metric': ['Rows', 'Columns'],
        'Original': [original_summary['Rows'], original_summary['Columns']],
        'Cleaned': [cleaned_summary['Rows'], cleaned_summary['Columns']]
    }

    missing_values_data = {
        'Field': all_columns,
        'Original Missing Values': original_missing_values.values.tolist(),
        'Cleaned Missing Values': cleaned_missing_values.values.tolist(),
        'Original Data Type': original_data_types.astype(str).values.tolist(),
        'Cleaned Data Type': cleaned_data_types.astype(str).values.tolist()
    }

    # Convert to DataFrame
    summary_df = pd.DataFrame(summary_data)
    missing_values_df = pd.DataFrame(missing_values_data)

    # Generate HTML tables
    summary_html = summary_df.to_html(index=False)
    missing_values_html = missing_values_df.to_html(index=False)
    original_head_html = original_df.head().to_html(index=False)
    cleaned_head_html = cleaned_df.head().to_html(index=False)

    # Combine HTML tables
    html_content = f"""
    <h2>Summary of {dataset_name} Dataset</h2>
    <h3>Rows and Columns</h3>
    {summary_html}
    <h3>Missing Values and Data Types</h3>
    {missing_values_html}
    <h3>First Few Rows - Original Dataset</h3>
    {original_head_html}
    <h3>First Few Rows - Cleaned Dataset</h3>
    {cleaned_head_html}
    """

    return html_content

# Define paths for original and cleaned datasets
paths = {
    'MenuItem': {
        'original': r'C:\Users\Soumya Nanda\NYPL-menus\MenuItem.csv',
        'cleaned': r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-MenuItem.csv'
    },
    'Dish': {
        'original': r'C:\Users\Soumya Nanda\NYPL-menus\Dish.csv',
        'cleaned': r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-Dish.csv'
    },
    'MenuPage': {
        'original': r'C:\Users\Soumya Nanda\NYPL-menus\MenuPage.csv',
        'cleaned': r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-MenuPage.csv'
    },
    'Menu': {
        'original': r'C:\Users\Soumya Nanda\NYPL-menus\Menu.csv',
        'cleaned': r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\NYPL-Menu.csv'
    }
}

# Generate summaries for all datasets
html_contents = []
for dataset_name, path in paths.items():
    html_content = summarize_datasets(path['original'], path['cleaned'], dataset_name)
    html_contents.append(html_content)

# Combine all HTML contents
full_html_content = "\n".join(html_contents)

# Save HTML content to a file
with open(r'C:\Users\Soumya Nanda\NYPL-menus\OpenRefine\dataset_summaries.html', 'w') as file:
    file.write(full_html_content)

# Display the HTML content
from IPython.core.display import display, HTML
display(HTML(full_html_content))

print("Dataset summaries saved as HTML table.")


  from IPython.core.display import display, HTML


Metric,Original,Cleaned
Rows,1332726,273237
Columns,16,16

Field,Original Missing Values,Cleaned Missing Values,Original Data Type,Cleaned Data Type
created_at,0,0,object,object
date,1332726,0,float64,int64
dish_id,241,0,float64,int64
dish_name,1332726,0,float64,object
high_price,1240821,0,float64,float64
id,0,0,int64,int64
id_dish,1332726,0,float64,int64
id_menu,1332726,0,float64,int64
id_page,1332726,0,float64,int64
menu_date,1332726,0,float64,object

created_at,date,dish_id,dish_name,high_price,id,id_dish,id_menu,id_page,menu_date,menu_id_from_page,menu_page_id,price,updated_at,xpos,ypos
2011-03-28 15:00:44 UTC,,1.0,,,1,,,,,,1389,0.4,2011-04-19 04:33:15 UTC,0.111429,0.254735
2011-03-28 15:01:13 UTC,,2.0,,,2,,,,,,1389,0.6,2011-04-19 15:00:54 UTC,0.438571,0.254735
2011-03-28 15:01:40 UTC,,3.0,,,3,,,,,,1389,0.4,2011-04-19 19:10:05 UTC,0.14,0.261922
2011-03-28 15:01:51 UTC,,4.0,,,4,,,,,,1389,0.5,2011-04-19 19:07:01 UTC,0.377143,0.26272
2011-03-28 15:21:26 UTC,,5.0,,1.0,5,,,,,,3079,0.5,2011-04-13 15:25:27 UTC,0.105714,0.313178

created_at,date,dish_id,dish_name,high_price,id,id_dish,id_menu,id_page,menu_date,menu_id_from_page,menu_page_id,price,updated_at,xpos,ypos
2011-03-28 15:01:13 UTC,1900,2,Chicken gumbo,0.6,2,2,12882,1389,1900-04-09T00:00:00Z,12882,1389,0.6,2011-04-19 15:00:54 UTC,0.438571,0.254735
2011-03-28 15:01:40 UTC,1900,3,Tomato aux croutons,0.4,3,3,12882,1389,1900-04-09T00:00:00Z,12882,1389,0.4,2011-04-19 19:10:05 UTC,0.14,0.261922
2011-03-28 15:01:51 UTC,1900,4,Onion au gratin,0.5,4,4,12882,1389,1900-04-09T00:00:00Z,12882,1389,0.5,2011-04-19 19:07:01 UTC,0.377143,0.26272
2011-03-28 15:21:26 UTC,1900,5,St. Emilion,1.0,5,5,13472,3079,1900-05-21T00:00:00Z,13472,3079,0.5,2011-04-13 15:25:27 UTC,0.105714,0.313178
2011-03-28 19:33:37 UTC,1900,9,Clam broth (cup),0.25,8,9,12882,1389,1900-04-09T00:00:00Z,12882,1389,0.25,2011-04-19 19:06:08 UTC,0.167143,0.273101

Metric,Original,Cleaned
Rows,423397,11033
Columns,9,9

Field,Original Missing Values,Cleaned Missing Values,Original Data Type,Cleaned Data Type
description,423397,11033,float64,float64
first_appeared,0,0,int64,int64
highest_price,29100,0,float64,float64
id,0,0,int64,int64
last_appeared,0,0,int64,int64
lowest_price,29100,0,float64,float64
menus_appeared,0,0,int64,int64
name,0,0,object,object
times_appeared,0,0,int64,int64

description,first_appeared,highest_price,id,last_appeared,lowest_price,menus_appeared,name,times_appeared
,1897,0.4,1,1927,0.2,8,Consomme printaniere royal,8
,1895,0.8,2,1960,0.1,111,Chicken gumbo,117
,1893,0.4,3,1917,0.25,13,Tomato aux croutons,13
,1900,1.0,4,1971,0.25,41,Onion au gratin,41
,1881,18.0,5,1981,0.0,66,St. Emilion,68

description,first_appeared,highest_price,id,last_appeared,lowest_price,menus_appeared,name,times_appeared
,1895,0.8,2,1960,0.1,111,Chicken gumbo,117
,1893,0.4,3,1917,0.25,13,Tomato aux croutons,13
,1900,1.0,4,1971,0.25,41,Onion au gratin,41
,1881,18.0,5,1981,0.0,66,St. Emilion,68
,1897,0.6,8,1961,0.1,48,Chicken soup with rice,49

Metric,Original,Cleaned
Rows,66937,61134
Columns,7,7

Field,Original Missing Values,Cleaned Missing Values,Original Data Type,Cleaned Data Type
full_height,329,0,float64,float64
full_width,329,0,float64,float64
id,0,0,int64,int64
image_id,0,0,object,object
menu_id,0,0,int64,int64
page_number,1202,945,float64,float64
uuid,0,0,object,object

full_height,full_width,id,image_id,menu_id,page_number,uuid
7230.0,5428.0,119,1603595,12460,1.0,510d47e4-2955-a3d9-e040-e00a18064a99
5428.0,7230.0,120,1603596,12460,2.0,510d47e4-2956-a3d9-e040-e00a18064a99
7230.0,5428.0,121,1603597,12460,3.0,510d47e4-2957-a3d9-e040-e00a18064a99
7230.0,5428.0,122,1603598,12460,4.0,510d47e4-2958-a3d9-e040-e00a18064a99
7230.0,5428.0,123,1603591,12461,1.0,510d47e4-2959-a3d9-e040-e00a18064a99

full_height,full_width,id,image_id,menu_id,page_number,uuid
3074.0,2046.0,129,4000009170,12463,2.0,510d47db-491e-a3d9-e040-e00a18064a99
3049.0,2004.0,130,466928,12463,1.0,510D47DB-491F-A3D9-E040-E00A18064A99
3690.0,2888.0,131,4000009171,12464,2.0,510d47db-4920-a3d9-e040-e00a18064a99
3679.0,2866.0,132,466930,12464,1.0,510d47db-4921-a3d9-e040-e00a18064a99
3413.0,2307.0,133,4000009172,12465,2.0,510d47db-4922-a3d9-e040-e00a18064a99

Metric,Original,Cleaned
Rows,17545,17545
Columns,20,20

Field,Original Missing Values,Cleaned Missing Values,Original Data Type,Cleaned Data Type
call_number,1562,1562,object,object
currency,11089,11089,object,object
currency_symbol,11089,11089,object,object
date,586,586,object,object
dish_count,0,0,int64,int64
event,9391,9391,object,object
id,0,0,int64,int64
keywords,17545,17545,float64,float64
language,17545,17545,float64,float64
location,0,0,object,object

call_number,currency,currency_symbol,date,dish_count,event,id,keywords,language,location,location_type,name,notes,occasion,page_count,physical_description,place,sponsor,status,venue
1900-2822,,,1900-04-15,67,BREAKFAST,12463,,,Hotel Eastman,,,,EASTER;,2,CARD; 4.75X7.5;,"HOT SPRINGS, AR",HOTEL EASTMAN,complete,COMMERCIAL
1900-2825,,,1900-04-15,34,[DINNER],12464,,,Republican House,,,"WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY BORDER; ""EASTER SUNDAY"" EMBOSSED IN WHITE; VIOLET COLORED SPRAY OF FLOWERS IN UPPER LEFT CORNER;",EASTER;,2,CARD; ILLUS; COL; 7.0X9.0;,"MILWAUKEE, [WI];",REPUBLICAN HOUSE,under review,COMMERCIAL
1900-2827,,,1900-04-16,84,FRUHSTUCK/BREAKFAST;,12465,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP AND SAILING VESSEL;",,2,CARD; ILLU; COL; 5.5X8.0;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL
1900-2828,,,1900-04-16,63,LUNCH;,12466,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCENE WITH SAILING VESSEL;",,2,CARD; ILLU; COL; 5.5X8.0;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL
1900-2829,,,1900-04-16,33,DINNER;,12467,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCENE WITH ROCKS AND LIGHTHOUSE; STEAMSHIP AND SAILING VESSELS; CONCERT PROGRAM; DATES: ON GERMAN SIDE OF MENU ""MONTAG, DEN 16 APRIL 1900""; ON ENGLISH SIDE OF MENU ""MONDAY, APRIL 15TH, 1900"";",,4,FOLDER; ILLU; COL; 5.5X7.5;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL

call_number,currency,currency_symbol,date,dish_count,event,id,keywords,language,location,location_type,name,notes,occasion,page_count,physical_description,place,sponsor,status,venue
1900-2822,,,1900-04-15T00:00:00Z,67,BREAKFAST,12463,,,Hotel Eastman,,,,EASTER;,2,CARD; 4.75X7.5;,"HOT SPRINGS, AR",HOTEL EASTMAN,complete,COMMERCIAL
1900-2825,,,1900-04-15T00:00:00Z,34,DINNER,12464,,,Republican House,,,"WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY BORDER; ""EASTER SUNDAY"" EMBOSSED IN WHITE; VIOLET COLORED SPRAY OF FLOWERS IN UPPER LEFT CORNER;",EASTER;,2,CARD; ILLUS; COL; 7.0X9.0;,"MILWAUKEE, [WI];",REPUBLICAN HOUSE,under review,COMMERCIAL
1900-2827,,,1900-04-16T00:00:00Z,84,FRUHSTUCK-BREAKFAST,12465,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP AND SAILING VESSEL;",,2,CARD; ILLU; COL; 5.5X8.0;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL
1900-2828,,,1900-04-16T00:00:00Z,63,LUNCH,12466,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCENE WITH SAILING VESSEL;",,2,CARD; ILLU; COL; 5.5X8.0;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL
1900-2829,,,1900-04-16T00:00:00Z,33,DINNER,12467,,,Norddeutscher Lloyd Bremen,,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCENE WITH ROCKS AND LIGHTHOUSE; STEAMSHIP AND SAILING VESSELS; CONCERT PROGRAM; DATES: ON GERMAN SIDE OF MENU ""MONTAG, DEN 16 APRIL 1900""; ON ENGLISH SIDE OF MENU ""MONDAY, APRIL 15TH, 1900"";",,4,FOLDER; ILLU; COL; 5.5X7.5;,DAMPFER KAISER WILHELM DER GROSSE;,NORDDEUTSCHER LLOYD BREMEN,complete,COMMERCIAL


Dataset summaries saved as HTML table.
