In [45]:
import pandas as pd
import string

In [46]:
# Load the original menu table (df1) and its cleaned counterpart (df2)
# so the two versions can be compared in the cells below.

df1, df2 = (pd.read_csv(path) for path in ('Menu.csv', 'Menu_cleaned_format.csv'))

In [48]:
# Compare the 'sponsor' column of the original (df1) and cleaned (df2) menus,
# matching rows by their 'id'.

# Key both frames by 'id' exactly once. The guard keeps this cell re-runnable:
# an unconditional set_index('id') raises KeyError on a second run because the
# 'id' column has already been moved into the index.
if df1.index.name != 'id':
    df1 = df1.set_index('id')
if df2.index.name != 'id':
    df2 = df2.set_index('id')

# NOTE: df1i/df2i are aliases of df1/df2 (the same objects), not copies.
df1i = df1
df2i = df2

# Single-column DataFrames (double brackets) so the comparison stays 2-D.
sponsor_before = df1i[['sponsor']]
sponsor_after = df2i[['sponsor']]

# Align both sides on the union of ids. An id missing from one file and a NaN
# sponsor both become the 'MISSING' placeholder, so every cell compares as a
# plain string (NaN != NaN would otherwise always count as a change).
all_indices = sponsor_before.index.union(sponsor_after.index)
sponsor_before_aligned = sponsor_before.reindex(index=all_indices).fillna('MISSING')
sponsor_after_aligned = sponsor_after.reindex(index=all_indices).fillna('MISSING')

# True wherever the sponsor text differs. Because of the placeholder, an id that
# exists in only one file is also flagged here unless its sponsor on the other
# side was NaN, so this count overlaps with the "removed" count below.
comparison = sponsor_before_aligned != sponsor_after_aligned
num_changed_cells = comparison.sum().sum()

# Ids present in the original file but absent from the cleaned file.
removed_indices = sponsor_before.index.difference(sponsor_after.index)
num_removed_cells = len(removed_indices)

print(f"Number of changed cells in 'sponsor' column: {num_changed_cells}")
print(f"Number of removed cells in 'sponsor' column: {num_removed_cells}")

# Show a sample of the first 20 differing ids. Values are printed without
# quotes, so differences that are only whitespace are not visible in the output.
changed_indices = comparison.index[comparison['sponsor']]
for idx in changed_indices[:20]:
    before_value = sponsor_before_aligned.at[idx, 'sponsor']
    after_value = sponsor_after_aligned.at[idx, 'sponsor']
    print(f"ID: {idx}, Before: {before_value}, After: {after_value}")

Number of changed cells in 'sponsor' column: 9415
Number of removed cells in 'sponsor' column: 1618
ID: 12469, Before: HOTEL NETHERLAND, After: NETHERLAND HOTEL 
ID: 12475, Before: MANHATTAN HOTEL, After: MANHATTAN HOTEL 
ID: 12483, Before: MANHATTAN HOTEL, After: MANHATTAN HOTEL 
ID: 12486, Before: HOTEL SAVOY, After: SAVOY HOTEL 
ID: 12490, Before: HOTEL IMPERIAL, After: IMPERIAL HOTEL
ID: 12508, Before: MARIE ANTOINETTE HOTEL, After: MARIE ANTOINETTE HOTEL 
ID: 12510, Before: MARIE ANTOINETTE HOTEL, After: MARIE ANTOINETTE HOTEL 
ID: 12551, Before: NORDDEUTSCHER LLOYD  BREMEN, After: NORDDEUTSCHER LLOYD BREMEN
ID: 12554, Before: HOTEL DEL CORONADO, After: DEL CORONADO HOTEL 
ID: 12557, Before: HOTEL VICTORY, After: VICTORY HOTEL 
ID: 12576, Before: U.S. ARMY - SUBSISTENCE DEPT., After: U.S. ARMY - SUBSISTENCE DEPT
ID: 12703, Before: VENDOME HOTEL, After: VENDOME HOTEL 
ID: 12704, Before: NETHERLAND HOTEL, After: NETHERLAND HOTEL 
ID: 12714, Before: HOTEL VICTORY, After: VICTORY HOTE

In [50]:
# Count how many sponsor cells are NaN and how many are empty strings,
# in the original (df1) and in the modified (df2) data.

num_nan1, num_nan2 = (frame['sponsor'].isna().sum() for frame in (df1, df2))
num_empty1, num_empty2 = (frame['sponsor'].eq('').sum() for frame in (df1, df2))

print(
    f'Original: Number of NaN cells: {num_nan1}',
    f'Original: Number of empty cells: {num_empty1}',
    f'Modified: Number of NaN cells: {num_nan2}',
    f'Modified: Number of empty cells: {num_empty2}',
    sep='\n',
)

Original: Number of NaN cells: 1561
Original: Number of empty cells: 0
Modified: Number of NaN cells: 0
Modified: Number of empty cells: 0


In [51]:
# check if any cells are not either lower case, upper case or title case

def is_not_case_compliant(text):
    """Return True when ``text`` is not entirely upper, lower, or title case.

    Missing values (anything ``pd.isna`` reports as NA) are treated as
    compliant and yield False. Every other value is converted with ``str()``
    before checking.

    Note that ``str.isupper``/``islower``/``istitle`` all return False for a
    string with no cased characters (e.g. ``""``, ``"?"``, ``"123"``), so such
    values are reported as non-compliant.
    """
    if pd.isna(text):
        return False
    value = str(text)
    case_checks = (str.isupper, str.islower, str.istitle)
    return not any(check(value) for check in case_checks)

# Apply the case check to every sponsor cell and count the offenders
# in the original (df1) and modified (df2) data.
num_non_compliant1, num_non_compliant2 = (
    frame['sponsor'].apply(is_not_case_compliant).sum() for frame in (df1, df2)
)

print(
    f'Original: Number of cells not in upper case, lower case, or title case: {num_non_compliant1}',
    f'Modified: Number of cells not in upper case, lower case, or title case: {num_non_compliant2}',
    sep='\n',
)

Original: Number of cells not in upper case, lower case, or title case: 1404
Modified: Number of cells not in upper case, lower case, or title case: 0


In [52]:
# Count the distinct sponsor values (NaN is excluded by nunique's default)
# in the original (df1) and modified (df2) data.

num_unique_values1, num_unique_values2 = (
    frame['sponsor'].nunique() for frame in (df1, df2)
)

print(
    f'Original: Number of unique values in the sponsor column: {num_unique_values1}',
    f'Modified: Number of unique values in the sponsor column: {num_unique_values2}',
    sep='\n',
)

Original: Number of unique values in the sponsor column: 6370
Modified: Number of unique values in the sponsor column: 5928


In [56]:
# Find the sponsors that appear with more than one distinct location,
# in the original (df1) and modified (df2) data.

# Number of distinct locations recorded for each sponsor.
location_counts1, location_counts2 = (
    frame.groupby('sponsor')['location'].nunique() for frame in (df1, df2)
)

# Keep only the sponsors linked to two or more locations.
multiple_locations1 = location_counts1.loc[location_counts1.gt(1)]
multiple_locations2 = location_counts2.loc[location_counts2.gt(1)]

num_sponsors_multiple_locations1 = multiple_locations1.count()
num_sponsors_multiple_locations2 = multiple_locations2.count()

print(f'Original: Number of sponsors with multiple locations: {num_sponsors_multiple_locations1}')
print(f'Modified: Number of sponsors with multiple locations: {num_sponsors_multiple_locations2}')

print("\nOriginal: Sponsors with multiple locations and their count of unique locations:")
print(multiple_locations1)

print("\nModified: Sponsors with multiple locations and their count of unique locations:")
print(multiple_locations2)

Original: Number of sponsors with multiple locations: 93
Modified: Number of sponsors with multiple locations: 275

Original: Sponsors with multiple locations and their count of unique locations:
sponsor
(AMERICAN LINE)                                 2
?                                              15
ALCAZAR HOTEL                                   2
BALTIMORE AND OHIO RR ROYAL BLUE LINE           2
BELVEDERE HOUSE;                                2
                                               ..
Unknown                                         3
VENDOME HOTEL                                   2
Waldorf-Astoria                                 2
[Restaurant And/Or Location Unknown]            2
[Restaurant Name And/Or Location Not Given]     9
Name: location, Length: 93, dtype: int64

Modified: Sponsors with multiple locations and their count of unique locations:
sponsor
ADAMS' RESTAURANT                                    2
ALCAZAR HOTEL                                        2
ALGONQ