# Data Cleaning Notebook for Ed Team

## Import Dependencies

In [12]:
# Import the libraries used throughout this notebook to load and clean the data
import pandas as pd
import numpy as np

### Read in Data


In [13]:
# Load the cafe sales CSV into `data`; the relative path is resolved against the
# kernel's current working directory.
# NOTE(review): the output saved with this cell is a FileNotFoundError - confirm
# where 'Cafe_sales.csv' lives before re-running the notebook.
data = pd.read_csv(filepath_or_buffer='Cafe_sales.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Cafe_sales.csv'

## Cleaning Methods

### Remove Duplicates

In [None]:
# Work on an independent copy: the cleaning steps further down write into
# `df_fixed` in place, and a plain `df_fixed = data` would make them overwrite
# the raw `data` frame as well. Re-running this cell now resets `df_fixed` to
# the untouched raw data.
df_fixed = data.copy()

def no_duplicates(dataframe=None, column="Transaction ID"):
    """Return a new frame without rows that repeat a value in ``column``.

    The first row seen for each value is kept (the ``drop_duplicates``
    default) and the input frame itself is left unmodified.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame to de-duplicate. When omitted, the notebook-level ``df_fixed``
        is looked up at call time, so the function always works on the frame
        that name currently refers to instead of the object it pointed to
        when this cell was executed.
    column : label or list of labels, default "Transaction ID"
        Column(s) whose values decide whether two rows are duplicates.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated frame.
    """
    if dataframe is None:
        # Resolved lazily on purpose: a default written as `dataframe=df_fixed`
        # is evaluated once, when the `def` statement runs.
        dataframe = df_fixed
    return dataframe.drop_duplicates(subset=column)


# De-duplicate on the default key column, then print the de-duplicated shape
# followed by the original shape so the number of dropped rows is visible.
# NOTE(review): none of the later cells visible here read `df_no_dupes`; they
# keep cleaning and exporting `df_fixed` - confirm whether the de-duplicated
# frame is meant to be carried forward.
df_no_dupes = no_duplicates()
print(df_no_dupes.shape, df_fixed.shape, sep="\n")

### Find and Replace Outliers

In [None]:
# Function to calculate and replace outliers using the 1.5 × IQR rule
def replace_outliers_with_nan(df, col, factor=1.5):
    """Replace outliers in ``df[col]`` with NaN, in place, using IQR fences.

    A value is kept when it lies inside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (both bounds inclusive), where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. Every other value becomes NaN; values that are already
    NaN stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten on this object and nothing
        is returned.
    col : str
        Name of the numeric column to screen.
    factor : float, default 1.5
        Multiple of the IQR that sets how far the fences sit from the
        quartiles. The default reproduces the usual 1.5 x IQR rule.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # between() is inclusive on both ends and False for NaN, matching the
    # explicit `>= lower & <= upper` test; where() writes NaN where it is False.
    df[col] = values.where(values.between(lower_bound, upper_bound))

# Screen 'Total Spent' for outliers; df_fixed is updated in place.
replace_outliers_with_nan(df=df_fixed, col='Total Spent')

### Find and Replace Null Values

In [None]:
# Non-numeric columns: label every missing entry with the placeholder 'Unknown'.
for text_col in ('Transaction ID', 'Item', 'Payment Method', 'Location', 'Transaction Date'):
    df_fixed[text_col] = df_fixed[text_col].fillna('Unknown')

# The three numeric columns are tied together by
#     Total Spent = Quantity * Price Per Unit
# so a value missing from one of them is rebuilt from the other two. Each mask
# is recomputed from the current state of df_fixed right before it is used.

# Total Spent is missing but Quantity and Price Per Unit are known -> multiply.
mask = df_fixed['Total Spent'].isna() & df_fixed['Quantity'].notna() & df_fixed['Price Per Unit'].notna()
df_fixed.loc[mask, 'Total Spent'] = df_fixed.loc[mask, 'Quantity'] * df_fixed.loc[mask, 'Price Per Unit']

# Price Per Unit is missing but Total Spent and Quantity are known -> divide.
# Rows with Quantity == 0 are left out: dividing by zero would store inf (or NaN
# for 0 / 0), and an inf would also distort the column mean used as the
# fallback for the remaining gaps.
mask = (
    df_fixed['Price Per Unit'].isna()
    & df_fixed['Total Spent'].notna()
    & df_fixed['Quantity'].notna()
    & (df_fixed['Quantity'] != 0)
)
df_fixed.loc[mask, 'Price Per Unit'] = df_fixed.loc[mask, 'Total Spent'] / df_fixed.loc[mask, 'Quantity']

# Quantity is missing but Total Spent and Price Per Unit are known -> divide.
# Rows with Price Per Unit == 0 are left out for the same reason as above.
mask = (
    df_fixed['Quantity'].isna()
    & df_fixed['Total Spent'].notna()
    & df_fixed['Price Per Unit'].notna()
    & (df_fixed['Price Per Unit'] != 0)
)
df_fixed.loc[mask, 'Quantity'] = df_fixed.loc[mask, 'Total Spent'] / df_fixed.loc[mask, 'Price Per Unit']

# Last resort: whatever is still missing in the three numeric columns is
# replaced by that column's own mean.
for numeric_col in ('Quantity', 'Price Per Unit', 'Total Spent'):
    df_fixed[numeric_col] = df_fixed[numeric_col].fillna(df_fixed[numeric_col].mean())

# Keep the two monetary columns at two decimal places.
for money_col in ('Price Per Unit', 'Total Spent'):
    df_fixed[money_col] = df_fixed[money_col].round(decimals=2)

In [None]:
# Show the cleaned frame as this cell's output for a final visual check.
df_fixed

## Write Data to New CSV File

In [None]:
# Save the cleaned frame to disk; index=False keeps the row index out of the file.
df_fixed.to_csv(path_or_buf='cleaned_cafe_data.csv', index=False)