In [2]:
import pandas as pd
from sqlalchemy import create_engine, inspect

In [3]:
# Step 1: Create SQLite engine
engine = create_engine('sqlite:///sales_data.db')

In [4]:
# Step 2: Check if 'sales' table exists
inspector = inspect(engine)
if 'sales' not in inspector.get_table_names():
    print("Table 'sales' not found. Creating sample data...")

     # Create a sample DataFrame
    sample_data = {'sale_id': [1, 2, 3, 4, 4],  # includes a duplicate row (id=4)
               'customer_name': ['Alice', 'Bob', None, 'Charlie', 'Charlie'],
               'product_name': [' Widget A ', 'Widget B', 'Widget C', ' Widget D', ' Widget D '],
               'amount': [120.0, None, -50.0, 300.0, 300.0]
    }
    df_sample = pd.DataFrame(sample_data)
    # Save the DataFrame to the 'sales' table
    df_sample.to_sql('sales', engine, if_exists='replace', index=False)
    print("Sample 'sales' table created.")
else:
    print("Table 'sales' found in database.")



Table 'sales' found in database.


In [5]:
# Step 3: Load the data
df = pd.read_sql("SELECT * FROM sales", engine)
print("\nOriginal Data:\n", df)



Original Data:
    sale_id customer_name product_name  amount
0        1         Alice    Widget A    120.0
1        2           Bob     Widget B     NaN
2        3          None     Widget C   -50.0
3        4       Charlie     Widget D   300.0
4        4       Charlie    Widget D    300.0


In [6]:
# Step 4: Data Inspection
print("\nData Info:")
df.info()
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicate Rows:", df.duplicated().sum())




Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sale_id        5 non-null      int64  
 1   customer_name  4 non-null      object 
 2   product_name   5 non-null      object 
 3   amount         4 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 292.0+ bytes

Missing Values:
 sale_id          0
customer_name    1
product_name     0
amount           1
dtype: int64

Duplicate Rows: 0


In [7]:
# Step 5: Data Cleaning

# Drop duplicate rows
df = df.drop_duplicates()

# Clean 'amount'
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
df['amount'] = df['amount'].fillna(df['amount'].median())
df = df[df['amount'] > 0]  # remove invalid (e.g., negative or zero) amounts

# Clean 'customer_name'
df['customer_name'] = df['customer_name'].fillna("Unknown Customer")

# Clean 'product_name'
df['product_name'] = df['product_name'].astype(str).str.strip().str.lower()



In [8]:
# Step 6: Save cleaned data
print("\nCleaned Data:\n", df)
df.to_csv("cleaned_sales_data.csv", index=False)
print("\nCleaned data saved to 'cleaned_sales_data.csv'")



Cleaned Data:
    sale_id customer_name product_name  amount
0        1         Alice     widget a   120.0
1        2           Bob     widget b   210.0
3        4       Charlie     widget d   300.0
4        4       Charlie     widget d   300.0

Cleaned data saved to 'cleaned_sales_data.csv'
