# Transactions Data Preprocessing

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
print("Libraries imported successfully")

## Load Raw Data

In [None]:
# Read the raw transactions CSV into a DataFrame and report its dimensions
transactions = pd.read_csv(filepath_or_buffer="transactions_data.csv")
raw_shape = transactions.shape
print("Original Transactions Data Shape:", raw_shape)

## Data Inspection

In [None]:
# Preview the top rows of the raw table, preceded by a heading line
print("First few rows of the data:", transactions.head(), sep="\n")

In [None]:
# Show the dtype pandas assigned to each column of the raw data
column_dtypes = transactions.dtypes
print("Data Types:")
print(column_dtypes)

In [None]:
# Count the null entries in each column of the raw data
null_counts = transactions.isnull().sum()
print("Missing Values:", null_counts, sep="\n")

In [None]:
# Print the summary produced by DataFrame.describe() for the raw data
summary_stats = transactions.describe()
print("Basic Statistics:")
print(summary_stats)

## Data Cleaning

In [None]:
# Clean a separate deep copy so the raw DataFrame stays available unchanged
transactions_cleaned = transactions.copy(deep=True)

In [None]:
# Define date standardization function
def standardize_date(date_str):
    """Parse a single raw date value with ``pd.to_datetime``.

    Args:
        date_str: One raw value taken from a date column.

    Returns:
        The result of ``pd.to_datetime(date_str)``, or ``pd.NaT`` when the
        value cannot be interpreted as a date.
    """
    try:
        return pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures are mapped to NaT. A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return pd.NaT


print("Date standardization function created")

In [None]:
# Parse Transaction_Date value by value through standardize_date
raw_dates = transactions_cleaned["Transaction_Date"]
transactions_cleaned["Transaction_Date"] = raw_dates.apply(standardize_date)
print("Transaction_Date standardized")

In [None]:
# Replace missing Quantity entries with the column median
quantity = transactions_cleaned["Quantity"]
transactions_cleaned["Quantity"] = quantity.fillna(quantity.median())
print("Missing Quantity handled")

In [None]:
# Replace missing Product_Price entries with the column median
price_median = transactions_cleaned["Product_Price"].median()
transactions_cleaned["Product_Price"] = transactions_cleaned["Product_Price"].fillna(value=price_median)
print("Missing Product_Price handled")

In [None]:
# Replace missing Total_Cost entries with the column median
# NOTE(review): a median fill may not agree with the row's Quantity and
# Product_Price - confirm whether Total_Cost should be derived instead.
total_cost = transactions_cleaned["Total_Cost"]
median_total_cost = total_cost.median()
transactions_cleaned["Total_Cost"] = total_cost.fillna(median_total_cost)
print("Missing Total_Cost handled")

In [None]:
# Handle missing Transaction_ID
# Each row without an ID gets its own new value, counting up from the current
# maximum. Filling every gap with the single value max + 1 would hand the same
# identifier to all of those rows and create duplicates.
missing_ids = transactions_cleaned["Transaction_ID"].isna()
if missing_ids.any():
    highest_id = transactions_cleaned["Transaction_ID"].max()
    # max() is NaN when no row has an ID at all; start numbering at 1 then.
    first_new_id = 1 if pd.isna(highest_id) else highest_id + 1
    new_ids = [first_new_id + offset for offset in range(int(missing_ids.sum()))]
    transactions_cleaned.loc[missing_ids, "Transaction_ID"] = new_ids
print("Missing Transaction_ID handled")

In [None]:
# Handle missing Company_ID and Product_ID
# Identifiers are labels, not quantities: a median can land on a value that no
# existing company/product carries (e.g. the midpoint between two IDs). Fill
# with the most frequent existing ID instead, so every imputed value is one
# that already occurs in the column.
# NOTE(review): this is still an imputation - confirm whether rows lacking an
# ID should be dropped or flagged instead.
for id_column in ("Company_ID", "Product_ID"):
    most_common = transactions_cleaned[id_column].mode(dropna=True)
    # mode() is empty when the column has no non-null values; nothing to fill with.
    if not most_common.empty:
        transactions_cleaned[id_column] = transactions_cleaned[id_column].fillna(most_common.iloc[0])
print("Missing Company_ID and Product_ID handled")

In [None]:
# Remove unnecessary columns
# "Unnamed: 0" is dropped when present. errors="ignore" keeps this cell safe
# to re-run and tolerant of input files that lack the column, instead of
# raising KeyError.
transactions_cleaned = transactions_cleaned.drop(columns="Unnamed: 0", errors="ignore")
print("Unnecessary columns removed")

## Verification

In [None]:
# Recount null entries per column on the cleaned data
remaining_nulls = transactions_cleaned.isnull().sum()
print("Missing Values After Cleaning:", remaining_nulls, sep="\n")

In [None]:
# Print the DataFrame.describe() summary of the cleaned data
cleaned_stats = transactions_cleaned.describe()
print("Cleaned Data Statistics:")
print(cleaned_stats)

## Save Cleaned Data

In [None]:
# Write the cleaned table to CSV without the DataFrame index
transactions_cleaned.to_csv(path_or_buf="transactions_cleaned.csv", index=False)
print("Cleaned transactions data saved to transactions_cleaned.csv")