In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw delay dataset; the path is relative to the notebook's working directory.
file_path = "../Data/raw/incom2024_delay_example_dataset.csv"

# data_log is the untouched base frame that the cleaning cells derive data_log_1 from.
data_log = pd.read_csv(file_path)
# Preview the first rows of the loaded frame.
display(data_log.head())
 

In [None]:
#Check for null values
#data_log.isnull() #This returns a Dataframe with false values indicating that there are no missing or null values
#data_log.isna().any()   # We use this to check for null values in each of the columns
#data_log.isna().sum()  #We use this to count the null values in each column

In [None]:
#Checking for duplicate rows
#data_log.duplicated()

In [None]:
# pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns',None)
#display(data_log)

In [None]:
# Drop the customer/order location columns to build the working frame data_log_1.
# errors='ignore' means a column that is already absent is skipped instead of raising.
location_columns = ["customer_country", "customer_state", "order_state", "order_region"]
data_log_1 = data_log.drop(columns=location_columns, errors='ignore')
display(data_log_1.head())

In [None]:
# Parse order_date into timezone-aware (UTC) timestamps; unparseable values become NaT.
order_timestamps = pd.to_datetime(data_log_1['order_date'], errors='coerce', utc=True)

# Rebuild the frame in one chain:
#   1. store the parsed timestamps back in order_date,
#   2. drop the rows whose order_date could not be parsed (NaT),
#   3. add order_time and order_date_only, both derived from order_date.
# order_date itself stays a datetime column; only the two new columns hold
# plain Python time/date objects (object dtype).
data_log_1 = (
    data_log_1
    .assign(order_date=order_timestamps)
    .dropna(subset=['order_date'])
    .assign(
        order_time=lambda frame: frame['order_date'].dt.time,
        order_date_only=lambda frame: frame['order_date'].dt.date,
    )
)

# Show results
print(data_log_1[['order_date', 'order_date_only', 'order_time']].head())






In [None]:
# Parse shipping_date into timezone-aware (UTC) timestamps; unparseable values become NaT.
shipping_timestamps = pd.to_datetime(data_log_1['shipping_date'], errors='coerce', utc=True)

# Rebuild the frame in one chain:
#   1. store the parsed timestamps back in shipping_date,
#   2. drop the rows whose shipping_date could not be parsed (NaT),
#   3. add shipping_time and shipping_date_only, both derived from shipping_date.
# shipping_date itself stays a datetime column; only the two new columns hold
# plain Python time/date objects (object dtype).
data_log_1 = (
    data_log_1
    .assign(shipping_date=shipping_timestamps)
    .dropna(subset=['shipping_date'])
    .assign(
        shipping_time=lambda frame: frame['shipping_date'].dt.time,
        shipping_date_only=lambda frame: frame['shipping_date'].dt.date,
    )
)

# Show results
print(data_log_1[['shipping_date', 'shipping_date_only', 'shipping_time']].head())


In [None]:
# Preview the working frame, including the derived date/time columns added to it.
display(data_log_1.head())

In [None]:
# Save data_log_1 to CSV
#data_log_1.to_csv("../Data/Cleaned/income2024_cleaned.csv", index=False)
#print("data_log_1 saved to income2024_cleaned.csv")

In [None]:
# Display the file containing the description of the dataset's variables.
# Note: this rebinds file_path, which earlier pointed at the raw dataset CSV.

file_path = "../Data/raw/incom2024_delay_variable_description.csv"

data_desc = pd.read_csv(file_path)
display(data_desc)



In [None]:
data_log_1.columns # List the column labels of the working frame (nothing is renamed here)

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data_log_1.info()

In [None]:
# Count the missing values in every column of the working frame.
data_log_1.isnull().sum()

## UNIVARIATE ANALYSIS ## 

In [None]:
# Partition the column labels by dtype for the univariate analysis below:
#   num_cols -> 64-bit float and integer columns
#   cat_cols -> object-dtype columns (strings, plus any columns holding Python objects)
numeric_dtypes = ['float64', 'int64']
categorical_dtypes = ['object']
num_cols = data_log_1.select_dtypes(include=numeric_dtypes).columns
cat_cols = data_log_1.select_dtypes(include=categorical_dtypes).columns

In [None]:
# Descriptive statistics for every numerical column.
print("\nSummary statistics for numerical data: ")
numeric_summary = data_log_1[num_cols].describe()
display(numeric_summary)

In [None]:
# Frequency table of the distinct values in each categorical column.
print("\nValue counts for categorical data: ")
for cat_col in cat_cols:
    frequencies = data_log_1[cat_col].value_counts()
    print(f"\n{cat_col}:\n", frequencies)


## Visualization

In [None]:
# Histograms (30 bins, with a KDE overlay) for every numerical feature.
for num_col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data_log_1[num_col], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {num_col}")
    ax.set_xlabel(num_col)
    ax.set_ylabel("Frequency")
    plt.show()


In [None]:
# Box plots for every numerical feature, used to check for outliers.
for col in num_cols:
    # Create an explicitly sized figure for each column; without this call the
    # box plot is drawn at the default figure size.
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=data_log_1[col])
    plt.title(f"{col}")
    plt.show()

In [None]:
# Bar chart of the ten most frequent values in each categorical column.
for cat_col in cat_cols:
    plt.figure(figsize=(10, 5))
    top_values = data_log_1[cat_col].value_counts().nlargest(10)
    top_values.plot.bar(color='skyblue')
    plt.title(f"{cat_col}")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

## Bivariate Analysis ##
