In [1]:
# ===============================
# PART 1 – DESCRIPTIVE STATISTICS
# 1) Load and inspect
# ===============================

# Imports (kept minimal for this section)
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # used later for plots

# ---------- CONFIG ----------
INPUT_CSV = "smaller_online_retail_clean.csv"
EXCLUDE_ID_COLS = {"CustomerID", "InvoiceNo", "StockCode"}  # identifiers (not analyzed/plotted)

# ---------- LOAD ----------
print(f"Working directory: {os.getcwd()}")
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(
        f"Could not find {INPUT_CSV}. Place it in this working directory or update INPUT_CSV."
    )

df = pd.read_csv(INPUT_CSV)
print(f"Loaded file: {INPUT_CSV}")
print(f"Shape (rows, cols): {df.shape}\n")

# ---------- QUICK PEEK ----------
print("First 10 rows:")
display(df.head(10))

print("\nColumn dtypes:")
display(df.dtypes)

# ---------- EXCLUDE IDENTIFIERS ----------
# Treat CustomerID, InvoiceNo, and StockCode as identifiers only (no stats/plots).
present_ids = [c for c in EXCLUDE_ID_COLS if c in df.columns]
if present_ids:
    print(f"\nIdentifier columns detected and excluded from analysis: {present_ids}")
else:
    print("\nNo identifier columns from the exclusion list were found.")

df_analysis = df.drop(columns=present_ids, errors="ignore")

# ---------- WHAT'S NUMERIC AFTER EXCLUSION ----------
numeric_cols = df_analysis.select_dtypes(include="number").columns.tolist()
print("\nNumeric columns available for descriptive statistics (after excluding IDs):")
print(numeric_cols)

# Helpful expectations for this dataset:
expected_num = {"Quantity", "UnitPrice", "SalesAmount"}
missing_expected = sorted(list(expected_num.difference(set(numeric_cols))))
if missing_expected:
    print("\n[Note] These expected numeric columns were not found as numeric "
          f"(or are missing): {missing_expected}")


Working directory: /home/ubuntu/MEGA/MIS/SEM_3/BI/A5
Loaded file: smaller_online_retail_clean.csv
Shape (rows, cols): (5398, 11)

First 10 rows:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,SalesAmount,InvoiceMonth,IsReturn
0,580816,21378,SMALL TALL CAMPHOR WOOD TOADSTOOL,12,2011-12-06 11:28:00,0.39,15157.0,United Kingdom,4.68,2011-12,False
1,556129,23190,BUNDLE OF 3 SCHOOL EXERCISE BOOKS,48,2011-06-09 10:46:00,1.45,14060.0,United Kingdom,69.6,2011-06,False
2,561614,85099B,JUMBO BAG RED RETROSPOT,10,2011-07-28 12:50:00,2.08,13871.0,United Kingdom,20.8,2011-07,False
3,577358,23241,TREASURE TIN GYMKHANA DESIGN,3,2011-11-18 15:59:00,4.13,,United Kingdom,12.39,2011-11,False
4,561515,15058B,PINK POLKADOT GARDEN PARASOL,1,2011-07-27 15:16:00,10.79,,United Kingdom,10.79,2011-07,False
5,559506,22358,KINGS CHOICE TEA CADDY,1,2011-07-08 15:17:00,2.46,,United Kingdom,2.46,2011-07,False
6,570646,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,6,2011-10-11 12:49:00,7.95,12824.0,United Kingdom,47.7,2011-10,False
7,563038,84508A,CAMOUFLAGE DESIGN TEDDY,240,2011-08-11 15:05:00,2.1,16656.0,United Kingdom,504.0,2011-08,False
8,580677,22887,NUMBER TILE VINTAGE FONT 8,1,2011-12-05 14:40:00,1.95,16200.0,United Kingdom,1.95,2011-12,False
9,C574026,51014C,"FEATHER PEN,COAL BLACK",-1,2011-11-02 12:26:00,0.39,14606.0,United Kingdom,-0.39,2011-11,True



Column dtypes:


InvoiceNo        object
StockCode        object
Description      object
Quantity          int64
InvoiceDate      object
UnitPrice       float64
CustomerID      float64
Country          object
SalesAmount     float64
InvoiceMonth     object
IsReturn           bool
dtype: object


Identifier columns detected and excluded from analysis: ['CustomerID', 'StockCode', 'InvoiceNo']

Numeric columns available for descriptive statistics (after excluding IDs):
['Quantity', 'UnitPrice', 'SalesAmount']


In [2]:
# Columns to analyze
numeric_cols = ["Quantity", "UnitPrice", "SalesAmount"]

# Function to compute MAD (Mean Absolute Deviation from mean)
def mad_from_mean(series):
    mean_val = series.mean()
    return np.mean(np.abs(series - mean_val))

# Collect stats
stats_list = []

for col in numeric_cols:
    s = df[col].dropna()  # drop missing values
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    
    stats = {
        "Variable": col,
        "Count": s.count(),
        "Mean": s.mean(),
        "Median": s.median(),
        "Min": s.min(),
        "Max": s.max(),
        "Range": s.max() - s.min(),
        "Variance": s.var(ddof=1),
        "StdDev": s.std(ddof=1),
        "MAD": mad_from_mean(s),
        "Q1": q1,
        "Q3": q3,
        "IQR": iqr,
        "Skewness": s.skew(),
        "Kurtosis": s.kurt()
    }
    stats_list.append(stats)

# Convert to DataFrame
stats_df = pd.DataFrame(stats_list)

# Save results
stats_df.to_csv("ba_outputs/descriptive_stats_report.csv", index=False)
stats_df


OSError: Cannot save file into a non-existent directory: 'ba_outputs'