In [8]:
import sys
import pandas as pd
sys.path.append('../src')
from data_loader import get_coffee_sales_df

In [9]:
# Load the coffee sales data through the project-local loader.
df = get_coffee_sales_df()

# Fail fast: later cells call methods on `df` (e.g. `df.isnull()`, `df.dtypes`)
# without any None check, so a failed load would otherwise surface further
# down as an opaque AttributeError on NoneType.
# NOTE(review): assumes the loader signals failure by returning None, as the
# `if df is not None` guard in the overview cell suggests — confirm against
# data_loader.get_coffee_sales_df.
if df is None:
    raise RuntimeError(
        "get_coffee_sales_df() returned None; the coffee sales data could not be loaded."
    )

In [7]:
# First look at the loaded frame: shape, column names, a five-row preview,
# dtypes, and describe() statistics. Nothing is printed when `df` is None.
if df is not None:
    frame_shape = df.shape
    print(f"Dataset shape: {frame_shape}")
    column_names = list(df.columns)
    print(f"Columns: {column_names}")

    # Each section is a (heading, producer) pair. The producer is only called
    # when its section is printed, so values are computed in print order.
    overview_sections = (
        ("\nFirst 5 rows:", df.head),
        ("\nData types:", lambda: df.dtypes),
        ("\nBasic statistics:", df.describe),
    )
    for heading, produce in overview_sections:
        print(heading)
        print(produce())

Dataset shape: (262, 5)
Columns: ['date', 'datetime', 'cash_type', 'money', 'coffee_name']

First 5 rows:
         date             datetime cash_type  money            coffee_name
0  2025-02-08  2025-02-08 14:26:04      cash   15.0                    Tea
1  2025-02-08  2025-02-08 14:28:26      cash   15.0                    Tea
2  2025-02-08  2025-02-08 14:33:04      card   20.0               Espresso
3  2025-02-08  2025-02-08 15:51:04      card   30.0  Chocolate with coffee
4  2025-02-08  2025-02-08 16:35:01      cash   27.0    Chocolate with milk

Data types:
date            object
datetime        object
cash_type       object
money          float64
coffee_name     object
dtype: object

Basic statistics:
           money
count  262.00000
mean    26.29771
std      4.25021
min     15.00000
25%     25.00000
50%     27.00000
75%     29.00000
max     33.00000


In [10]:
# Missing-values report: count and percentage of nulls per column of `df`.
print("=== MISSING VALUES ANALYSIS ===")

# Per-column null counts, and the same counts as a percentage of all rows.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# One row per column of `df`, in the original column order.
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percentage': missing_percentage.values
})

print("Missing values summary:")
# Keep only columns that actually have nulls, worst offenders first.
columns_with_missing = missing_df[missing_df['Missing_Count'] > 0].sort_values(
    'Missing_Percentage', ascending=False
)
if columns_with_missing.empty:
    # Printing an empty frame yields "Empty DataFrame / Columns / Index: []",
    # which reads like a failure; state the result explicitly instead.
    print(f"No missing values found in any of the {len(missing_df)} columns.")
else:
    print(columns_with_missing)

=== MISSING VALUES ANALYSIS ===
Missing values summary:
Empty DataFrame
Columns: [Column, Missing_Count, Missing_Percentage]
Index: []


In [11]:
# Column-level overview: dtypes, number of distinct values per column, and
# the most frequent values of each column (value_counts().head()).
print("=== DATA TYPES AND BASIC INFO ===")
print("\nData types:")
column_dtypes = df.dtypes
print(column_dtypes)

# Snapshot the column labels once; both passes below walk the same list.
column_labels = list(df.columns)

print("\nUnique values per column:")
for label in column_labels:
    print(f"{label}: {df[label].nunique()} unique values")

print("\nSample values from each column:")
for label in column_labels:
    print(f"\n{label}:")
    most_frequent = df[label].value_counts().head()
    print(most_frequent)

=== DATA TYPES AND BASIC INFO ===

Data types:
date            object
datetime        object
cash_type       object
money          float64
coffee_name     object
dtype: object

Unique values per column:
date: 41 unique values
datetime: 254 unique values
cash_type: 2 unique values
money: 10 unique values
coffee_name: 30 unique values

Sample values from each column:

date:
date
2025-02-26    17
2025-02-09    16
2025-02-20    11
2025-03-15    11
2025-03-02    11
Name: count, dtype: int64

datetime:
datetime
2025-02-09 18:01:22    3
2025-02-26 09:41:12    2
2025-02-16 16:41:18    2
2025-02-22 18:38:38    2
2025-03-23 10:25:48    2
Name: count, dtype: int64

cash_type:
cash_type
card    182
cash     80
Name: count, dtype: int64

money:
money
25.0    85
27.0    36
28.0    36
30.0    27
33.0    25
Name: count, dtype: int64

coffee_name:
coffee_name
Americano with milk    44
Latte                  24
Irish whiskey          21
Espresso               18
Tea                    16
Name: count, dtype: int64

In [13]:
# Statistical summary: describe() output for the frame, followed by the full
# value frequency table of every object/category column.
print("=== STATISTICAL SUMMARY ===")
print("\nNumerical columns summary:")
numeric_summary = df.describe()
print(numeric_summary)

print("\nCategorical columns summary:")
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for column_label in categorical_cols:
    print(f"\n{column_label}:")
    frequency_table = df[column_label].value_counts()
    print(frequency_table)

=== STATISTICAL SUMMARY ===

Numerical columns summary:
           money
count  262.00000
mean    26.29771
std      4.25021
min     15.00000
25%     25.00000
50%     27.00000
75%     29.00000
max     33.00000

Categorical columns summary:

date:
date
2025-02-26    17
2025-02-09    16
2025-02-20    11
2025-03-15    11
2025-03-02    11
2025-02-21    11
2025-03-22    10
2025-02-28     9
2025-02-14     9
2025-03-12     8
2025-03-09     8
2025-03-05     8
2025-02-08     8
2025-03-23     8
2025-02-17     8
2025-02-16     8
2025-02-15     7
2025-03-18     6
2025-03-03     6
2025-03-19     5
2025-03-17     5
2025-03-01     5
2025-03-07     5
2025-03-11     5
2025-03-10     4
2025-03-21     4
2025-03-16     4
2025-03-14     4
2025-02-19     4
2025-02-22     4
2025-03-08     4
2025-02-18     4
2025-02-24     4
2025-02-23     4
2025-02-13     4
2025-03-20     3
2025-02-10     3
2025-02-11     2
2025-02-27     2
2025-02-25     2
2025-02-12     1
Name: count, dtype: int64

datetime:
datetime
2025-0