In [5]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import re

def clean_filename(name):
    """Return *name* with filename-unsafe characters replaced by underscores.

    Every character that is not a word character, hyphen, underscore, or dot
    is substituted one-for-one with ``_``, so the result has the same length
    as the input.
    """
    unsafe_chars = r'[^\w\-_.]'
    sanitized = re.sub(unsafe_chars, '_', name)
    return sanitized

# Directory that receives one distribution plot per numeric column.
PLOT_DIR = "EDA plots"
os.makedirs(PLOT_DIR, exist_ok=True)


df = pd.read_csv('Glucose_export.csv')

# The timestamp is not used by the univariate summaries and plots below, so it
# is dropped without being parsed first. errors='ignore' makes this a no-op
# when the export has no 'Timestamp' column (the previous code indexed the
# column before checking that it existed, which raised KeyError in that case).
df = df.drop(columns=['Timestamp'], errors='ignore')

# Summary statistics, per-column missing-value counts (most missing first),
# and medians of the numeric columns.
print("Basic Stats:\n", df.describe())
missing_values = df.isnull().sum()
missing_values_sorted = missing_values.sort_values(ascending=False)
print(missing_values_sorted)
print(df.median(numeric_only=True))


# For every numeric column, save a side-by-side histogram (with KDE) and
# boxplot of its non-missing values.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    data = df[col].dropna()
    # A column with fewer than two distinct values has no distribution to show.
    if data.nunique() < 2:
        continue

    fig = None
    try:
        fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

        # Histogram
        sns.histplot(data, kde=True, bins=20, color='skyblue', ax=ax_hist)
        ax_hist.set_title(f'Histogram of {col}')
        ax_hist.set_xlabel(col)
        ax_hist.set_ylabel('Frequency')

        # Boxplot
        sns.boxplot(x=data, color='lightgreen', ax=ax_box)
        ax_box.set_title(f'Boxplot of {col}')
        ax_box.set_xlabel(col)

        fig.tight_layout()
        filename = clean_filename(col)
        fig.savefig(
            os.path.join(PLOT_DIR, f"{filename}_distplot.png"),
            dpi=300,
            bbox_inches='tight',
        )
    except Exception as e:
        # Best-effort: one column that fails to plot must not stop the others.
        print(f"Skipping {col} due to error: {e}")
    finally:
        # Close the figure even when plotting or saving failed, so failed
        # columns do not leave figures open.
        if fig is not None:
            plt.close(fig)

Basic Stats:
        Glucose value (mmol/l)  Duration (minutes)  Insuline units (basal)  \
count            25925.000000        12655.000000            12655.000000   
mean                 8.895869           10.102252                0.815612   
std                  2.868333           16.927186                0.572740   
min                  2.300000            1.000000                0.000000   
25%                  6.700000            5.000000                0.462000   
50%                  8.400000            5.000000                0.685000   
75%                 10.700000            5.000000                1.100500   
max                 20.400000          280.000000                3.609000   

       BG_input (mmol/l)  Carbohydrates (g)  Carb ratio  \
count        1106.000000        1106.000000  731.000000   
mean           10.551552          15.297468   12.777018   
std             3.175269          15.163636    0.975490   
min             0.000000           0.000000   12.000000 