In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
#import file - we used body measurements (posted in teams)
filepath = "data\\body measurements.csv"
df = pd.read_csv(filepath)

#clean data
df = df.dropna() # drop data with null values

#drop rows with outliers
float_cols = df.select_dtypes(include=['float64']).columns
Q1 = df[float_cols].quantile(0.25)
Q3 = df[float_cols].quantile(0.75)
IQR = Q3 - Q1
mask = ~((df[float_cols] < (Q1 - 1.5 * IQR)) | (df[float_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
df_clean = df[mask]

#remove duplicate rows
df = df.drop_duplicates()

In [None]:

# -------------------------------
# 🧼 Step 1: Data Cleaning Function
# -------------------------------
def clean_data(df):
    df = df.dropna()
    float_cols = df.select_dtypes(include=['float64', 'int64']).columns
    Q1 = df[float_cols].quantile(0.25)
    Q3 = df[float_cols].quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((df[float_cols] < (Q1 - 1.5 * IQR)) | (df[float_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
    df = df[mask]
    df = df.drop_duplicates()
    return df

# -------------------------------
# 👤 Step 2: Ask User for Plots
# -------------------------------
def ask_user_for_plots(df):
    print("\n🧠 Columns in your data:")
    print(df.columns.tolist())
    
    plots = []
    
    hist_col = input("\n📊 Enter column name for histogram (or press Enter to skip): ")
    if hist_col in df.columns:
        plots.append(('hist', hist_col))
    
    scatter_cols = input("\n🔁 Enter two columns for scatter plot (comma separated, or press Enter to skip): ")
    if scatter_cols:
        try:
            x, y = [col.strip() for col in scatter_cols.split(',')]
            if x in df.columns and y in df.columns:
                plots.append(('scatter', x, y))
        except:
            print("❌ Invalid input, skipping scatter plot.")
    
    heatmap = input("\n🌡 Generate correlation heatmap? (yes/no): ")
    if heatmap.lower() == 'yes':
        plots.append(('heatmap',))
    
    return plots

# -------------------------------
# 📈 Step 3: Generate Dashboard
# -------------------------------
def generate_dashboard(df, plots, output_dir="dashboard_output"):
    os.makedirs(output_dir, exist_ok=True)
    
    for plot in plots:
        if plot[0] == 'hist':
            col = plot[1]
            plt.figure()
            df[col].hist(bins=30)
            plt.title(f"Histogram of {col}")
            plt.xlabel(col)
            plt.ylabel("Frequency")
            plt.tight_layout()
            plt.savefig(f"{output_dir}/hist_{col}.png")
            plt.close()
        
        elif plot[0] == 'scatter':
            x, y = plot[1], plot[2]
            plt.figure()
            df.plot(kind='scatter', x=x, y=y)
            plt.title(f"{x} vs {y}")
            plt.tight_layout()
            plt.savefig(f"{output_dir}/scatter_{x}_{y}.png")
            plt.close()
        
        elif plot[0] == 'heatmap':
            plt.figure(figsize=(10, 8))
            sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap="coolwarm")
            plt.title("Correlation Heatmap")
            plt.tight_layout()
            plt.savefig(f"{output_dir}/heatmap.png")
            plt.close()

    print(f"\n✅ Dashboard figures saved in '{output_dir}' folder!")

# -------------------------------
# 🧪 Step 4: Run the Full Pipeline
# -------------------------------
def main():
    filepath = "data\\body measurements.csv"  # Change path if needed
    print("📂 Loading file:", filepath)
    
    try:
        df = pd.read_csv(filepath)
        print("✅ File loaded successfully.")
    except Exception as e:
        print("❌ Error loading file:", e)
        return
    
    print("\n🧼 Cleaning data...")
    df_clean = clean_data(df)
    
    print("\n📋 Data cleaning complete.")
    print(f"🔢 Remaining rows after cleaning: {len(df_clean)}")
    
    print("\n📊 Let's build your dashboard!")
    selected_plots = ask_user_for_plots(df_clean)
    
    print("\n🎨 Generating visualizations...")
    generate_dashboard(df_clean, selected_plots)
    
    # Optional: Save cleaned data
    df_clean.to_csv("dashboard_output/cleaned_data.csv", index=False)
    print("📝 Cleaned data saved to 'dashboard_output/cleaned_data.csv'")
    
    print("\n🎉 Done! Explore your dashboard in the output folder.")

# -------------------------------
# 🚀 Entry Point
# -------------------------------
if __name__ == "__main__":
    main()

📂 Loading file: data\body measurements.csv
✅ File loaded successfully.

🧼 Cleaning data...

📋 Data cleaning complete.
🔢 Remaining rows after cleaning: 213

📊 Let's build your dashboard!

🧠 Columns in your data:
['index', 'Unnamed: 0', 'age', 'num_children', 'gender', 'reported_height', 'reported_weight', 'reported_pants_size_waist', 'reported_pants_size_inseam', 'bra_size_chest', 'bra_size_cup', 'age_range', 'shoe_size_us', 'ankle_circum', 'spine_to_scye_len', 'spine_to_elbow_len', 'arm_len_spine_wrist', 'arm_len_shoulder_wrist', 'arm_len_shoulder_elbow', 'scye_circum', 'chest_circum', 'chest_circum_below_bust', 'chest_circum_scye', 'jean_inseam', 'hand_len', 'hip_circum', 'hip_height', 'neck_circum_base', 'bideltoid_breadth', 'height', 'thigh_circum_proximal', 'u_crotch', 'waist_circum_preferred', 'waist_height_preferred', 'weight', 'biacromial_breadth', 'bicristal_breadth', 'bust_to_bust', 'cervical_height', 'chest_height', 'interscye_dist', 'acromion_height', 'acromion_radial_len', 

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
from dash import Dash, dcc, html, Input, Output

# --- Data Cleaning ---
def clean_data(df):
    df = df.dropna()  # Remove missing values
    df = df.drop_duplicates()  # Remove duplicates
    
    # Remove outliers using IQR
    float_cols = df.select_dtypes(include=['float64', 'int']).columns
    Q1 = df[float_cols].quantile(0.25)
    Q3 = df[float_cols].quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((df[float_cols] < (Q1 - 1.5 * IQR)) | (df[float_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
    df_clean = df[mask]
    return df_clean

# --- Load & Clean Data ---
filepath = "data\\body measurements.csv"
df = pd.read_csv(filepath)
df_clean = clean_data(df)

# --- Launch Dashboard ---
app = Dash(__name__)
numeric_cols = df_clean.select_dtypes(include=['float64', 'int']).columns

app.layout = html.Div([
    html.H1("Automated Dashboard for Cleaned Data"),
    
    html.Label("Choose Plot Type:"),
    dcc.Dropdown(
        options=[
            {'label': 'Histogram', 'value': 'hist'},
            {'label': 'Box Plot', 'value': 'box'},
            {'label': 'Scatter Plot', 'value': 'scatter'}
        ],
        id='plot-type',
        value='hist'
    ),
    
    html.Label("X-axis:"),
    dcc.Dropdown(id='x-axis', options=[{'label': col, 'value': col} for col in numeric_cols], value=numeric_cols[0]),
    
    html.Label("Y-axis (only for Scatter Plot):"),
    dcc.Dropdown(id='y-axis', options=[{'label': col, 'value': col} for col in numeric_cols], value=numeric_cols[1]),
    
    dcc.Graph(id='graph')
])

@app.callback(
    Output('graph', 'figure'),
    Input('plot-type', 'value'),
    Input('x-axis', 'value'),
    Input('y-axis', 'value')
)
def update_graph(plot_type, x, y):
    if plot_type == 'hist':
        fig = px.histogram(df_clean, x=x)
    elif plot_type == 'box':
        fig = px.box(df_clean, y=x)
    elif plot_type == 'scatter':
        fig = px.scatter(df_clean, x=x, y=y)
    else:
        fig = {}
    return fig

if __name__ == '__main__':
    app.run(debug=True)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 60, in update_graph(
    plot_type='hist',
    x=None,
    y='Unnamed: 0'
)
     52 @app.callback(
     53     Output('graph', 'figure'),
     54     Input('plot-type', 'value'),
   (...)
     57 )
     58 def update_graph(plot_type, x, y):
     59     if plot_type == 'hist':
---> 60         fig = px.histogram(df_clean, x=x)
        df_clean =       index  Unnamed: 0   age  num_children  gender  reported_height  \
73       73     11001.0  39.0           2.0  female            68.11   
89       89     11035.0  38.0           1.0  female            68.90   
93       93     11042.0  57.0           0.0  female            66.14   
99       99     11054.0  37.0           0.0  female            68.90   
116     116     11096.0  24.0           0.0  female            67.72   
...     ...         ...   ...           ...     ... 