In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import io

# --- Configuration ---
FILE_PATH = 'my_master_dataset.csv'
PLOT_DIR = 'eda_plots'

def run_eda_analysis(file_path):
    """
    Performs a full EDA, visualization, and preprocessing check 
    on the specified 'combined_data.csv' file.
    """
    print(f"--- Starting Analysis for '{file_path}' ---")

    # --- 1. Setup ---
    # Create a directory to save all plots
    if not os.path.exists(PLOT_DIR):
        os.makedirs(PLOT_DIR)
        print(f"Created directory for plots: ./{PLOT_DIR}/")

    # --- 2. Load Data ---
    try:
        df = pd.read_csv(file_path)
        print(f"\nSuccessfully loaded '{file_path}'. Shape: {df.shape}")
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'.")
        print("Please make sure 'combined_data.csv' is in the same directory.")
        return
    except Exception as e:
        print(f"An error occurred while loading the file: {e}")
        return

    # --- 3. Initial Inspection (Sanity Check) ---
    print("\n--- 3. Initial Inspection ---")
    print("--- Head (First 5 Rows) ---")
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

    print("\n--- Data Info (Types & Non-Null Counts) ---")
    # Capture .info() output to print it cleanly
    buf = io.StringIO()
    df.info(buf=buf)
    print(buf.getvalue())

    print("\n--- Descriptive Statistics (All Columns) ---")
    # .to_markdown() provides cleaner console output
    print(df.describe(include='all').to_markdown(numalign="left", stralign="left"))

    # --- 4. Missing Value Analysis ---
    print("\n--- 4. Missing Value Analysis ---")
    missing_percent = (df.isnull().sum() / len(df)) * 100
    missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)
    
    if missing_percent.empty:
        print("No missing values found. Excellent!")
    else:
        print("Missing Value Percentages:")
        print(missing_percent.to_frame(name='% Missing').to_markdown(numalign="left", stralign="left"))
        
        # Plot missing value heatmap
        plt.figure(figsize=(15, 8))
        sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis')
        plt.title('Missing Value Heatmap')
        plot_path = os.path.join(PLOT_DIR, '01_missing_value_heatmap.png')
        plt.savefig(plot_path)
        plt.close()
        print(f"Saved '01_missing_value_heatmap.png' to {PLOT_DIR}")

    # --- 5. EDA & Visualization (Saving Plots) ---
    print("\n--- 5. EDA & Visualization ---")
    
    # Identify column types
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\nIdentified {len(numerical_cols)} Numerical Columns: {numerical_cols}")
    print(f"Identified {len(categorical_cols)} Categorical Columns: {categorical_cols}")

    # 5.1 Univariate Analysis - Numerical
    print("\nGenerating plots for numerical features...")
    for col in numerical_cols:
        try:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
            fig.suptitle(f'Univariate Analysis: {col}', fontsize=16)
            
            # Histogram / Distplot
            sns.histplot(df[col], kde=True, ax=ax1, bins=30)
            ax1.set_title('Distribution (Histogram)')
            
            # Boxplot
            sns.boxplot(x=df[col], ax=ax2)
            ax2.set_title('Outlier Check (Boxplot)')
            
            plot_path = os.path.join(PLOT_DIR, f'02_univariate_num_{col}.png')
            plt.savefig(plot_path)
            plt.close(fig)
        except Exception as e:
            print(f"Could not plot numerical feature '{col}'. Error: {e}")
            
    print(f"Saved {len(numerical_cols)} numerical plots to {PLOT_DIR}")

    # 5.2 Univariate Analysis - Categorical
    print("\nGenerating plots for categorical features...")
    for col in categorical_cols:
        try:
            plt.figure(figsize=(12, 8))
            
            # If high cardinality, plot top 20 only
            if df[col].nunique() > 20:
                top_categories = df[col].value_counts().nlargest(20).index
                sns.countplot(y=col, data=df[df[col].isin(top_categories)], order=top_categories)
                plt.title(f'Categorical Analysis: {col} (Top 20 Categories)')
            else:
                sns.countplot(y=col, data=df, order=df[col].value_counts().index)
                plt.title(f'Categorical Analysis: {col}')
                
            plt.tight_layout()
            plot_path = os.path.join(PLOT_DIR, f'03_univariate_cat_{col}.png')
            plt.savefig(plot_path)
            plt.close()
        except Exception as e:
            print(f"Could not plot categorical feature '{col}'. Error: {e}")
            
    print(f"Saved {len(categorical_cols)} categorical plots to {PLOT_DIR}")

    # 5.3 Multivariate Analysis - Correlation Heatmap
    print("\nGenerating correlation heatmap...")
    if len(numerical_cols) > 1:
        plt.figure(figsize=(16, 10))
        corr = df[numerical_cols].corr()
        mask = np.triu(np.ones_like(corr, dtype=bool)) # Mask for upper triangle
        sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', center=0, annot_kws={"size": 8})
        plt.title('Numerical Feature Correlation Heatmap')
        plt.tight_layout()
        plot_path = os.path.join(PLOT_DIR, '04_correlation_heatmap.png')
        plt.savefig(plot_path)
        plt.close()
        print(f"Saved '04_correlation_heatmap.png' to {PLOT_DIR}")
    else:
        print("Not enough numerical features (need at least 2) to generate a correlation heatmap.")

    # --- 6. Preprocessing Demonstration (on Combined Data) ---
    print("\n--- 6. Preprocessing Demonstration ---")
    print("This section demonstrates common preprocessing steps on the combined data.")
    print("In a real ML pipeline, you would fit on TRAIN data and transform both TRAIN and TEST.")
    
    # Create a copy to show preprocessing
    df_proc = df.copy()

    # 6.1 Imputation (Example)
    print("\nHandling missing values (Imputation)...")
    
    num_cols_missing = [col for col in numerical_cols if df_proc[col].isnull().any()]
    cat_cols_missing = [col for col in categorical_cols if df_proc[col].isnull().any()]
    
    if num_cols_missing:
        print(f"Imputing numerical columns with MEDIAN: {num_cols_missing}")
        for col in num_cols_missing:
            median_val = df_proc[col].median()
            df_proc[col] = df_proc[col].fillna(median_val)
    
    if cat_cols_missing:
        print(f"Imputing categorical columns with MODE: {cat_cols_missing}")
        for col in cat_cols_missing:
            mode_val = df_proc[col].mode()[0]
            df_proc[col] = df_proc[col].fillna(mode_val)

    if not num_cols_missing and not cat_cols_missing:
        print("No missing values to impute.")

    # 6.2 Encoding (Example)
    print("\nHandling categorical features (One-Hot Encoding)...")
    if categorical_cols:
        print(f"Applying one-hot encoding to: {categorical_cols}")
        df_proc = pd.get_dummies(df_proc, columns=categorical_cols, drop_first=True)
    else:
        print("No categorical columns to encode.")
    
    # --- 7. Final Check (Post-Preprocessing) ---
    print("\n--- 7. Final Check (Post-Preprocessing) ---")
    print("--- Processed Data Head ---")
    print(df_proc.head().to_markdown(index=False, numalign="left", stralign="left"))
    
    print("\n--- Processed Data Info (Final) ---")
    buf = io.StringIO()
    df_proc.info(buf=buf)
    print(buf.getvalue())
    
    print("\n--- Sanity Check Complete ---")
    print(f"All plots saved to the '{PLOT_DIR}' directory.")

# --- Main execution ---
if __name__ == "__main__":
    run_eda_analysis(FILE_PATH)


--- Starting Analysis for 'my_master_dataset.csv' ---
Created directory for plots: ./eda_plots/

Successfully loaded 'my_master_dataset.csv'. Shape: (18147, 17)

--- 3. Initial Inspection ---
--- Head (First 5 Rows) ---
| flight_id            | timestamp   | att_roll    | att_pitch   | att_yaw    | pos_alt_rel   | pos_vx   | pos_vy   | pos_vz   | nav_roll     | nav_pitch   | nav_alt_error   | sys_load   | vib_x      | vib_y      | vib_z      | label   |
|:---------------------|:------------|:------------|:------------|:-----------|:--------------|:---------|:---------|:---------|:-------------|:------------|:----------------|:-----------|:-----------|:-----------|:-----------|:--------|
| normal_corridor_base | 15859       | 0.000336389 | 0.000120005 | -0.070875  | 0             | 0        | 0        | 0        | -9.94215e-05 | 9.44494e-05 | 0               | 0          | 0.00266403 | 0.0027921  | 0.00258406 | 0       |
| normal_corridor_base | 16109       | 0.000345758 | 0.000129839 |

In [3]:
df = pd.read_csv(FILE_PATH)

In [6]:
df.shape

(18147, 17)

In [4]:
df.head()

Unnamed: 0,flight_id,timestamp,att_roll,att_pitch,att_yaw,pos_alt_rel,pos_vx,pos_vy,pos_vz,nav_roll,nav_pitch,nav_alt_error,sys_load,vib_x,vib_y,vib_z,label
0,normal_corridor_base,15859,0.000336,0.00012,-0.070875,0.0,0.0,0.0,0.0,-9.9e-05,9.4e-05,0.0,0.0,0.002664,0.002792,0.002584,0
1,normal_corridor_base,16109,0.000346,0.00013,-0.06823,0.0,0.0,0.0,0.0,-9.9e-05,9.4e-05,0.0,0.0,0.00265,0.002887,0.0025,0
2,normal_corridor_base,16359,0.000354,0.000138,-0.065912,0.0,0.0,0.0,0.0,-9.9e-05,9.4e-05,0.0,0.0,0.002905,0.002861,0.002682,0
3,normal_corridor_base,16609,0.000366,0.000145,-0.063886,0.0,0.0,0.0,0.0,-9.9e-05,9.4e-05,0.0,0.0,0.00307,0.002802,0.00275,0
4,normal_corridor_base,16859,0.000376,0.000154,-0.0621,0.0,0.0,0.0,0.0,-9.9e-05,9.4e-05,0.0,0.0,0.003136,0.002573,0.002589,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18147 entries, 0 to 18146
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   flight_id      18147 non-null  object 
 1   timestamp      18147 non-null  int64  
 2   att_roll       18147 non-null  float64
 3   att_pitch      18147 non-null  float64
 4   att_yaw        18147 non-null  float64
 5   pos_alt_rel    18147 non-null  float64
 6   pos_vx         18147 non-null  float64
 7   pos_vy         18147 non-null  float64
 8   pos_vz         18147 non-null  float64
 9   nav_roll       18147 non-null  float64
 10  nav_pitch      18147 non-null  float64
 11  nav_alt_error  18147 non-null  float64
 12  sys_load       18147 non-null  float64
 13  vib_x          18147 non-null  float64
 14  vib_y          18147 non-null  float64
 15  vib_z          18147 non-null  float64
 16  label          18147 non-null  int64  
dtypes: float64(14), int64(2), object(1)
memory usage: 

In [8]:
df.describe

<bound method NDFrame.describe of                     flight_id  timestamp  att_roll  att_pitch   att_yaw  \
0        normal_corridor_base      15859  0.000336   0.000120 -0.070875   
1        normal_corridor_base      16109  0.000346   0.000130 -0.068230   
2        normal_corridor_base      16359  0.000354   0.000138 -0.065912   
3        normal_corridor_base      16609  0.000366   0.000145 -0.063886   
4        normal_corridor_base      16859  0.000376   0.000154 -0.062100   
...                       ...        ...       ...        ...       ...   
18142  attack_corridor_base_5     260857  0.008967  -0.453047  1.561377   
18143  attack_corridor_base_5     261107 -0.014619  -0.462179  1.562447   
18144  attack_corridor_base_5     261357 -0.004446  -0.499337  1.560480   
18145  attack_corridor_base_5     261607  0.026910  -0.501120  1.556983   
18146  attack_corridor_base_5     261857  0.048641  -0.492745  1.555912   

       pos_alt_rel  pos_vx  pos_vy  pos_vz  nav_roll  nav_pitch  

In [9]:
df.isna().sum()

flight_id        0
timestamp        0
att_roll         0
att_pitch        0
att_yaw          0
pos_alt_rel      0
pos_vx           0
pos_vy           0
pos_vz           0
nav_roll         0
nav_pitch        0
nav_alt_error    0
sys_load         0
vib_x            0
vib_y            0
vib_z            0
label            0
dtype: int64

In [19]:
categorical_vars = df.select_dtypes(include = ['object','category']).columns.tolist()
numerical_vars = df.select_dtypes(include=['number']).columns.tolist()

print(f"Number of categorical variables {len(categorical_vars)}")
print(categorical_vars)
print()
print(f"Number of numerical variables {len(numerical_vars)}")
print(numerical_vars)

Number of categorical variables 1
['flight_id']

Number of numerical variables 16
['timestamp', 'att_roll', 'att_pitch', 'att_yaw', 'pos_alt_rel', 'pos_vx', 'pos_vy', 'pos_vz', 'nav_roll', 'nav_pitch', 'nav_alt_error', 'sys_load', 'vib_x', 'vib_y', 'vib_z', 'label']
