# Libraries

In [3]:
import pandas as pd
import os

# Data Import

## Get data and create dataframes

In [54]:
def getDataFrames(files_names, data_dir):
    """
    Read several CSV files and return them as a dictionary of DataFrames.

    Parameters:
        files_names (iterable of tuples): (file_name, df_name) pairs. file_name is the
            CSV file to read inside data_dir; df_name is the key it is stored under.
        data_dir (str): Directory where the CSV files are stored.

    Returns:
        dict: Maps each df_name to the DataFrame read from its file. Files that are
        missing or cannot be read are reported with print() and left out of the result.
    """
    dataframes = {}

    for file_name, df_name in files_names:
        try:
            file_path = os.path.join(data_dir, file_name)
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"File not found: {file_name}")
        except Exception as e:
            # Best-effort loader: one unreadable file must not stop the others.
            print(f"Error reading {file_name}: {e}")
        else:
            # Only store the frame once the read has succeeded.
            dataframes[df_name] = df

    return dataframes

## Show all DataFrames

In [28]:
def showDataFrame(files_names, data_dir=None):
    """
    Read each CSV file and display the first rows of the resulting DataFrame.

    Parameters:
        files_names (iterable of tuples): (file_name, df_name) pairs; df_name is only
            used as the printed title.
        data_dir (str, optional): Directory holding the CSV files. When omitted, the
            module-level DATA_DIR is used.

    Notes:
        Relies on a `display` callable being in scope (presumably the IPython/Jupyter
        built-in). Processing stops at the first file that cannot be read.
    """
    try:
        base_dir = DATA_DIR if data_dir is None else data_dir
        for file_name, df_name in files_names:
            # os.path.join works whether or not the directory ends with a separator.
            df = pd.read_csv(os.path.join(base_dir, file_name))
            print(f"{df_name} DataFrame \n")
            display(df.head())

    except FileNotFoundError:
        print("One or more CSV files not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Data exploration

## Shape, data types, missing values

### Various dataframes

```python
#Function to explore a list of dataframes
def exploreDataFrames(dataframe_list,dataframe_names):
    """Print shape, dtypes and missing-value counts for every DataFrame in a list.

    dataframe_names[k] is the title printed for dataframe_list[k].
    """
    separator = "-" * 70
    for position, frame in enumerate(dataframe_list):
        title = dataframe_names[position]
        print(f"DataFrame: {title}")

        dimensions = frame.shape
        print("\nShape:", dimensions, "\n")

        column_types = frame.dtypes
        print("\nData Types:\n", column_types, "\n")

        # print("\nDescriptive Statistics:\n", frame.describe(include='all'), "\n")
        null_counts = frame.isna().sum()
        print("\nMissing Values:\n", null_counts, "\n")

        print(separator)
``` 

In [59]:
# Function to explore a dictionary of dataframes
def exploreDataFrames(dataframe_dict):
    """Print shape, dtypes and missing-value counts for every DataFrame in a dict.

    The dictionary keys are used as the printed titles.
    """
    separator = "-" * 70
    for title, frame in dataframe_dict.items():
        print(f"\nDataFrame: {title}")

        dimensions = frame.shape
        print("Shape:", dimensions)

        column_types = frame.dtypes
        print("\nData Types:\n", column_types)

        #print("\nDescriptive Statistics:\n", frame.describe(include='all'))
        null_counts = frame.isna().sum()
        print("\nMissing Values:\n", null_counts)

        print(separator)

### Single dataframe

In [65]:
# Function to explore a single DataFrame
def exploreDataFrame(df, df_name):
    """Print the shape, column dtypes and per-column missing-value counts of one DataFrame.

    df_name is only used in the printed title.
    """
    print(f"\nExploring DataFrame: {df_name}\n")

    dimensions = df.shape
    print("Shape:", dimensions, "\n")

    column_types = df.dtypes
    print("Data Types:\n", column_types, "\n")

    #print("\nDescriptive Statistics:\n", df.describe(include='all'), "\n")
    null_counts = df.isna().sum()
    print("Missing Values:\n", null_counts, "\n")


### summarizeDataframe
A Python function that takes a pandas DataFrame as input and returns a new DataFrame summarizing the following information for each column:

    Data Type

    Number of Missing Values

    Missing Values %

    Minimum Value



In [1]:
"""import pandas as pd

def summarizeDataFrame1(df: pd.DataFrame) -> pd.DataFrame:
    summary = pd.DataFrame(index=df.columns)
    
    summary['Data Type'] = df.dtypes
    summary['Number of Missing Values'] = df.isnull().sum()
    summary['Missing Values %'] = (df.isnull().mean() * 100).round(2)
    
    # Initialize min and max with None, to be conditionally filled
    summary['Minimum Value'] = None
    summary['Maximum Value'] = None

    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            summary.at[col, 'Minimum Value'] = df[col].min()
            summary.at[col, 'Maximum Value'] = df[col].max()

    return summary
"""

The summary columns appear in the following order, with these names:

    Column Name
    Data type
    Count
    Missing Values (#)
    Missing Values (%)
    Mean
    STD
    Min
    25%
    50%
    75%
    Max

The function handles numeric columns for statistical calculations and returns None for non-numeric columns in those fields.

In [None]:
def summarizeDataFrame(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-column summary of a DataFrame.

    Parameters:
        df: DataFrame to summarize.

    Returns:
        DataFrame with the columns "Column Name", "Data type", "Count",
        "Missing Values (#)", "Missing Values (%)", "Mean", "STD", "Min", "25%",
        "50%", "75%", "Max", in that order. The statistics from "Mean" onwards are
        only computed for numeric, non-boolean columns and are None for all others.
        The columns are present even when df has no columns.
    """
    summary_columns = [
        "Column Name",
        "Data type",
        "Count",
        "Missing Values (#)",
        "Missing Values (%)",
        "Mean",
        "STD",
        "Min",
        "25%",
        "50%",
        "75%",
        "Max",
    ]
    total_rows = len(df)
    summary_data = []

    # Positional access keeps each column a Series even when labels are duplicated.
    for position, col in enumerate(df.columns):
        col_data = df.iloc[:, position]
        missing_count = col_data.isnull().sum()
        # An empty frame has nothing missing; avoid dividing by zero rows.
        if total_rows:
            missing_percent = round((missing_count / total_rows) * 100, 2)
        else:
            missing_percent = 0.0

        # Booleans are left out of the statistics, in line with the
        # select_dtypes(include=['number']) selection used by the other helpers.
        is_numeric = (
            pd.api.types.is_numeric_dtype(col_data)
            and not pd.api.types.is_bool_dtype(col_data)
        )
        if is_numeric:
            mean = col_data.mean()
            std = col_data.std()
            min_val = col_data.min()
            q25, q50, q75 = col_data.quantile([0.25, 0.50, 0.75]).tolist()
            max_val = col_data.max()
        else:
            mean = std = min_val = q25 = q50 = q75 = max_val = None

        summary_data.append({
            "Column Name": col,
            "Data type": col_data.dtype,
            "Count": col_data.count(),
            "Missing Values (#)": missing_count,
            "Missing Values (%)": missing_percent,
            "Mean": mean,
            "STD": std,
            "Min": min_val,
            "25%": q25,
            "50%": q50,
            "75%": q75,
            "Max": max_val,
        })

    return pd.DataFrame(summary_data, columns=summary_columns)


## Analyze features
* Numerical features and descriptive statistics
* Categorical features (counts)

### Various dataframes

```python
# Analyze features from a list of dataframes
def analyzeDFsFeatures(dataframe_list,dataframe_names):
    """Print descriptive statistics and category counts for each DataFrame in a list.

    dataframe_names[k] is the title printed for dataframe_list[k].
    """
    separator = "-" * 50
    for position, frame in enumerate(dataframe_list):
        title = dataframe_names[position]
        print(f"DataFrame: {title}")

        # Numeric columns: summary statistics from describe()
        numeric_names = frame.select_dtypes(include=['number']).columns
        if numeric_names.empty:
            print("\nNo Numerical Features")
        else:
            print("\nNumerical Features:")
            print(frame[numeric_names].describe())

        # Object columns: frequency of each distinct value
        object_names = frame.select_dtypes(include=['object']).columns
        for column in object_names:
            print(f"\nCategorical Feature: {column}")
            frequencies = frame[column].value_counts()
            print(frequencies)

        print(separator)
```

In [68]:
# Analyze features from a dictionary of dataframes
def analyzeDFsFeatures(dataframe_dict):
    """Print descriptive statistics and category counts for each DataFrame in a dict.

    The dictionary keys are used as the printed titles.
    """
    separator = "-" * 50
    for title, frame in dataframe_dict.items():
        print(f"\nDataFrame: {title}\n")

        # Numeric columns: summary statistics from describe()
        numeric_names = frame.select_dtypes(include=['number']).columns
        if numeric_names.empty:
            print("\nNo Numerical Features:\n")
        else:
            print("\nNumerical Features:\n")
            print(frame[numeric_names].describe())

        # Object columns: frequency of each distinct value
        object_names = frame.select_dtypes(include=['object']).columns
        for column in object_names:
            print(f"\nCategorical Feature: {column}")
            frequencies = frame[column].value_counts()
            print(frequencies)

        print(separator)

### Single dataframe

In [67]:
# Analyze features from a single dataframe
def analyzeDFFeatures(df,df_name):
    """Print descriptive statistics and category counts for a single DataFrame.

    df_name is only used in the printed title.
    """
    print(f"DataFrame: {df_name}\n")

    # Numeric columns: summary statistics from describe()
    numeric_names = df.select_dtypes(include=['number']).columns
    if numeric_names.empty:
        print("\nNo Numerical Features:\n")
    else:
        print("\nNumerical Features:\n")
        print(df[numeric_names].describe())

    # Object columns: frequency of each distinct value
    object_names = df.select_dtypes(include=['object']).columns
    for column in object_names:
        print(f"\nCategorical Feature: {column}")
        frequencies = df[column].value_counts()
        print(frequencies)

    print("-" * 70)

## Find outliers

### Various dataframes

In [69]:
# Function to find outliers in dictionary of DataFrames
def findOutliersDFs(dataframe_dict):
    """Print, for each DataFrame in a dict, how many rows fall outside the IQR fences.

    For every numeric column the fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR;
    rows strictly below or above them are counted as potential outliers.
    """
    for title, frame in dataframe_dict.items():
        print(f"\nDataFrame: {title}\n")

        for column in frame.select_dtypes(include=['number']).columns:
            values = frame[column]
            first_quartile = values.quantile(0.25)
            third_quartile = values.quantile(0.75)
            spread = third_quartile - first_quartile

            low_fence = first_quartile - 1.5 * spread
            high_fence = third_quartile + 1.5 * spread

            below = values < low_fence
            above = values > high_fence
            flagged_rows = frame[below | above]
            print(f"\nPotential outliers in {column}:\n", len(flagged_rows))

        print("-" * 70)


### Single Dataframe

In [70]:
# Function to find outliers in a single DataFrame
def findOutliersDF(df, df_name):
    """Print how many rows of each numeric column fall outside the IQR fences.

    Parameters:
        df (pandas.DataFrame): Data to inspect.
        df_name (str): Title printed before the per-column counts.

    For every numeric column the fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR;
    rows strictly below or above them are counted as potential outliers.
    """
    # Same header as findOutliersDFs, so the output says which frame it describes.
    print(f"\nDataFrame: {df_name}\n")

    numerical_cols = df.select_dtypes(include=['number']).columns
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        print(f"\nPotential outliers in {col}:\n", outliers.shape[0])
 
        

## Distribution

### Boxplots

In [8]:
def matplotlibBoxplot(df,columns):
    """Draw box plots of the selected columns with the pandas plotting API and show them."""
    selected = df[columns]
    figure_size = (12, 8)
    selected.plot(kind='box', figsize=figure_size)
    plt.tight_layout()
    plt.show()

In [11]:
def seabornBoxplot(df, columns, orient='v'):
    """Draw seaborn box plots of the selected columns and show the figure.

    Parameters:
        df (pandas.DataFrame): Source data.
        columns (list): Columns to plot, one box per column.
        orient (str): 'v' (default) for vertical boxes, 'h' for horizontal ones.
            seaborn's 'x' / 'y' spellings are accepted as well.
    """
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df[columns], orient=orient, palette='Set2')
    plt.title("Boxplot of Selected Columns")

    # Label the axis that carries the measurements: y for vertical boxes,
    # x for horizontal ones. The other axis shows the column names.
    if orient in ('h', 'y'):
        plt.xlabel("Value")
    else:
        plt.ylabel("Value")

    plt.tight_layout()
    plt.show()


### Check Distribution and Normality

In [4]:
#import numpy as np
#from scipy.stats import shapiro, normaltest, probplot

def checkDataFrameNormality(df, alpha=0.05, show_plots=True, max_sample_size=5000):
    """Test every numeric column of a DataFrame for normality.

    Each column is checked with the Shapiro-Wilk test (on a random sample of at most
    max_sample_size values) and the D'Agostino test (on all non-null values). A column
    is reported as likely normal only when both p-values exceed alpha.

    Parameters:
        df (pandas.DataFrame): Data to test.
        alpha (float): Significance level the p-values are compared against.
        show_plots (bool): When True, draw a histogram with KDE and a Q-Q plot per
            column. This uses the names `plt` and `sns`, which must already be in
            scope (presumably matplotlib.pyplot and seaborn).
        max_sample_size (int): Upper bound on the number of values passed to the
            Shapiro-Wilk test.

    Returns:
        pandas.DataFrame: One row per tested column with the columns 'Column',
        'Non-Null Count', 'Shapiro p-value', 'D’Agostino p-value' and
        'Likely Normal?'. Likely-normal columns come first. A p-value that could not
        be computed is reported as 'n/a'. Columns with fewer than 8 non-null values
        are skipped. The frame is empty, but keeps its columns, when nothing was tested.
    """
    # Function-scope imports keep the cell runnable on its own.
    import numpy as np
    from scipy.stats import normaltest, probplot, shapiro

    result_columns = [
        'Column',
        'Non-Null Count',
        'Shapiro p-value',
        'D’Agostino p-value',
        'Likely Normal?',
    ]
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    results = []

    for col in numeric_cols:
        data = df[col].dropna()

        if len(data) < 8:
            print(f"⚠️ Skipping '{col}': Not enough non-null values (<8).")
            continue

        # Shapiro-Wilk on a capped, reproducible sample of the column.
        try:
            shapiro_sample = data.sample(min(len(data), max_sample_size), random_state=42)
            shapiro_p = shapiro(shapiro_sample).pvalue
        except Exception:
            shapiro_p = np.nan

        # D’Agostino test on the full column
        try:
            dagostino_p = normaltest(data).pvalue
        except Exception:
            dagostino_p = np.nan

        # A test that failed (NaN p-value) counts as evidence against normality.
        shapiro_ok = not np.isnan(shapiro_p) and shapiro_p > alpha
        dagostino_ok = not np.isnan(dagostino_p) and dagostino_p > alpha
        is_normal = shapiro_ok and dagostino_ok

        results.append({
            'Column': col,
            'Non-Null Count': len(data),
            'Shapiro p-value': round(shapiro_p, 4) if not np.isnan(shapiro_p) else 'n/a',
            'D’Agostino p-value': round(dagostino_p, 4) if not np.isnan(dagostino_p) else 'n/a',
            'Likely Normal?': '✅ Yes' if is_normal else '❌ No'
        })

        # Optional visualization
        if show_plots:
            print(f"\n📊 Analyzing: {col}")
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))

            # Histogram with KDE
            sns.histplot(data, kde=True, ax=axes[0], color='skyblue')
            axes[0].set_title(f"{col} - Histogram + KDE")

            # Q-Q Plot
            probplot(data, dist="norm", plot=axes[1])
            axes[1].set_title(f"{col} - Q-Q Plot")

            plt.tight_layout()
            plt.show()

    # Stable sort: likely-normal columns first, original column order otherwise.
    # Sorting the emoji labels as text would put '❌ No' ahead of '✅ Yes'.
    results.sort(key=lambda row: row['Likely Normal?'] != '✅ Yes')

    # Explicit columns keep the result well-formed when no column was tested.
    return pd.DataFrame(results, columns=result_columns)

## Visualizations

### Line Plots

# Data Pre-processing

# Feature Engineering