# Libraries

In [3]:
import pandas as pd
import os

# Data Import

## Get data and create dataframes

In [54]:
def getDataFrames(files_names, data_dir):
    """
    Read multiple CSV files and return them as a dictionary of DataFrames.

    Loading is best-effort: a file that is missing or cannot be parsed is
    reported on stdout and skipped, so the result may hold fewer entries
    than ``files_names``.

    Parameters:
        files_names (iterable of tuples): Each tuple contains
            (file_name, df_name); df_name is the key under which the
            loaded DataFrame is stored in the result.
        data_dir (str): Directory where the CSV files are stored.

    Returns:
        dict: Maps each df_name to the pandas DataFrame read from its file.
    """
    dataframes = {}

    for file_name, df_name in files_names:
        try:
            file_path = os.path.join(data_dir, file_name)
            dataframes[df_name] = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"File not found: {file_name}")
        except Exception as e:
            # Any other failure (bad path type, parse error, ...) is reported
            # and the remaining files are still processed.
            print(f"Error reading {file_name}: {e}")

    return dataframes

## Show all DataFrames

In [28]:
def showDataFrame(files_names):
    """
    Print a short preview of every CSV file listed in ``files_names``.

    Files are looked up inside the notebook-level ``DATA_DIR``. For each
    file the name is printed followed by the first rows of the DataFrame.
    The whole loop runs inside one ``try``, so the first file that fails
    to load stops the preview of the remaining files.

    Parameters:
        files_names (iterable of tuples): Each tuple contains
            (file_name, df_name); df_name is only used as the printed label.
    """
    try:
        # Loop through the files and read them
        for file_name, df_name in files_names:
            # os.path.join inserts the separator when DATA_DIR lacks a
            # trailing one, matching how getDataFrames builds its paths.
            df = pd.read_csv(os.path.join(DATA_DIR, file_name))
            print(f"{df_name} DataFrame \n")
            display(df.head())

    except FileNotFoundError:
        print("One or more CSV files not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Data exploration

## Shape, data types, missing values

### Various dataframes

```python
#Function to explore a list of dataframes
def exploreDataFrames(dataframe_list, dataframe_names):
    """
    Print shape, column dtypes and missing-value counts for each DataFrame.

    Parameters:
        dataframe_list (list): DataFrames to summarise.
        dataframe_names (list): Labels, looked up by the position of the
            matching DataFrame in ``dataframe_list``.
    """
    divider = "-" * 70
    for position, frame in enumerate(dataframe_list):
        print(f"DataFrame: {dataframe_names[position]}")
        print(f"\nShape: {frame.shape!s} \n")
        print(f"\nData Types:\n {frame.dtypes!s} \n")
        # Descriptive statistics (frame.describe(include='all')) are
        # currently not printed by this overview.
        print(f"\nMissing Values:\n {frame.isnull().sum()!s} \n")
        print(divider)
``` 

In [59]:
# Function to explore a dictionary of dataframes
def exploreDataFrames(dataframe_dict):
    """
    Print shape, column dtypes and missing-value counts for each DataFrame.

    Parameters:
        dataframe_dict (dict): Maps a display name to its DataFrame.
    """
    divider = "-" * 70
    for label, frame in dataframe_dict.items():
        print(f"\nDataFrame: {label}")
        print(f"Shape: {frame.shape!s}")
        print(f"\nData Types:\n {frame.dtypes!s}")
        # Descriptive statistics (frame.describe(include='all')) are
        # currently not printed by this overview.
        print(f"\nMissing Values:\n {frame.isnull().sum()!s}")
        print(divider)

### Single dataframe

In [49]:
# Function to explore a single DataFrame
def exploreDataFrame(df, df_name):
    """
    Print an overview of one DataFrame.

    The overview lists the shape, the dtype of every column, the output of
    ``describe(include='all')`` and the number of missing values per column.

    Parameters:
        df (pandas.DataFrame): DataFrame to summarise.
        df_name (str): Label printed in the heading.
    """
    print(f"\nExploring DataFrame: {df_name}")
    print(f"Shape: {df.shape!s} \n")
    print(f"Data Types:\n {df.dtypes!s} \n")
    print(f"\nDescriptive Statistics:\n {df.describe(include='all')!s} \n")
    print(f"Missing Values:\n {df.isnull().sum()!s} \n")


## Analyze features
* Numerical features and descriptive statistics
* Categorical features (counts)

### Various dataframes

```python
# Analyze features from a list of dataframes
def analyzeDFsFeatures(dataframe_list, dataframe_names):
    """
    Print feature summaries for every DataFrame in a list.

    Numeric columns are summarised with ``describe()``; each object-dtype
    column gets its value counts.

    Parameters:
        dataframe_list (list): DataFrames to analyse.
        dataframe_names (list): Labels, looked up by the position of the
            matching DataFrame in ``dataframe_list``.
    """
    divider = "-" * 50
    for position, frame in enumerate(dataframe_list):
        print(f"DataFrame: {dataframe_names[position]}")

        # Numerical features
        numeric_columns = frame.select_dtypes(include=['number']).columns
        if len(numeric_columns) == 0:
            print("\nNo Numerical Features")
        else:
            print("\nNumerical Features:")
            print(frame[numeric_columns].describe())

        # Categorical features
        object_columns = frame.select_dtypes(include=['object']).columns
        for column in object_columns:
            print(f"\nCategorical Feature: {column}")
            print(frame[column].value_counts())
        print(divider)
```

In [63]:
# Analyze features from a dictionary of dataframes
def analyzeDFsFeatures(dataframe_dict):
    """
    Print feature summaries for every DataFrame in a dictionary.

    Numeric columns are summarised with ``describe()``; each object-dtype
    column gets its value counts.

    Parameters:
        dataframe_dict (dict): Maps a display name to its DataFrame.
    """
    divider = "-" * 50
    for label, frame in dataframe_dict.items():
        print(f"\nDataFrame: {label}\n")

        # Numerical features
        numeric_columns = frame.select_dtypes(include=['number']).columns
        if len(numeric_columns) == 0:
            print("\nNo Numerical Features")
        else:
            print("\nNumerical Features:")
            print(frame[numeric_columns].describe())

        # Categorical features
        object_columns = frame.select_dtypes(include=['object']).columns
        for column in object_columns:
            print(f"\nCategorical Feature: {column}")
            print(frame[column].value_counts())
        print(divider)

### Analyze a single dataframe

In [47]:
# Analyze features from a single dataframe
def analyzeDFFeatures(df, df_name):
    """
    Print feature summaries for one DataFrame.

    Numeric columns are summarised with ``describe()``; each object-dtype
    column gets its value counts.

    Parameters:
        df (pandas.DataFrame): DataFrame to analyse.
        df_name (str): Label printed in the heading.
    """
    print(f"DataFrame: {df_name}\n")

    # Numerical features
    numeric_columns = df.select_dtypes(include=['number']).columns
    if len(numeric_columns) == 0:
        print("\nNo Numerical Features")
    else:
        print("\nNumerical Features:")
        print(df[numeric_columns].describe())

    # Categorical features
    object_columns = df.select_dtypes(include=['object']).columns
    for column in object_columns:
        print(f"\nCategorical Feature: {column}")
        print(df[column].value_counts())
    print("-" * 70)

## Find outliers

In [None]:
# Function to count potential outliers (IQR rule) in a single DataFrame
def exploreDataframe(df, df_name):
    """
    Print the number of potential outliers per telemetry column.

    Only the DataFrame labelled 'df_telemetry' is analysed. For each of the
    columns volt, rotate, pressure and vibration, a value counts as a
    potential outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile. For any other label a note is
    printed and nothing is computed.

    Parameters:
        df (pandas.DataFrame): Data to inspect; when df_name is
            'df_telemetry' it must contain the four columns listed above.
        df_name (str): Label of the DataFrame; selects the behaviour.
    """
    if df_name != 'df_telemetry':
        # For other dataframes, print a message indicating the need for domain knowledge or visualizations
        print("\nChecking for outliers in other dataframes requires domain knowledge or visualizations.")
        return

    # Check for outliers in telemetry data using IQR
    numerical_cols = ['volt', 'rotate', 'pressure', 'vibration']
    for col in numerical_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Count on the boolean mask instead of copying the matching rows
        # into a new DataFrame just to read its length.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        print(f"\nPotential outliers in {col}:\n", int(is_outlier.sum()))

# Data Pre-processing

## Telemetry