# Libraries

In [3]:
import pandas as pd

# Data

In [7]:
%run "variables.py"

In [5]:
def getData(file_names):
    #Import and print data  
    try:
        # Loop through the files and read them
        for file_name, df_name in file_names:
            df = pd.read_csv(f"{DATA_DIR}/{file_name}")
            print(f"{df_name} DataFrame \n")
            display(df.head())
            
    except FileNotFoundError:
        print("One or more CSV files not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

## Initial dataframes

# Data exploration

In [11]:
# Examine Data Shape and Data Types
dataframes = [df_telemetry, df_errors, df_machines, df_maint, df_failures]
dataframe_names = ['df_telemetry', 'df_errors', 'df_machines', 'df_maint', 'df_failures']

for i, df in enumerate(dataframes):
    print(f"DataFrame: {dataframe_names[i]}")
    print("Shape:", df.shape)
    print("Data Types:\n", df.dtypes)
    print("Missing Values:\n", df.isnull().sum())

    # Analyze Numerical Features
    numerical_cols = df.select_dtypes(include=['number']).columns
    if len(numerical_cols) > 0:
        print("\nNumerical Features:")
        print(df[numerical_cols].describe())
    else:
        print("\nNo Numerical Features")

    # Analyze Categorical Features
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        print(f"\nCategorical Feature: {col}")
        print(df[col].value_counts())
    print("-" * 50)

NameError: name 'df_telemetry' is not defined

# Data Pre-processing

In [27]:
# Format date & time. Sort based on date for better readability
tables = [telemetry_df, maint_df, failures_df, errors_df]
for df in tables:
    df["datetime"] = pd.to_datetime(df["datetime"], format="%Y-%m-%d %H:%M:%S")
    df.sort_values(["datetime", "machineID"], inplace=True, ignore_index=True)

## Telemetry

In [28]:
print(f"Shape of the Telemetry Records: {telemetry_df.shape}")
print("\n")
telemetry_df.head()

Shape of the Telemetry Records: (876100, 6)




Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 06:00:00,2,176.558913,424.624162,76.005332,43.767049
2,2015-01-01 06:00:00,3,185.482043,461.211137,87.453199,28.216864
3,2015-01-01 06:00:00,4,169.710847,463.646727,95.929877,38.400372
4,2015-01-01 06:00:00,5,165.082899,452.283576,84.591722,40.298803


In [None]:
import matplotlib.pyplot as plt

# Time Series Analysis for df_telemetry
for machine_id in df_telemetry['machineID'].unique():
    machine_data = df_telemetry[df_telemetry['machineID'] == machine_id]

    # Convert 'datetime' to datetime objects
    machine_data['datetime'] = pd.to_datetime(machine_data['datetime'])

    # Set 'datetime' as index
    machine_data = machine_data.set_index('datetime')

    plt.figure(figsize=(12, 8))

    # Plot each telemetry feature
    for column in ['volt', 'rotate', 'pressure', 'vibration']:
        plt.plot(machine_data.index, machine_data[column], label=column)

        # Calculate and plot rolling mean (e.g., window of 24 hours)
        rolling_mean = machine_data[column].rolling(window=24).mean()
        plt.plot(machine_data.index, rolling_mean, label=f"{column} rolling mean (24h)", linestyle='--')


    plt.title(f"Telemetry Data for Machine {machine_id}")
    plt.xlabel("Datetime")
    plt.ylabel("Value")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Time Series Analysis for df_telemetry using Plotly - All features on the same plot
for machine_id in df_telemetry['machineID'].unique()[:3]: # Only plot for the first 3 machines
    machine_data = df_telemetry[df_telemetry['machineID'] == machine_id].copy()
    machine_data.loc[:, 'datetime'] = pd.to_datetime(machine_data['datetime'])

    # Set 'datetime' as index
    machine_data = machine_data.set_index('datetime')

    # Create a single figure for all features
    fig = go.Figure()

    features = ['volt', 'rotate', 'pressure', 'vibration']
    colors = ['blue', 'red', 'green', 'purple'] # Define colors for each feature

    for i, column in enumerate(features):
        # Add trace for the original data
        fig.add_trace(go.Scattergl(x=machine_data.index, y=machine_data[column], mode='lines', name=column, line=dict(color=colors[i])))

        # Calculate and add trace for rolling mean (e.g., window of 24 hours)
        rolling_mean = machine_data[column].rolling(window=24).mean()
        fig.add_trace(go.Scattergl(x=machine_data.index, y=rolling_mean, mode='lines', name=f"{column} rolling mean (24h)", line=dict(dash='dash', color=colors[i])))

    # Update layout
    fig.update_layout(
        title_text=f"Telemetry Data for Machine {machine_id}",
        height=600,
        xaxis_title="Datetime",
        yaxis_title="Value",
        hovermode='x unified' # Unify hover for better comparison
    )

    # Update x-axis to show datetime with hour and add range slider
    fig.update_xaxes(
        tickformat="%y:%m:%d | %H:%M",
        rangeslider_visible=True
    )

    fig.show()