## Azure Maintenance Prediction

https://www.kaggle.com/datasets/arnabbiswas1/microsoft-azure-predictive-maintenance

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis
import pywt
from scipy.fft import fft
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

# Load the raw telemetry readings and the error log from the local CSV files.
telemetry_df, errors_df = (
    pd.read_csv(csv_path) for csv_path in ('telemetry.csv', 'errors.csv')
)

# Preview the first rows of the error log.
errors_df.head()

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4


In [2]:
# Count error records per machine and show the machine with the most.
errors_per_machine = errors_df.groupby('machineID').count()
errors_per_machine.sort_values(by='errorID', ascending=False).head(1)

Unnamed: 0_level_0,datetime,errorID
machineID,Unnamed: 1_level_1,Unnamed: 2_level_1
22,60,60


In [3]:
# Count the telemetry rows available for machine 22.
machine_22_telemetry = telemetry_df[telemetry_df['machineID'] == 22]
machine_22_telemetry.groupby('machineID').count()

Unnamed: 0_level_0,datetime,volt,rotate,pressure,vibration
machineID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22,8761,8761,8761,8761,8761


In [4]:
# Parse the timestamps so telemetry and error rows can be joined on `datetime`.
errors_df['datetime'] = pd.to_datetime(errors_df['datetime'])
telemetry_df['datetime'] = pd.to_datetime(telemetry_df['datetime'])

# Select the machine with the most error occurrences.
mID = 22

# Keep only that machine's rows. `.drop(columns=...)` returns a new frame, so
# we never mutate a filtered slice in place (avoids SettingWithCopyWarning).
errors_df = errors_df[errors_df['machineID'] == mID].drop(columns=['machineID'])
telemetry_df = telemetry_df[telemetry_df['machineID'] == mID].drop(columns=['machineID'])

# Preprocess - any error type becomes the binary flag 1.
errors_df['error'] = 1
errors_df = errors_df.drop(columns=['errorID'])

# If several error codes share one timestamp (possible in this log — TODO
# confirm), a left merge would emit one copy of the telemetry row per error
# and duplicate samples downstream. Keep a single flag row per timestamp.
errors_df = errors_df.drop_duplicates(subset='datetime')

# Merge telemetry with errors by timestamp; rows without an error get 0.
# `validate` makes pandas raise if the error side still has duplicate keys.
merged_df = telemetry_df.merge(errors_df, on=['datetime'], how='left', validate='many_to_one')
merged_df['error'] = merged_df['error'].fillna(0)

In [7]:
# Time window (in rows) used to extract the features.
window_size = 3

# Accumulators for the per-window feature dicts and their labels.
data_windows, labels = [], []

# Alias (not a copy) of the merged telemetry/error frame.
machine_data = merged_df

In [8]:
# Feature-extraction helper
def extract_features(window):
    """Summarise one signal window as a dict of scalar features.

    Parameters
    ----------
    window : pandas.Series
        The values of a single signal inside the window.

    Returns
    -------
    dict
        Keys ``mean``, ``median``, ``std``, ``kurtosis`` and ``wavelet``
        (mean of the first coefficient array returned by a level-1 'db1'
        wavelet decomposition). An empty window yields an empty dict.
    """
    if len(window) == 0:
        return {}

    # First array returned by wavedec (approximation coefficients).
    first_band = pywt.wavedec(window, 'db1', level=1)[0]

    # NOTE: Fourier-based features (mean/std of |FFT|) were tried here and are
    # currently disabled.
    return {
        'mean': window.mean(),
        'median': window.median(),
        'std': window.std(),
        'kurtosis': kurtosis(window),
        'wavelet': np.mean(first_band),
    }

In [9]:
# Check whether the 15 previous rows (hours) are all normal, i.e. error-free.
lookback = 15

# rolling(lookback).sum() at row i-1 covers rows i-lookback .. i-1; shifting by
# one lines that total up with row i, which is exactly error[i-lookback:i].sum().
# The first `lookback` rows have an incomplete history (NaN) and stay 0.
# Being vectorised, this also avoids mixing positional .iloc with label-based
# .loc, which only agree when the index is a default 0..n-1 range.
errors_in_lookback = machine_data['error'].rolling(lookback).sum().shift(1)
machine_data['condition_met'] = errors_in_lookback.eq(0).astype(int)

In [10]:
# Sample normal instances (no error on the row and condition_met == 1),
# then add every error row.
num = 100
np.random.seed(42)

is_quiet_normal = (machine_data['error'] == 0) & (machine_data['condition_met'] == 1)
normal_indices = machine_data[is_quiet_normal].index
sampled_normal_indices = np.random.choice(
    normal_indices,
    size=min(num, len(normal_indices)),
    replace=False,
)

error_indices = machine_data[machine_data['error'] == 1].index
sampled_indices = sorted([*sampled_normal_indices, *error_indices])


In [11]:
# Start from empty accumulators so that re-running this cell rebuilds the
# feature table instead of appending duplicate rows to lists left over from a
# previous run.
data_windows = []
labels = []
signal_columns = ['volt', 'rotate', 'pressure', 'vibration']

for i in sampled_indices:
    # A full window of history must exist before row i; skip indices without it.
    if not (window_size <= i < len(machine_data)):
        continue

    # Features come from the `window_size` rows strictly before row i;
    # the label is taken from row i itself.
    window = machine_data.iloc[i - window_size:i]
    features = {
        f'{column}_{key}': value
        for column in signal_columns
        for key, value in extract_features(window[column]).items()
    }
    data_windows.append(features)
    labels.append(machine_data.iloc[i]['error'])

# Build the feature DataFrame.
feature_df = pd.DataFrame(data_windows)
feature_df['label'] = labels

In [13]:
# Downsample the normal class so the classifier does not simply predict
# "normal" for everything.
sample = 100
normal_df = feature_df[feature_df['label'] == 0]
error_df = feature_df[feature_df['label'] == 1]

# resample(replace=False) raises ValueError when n_samples exceeds the number
# of available rows, so cap the request at what is actually there.
normal_df_downsampled = resample(
    normal_df,
    replace=False,
    n_samples=min(sample, len(normal_df)),
    random_state=42,
)
balanced_df = pd.concat([normal_df_downsampled, error_df])

# Train / test split, stratified so both sets keep the class ratio.
X = balanced_df.drop('label', axis=1)
y = balanced_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Random Forest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Print metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[26  4]
 [ 7 11]]
              precision    recall  f1-score   support

         0.0       0.79      0.87      0.83        30
         1.0       0.73      0.61      0.67        18

    accuracy                           0.77        48
   macro avg       0.76      0.74      0.75        48
weighted avg       0.77      0.77      0.77        48



In [22]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Build time windows for the LSTM
def create_windows(df, samples_per_window, indices):
    """Build one raw sensor window per requested row index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``volt``, ``rotate``, ``pressure``, ``vibration`` and
        ``error`` columns.
    samples_per_window : int
        Number of rows in each window.
    indices : iterable of int
        Row positions to label. Positions without ``samples_per_window`` rows
        of history, or past the end of ``df``, are skipped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` holds, for each kept position ``i``, the sensor values of rows
        ``i - samples_per_window`` .. ``i - 1``; ``y`` holds ``error`` at row ``i``.
    """
    sensor_columns = ['volt', 'rotate', 'pressure', 'vibration']
    n_rows = len(df)
    usable = [i for i in indices if samples_per_window <= i < n_rows]

    X = [df.iloc[i - samples_per_window:i][sensor_columns].values for i in usable]
    y = [df.iloc[i]['error'] for i in usable]
    return np.array(X), np.array(y)

# Seed the Python, NumPy and TensorFlow RNGs so the stochastic parts of LSTM
# training can be repeated from run to run (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

X, y = create_windows(machine_data, window_size, sampled_indices)

# Encode labels as integers, then one-hot for the softmax output.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.3, random_state=42, stratify=y_categorical)

# LSTM model: one recurrent layer over (window_size, 4 sensors), softmax head.
model = Sequential()
model.add(LSTM(100, input_shape=(window_size, 4)))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_split=0.1)

# Turn softmax outputs / one-hot targets back into class ids for the metrics.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print(confusion_matrix(y_true_classes, y_pred_classes))
print(classification_report(y_true_classes, y_pred_classes))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[29  1]
 [12  6]]
              precision    recall  f1-score   support

           0       0.71      0.97      0.82        30
           1       0.86      0.33      0.48        18

    accuracy                           0.73        48
   macro avg       0.78      0.65      0.65        48
weighted avg       0.76      0.73      0.69        48



## HAR - Human Activity Recognition

https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv('time_series_data_human_activities.csv')

# Convert to seconds (assumes `timestamp` is in nanoseconds — TODO confirm).
df['time'] = df['timestamp'] / 1e9

# Train / test split on the raw per-sample accelerometer readings.
axis_columns = ['x-axis', 'y-axis', 'z-axis']
X = df[axis_columns]
y = df['activity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest
# NOTE: slow to run (about 2 minutes on the author's PC). Be patient :)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')

              precision    recall  f1-score   support

  Downstairs       0.32      0.13      0.19     20129
     Jogging       0.70      0.70      0.70     65138
     Sitting       0.98      0.98      0.98     12099
    Standing       0.83      0.87      0.85      9591
    Upstairs       0.35      0.15      0.21     24350
     Walking       0.60      0.77      0.67     83418

    accuracy                           0.64    214725
   macro avg       0.63      0.60      0.60    214725
weighted avg       0.60      0.64      0.61    214725

Accuracy: 0.64


In [5]:
from scipy.signal import welch
import pywt

window_size = 1  # window length, in seconds
overlap = 0.5  # consecutive windows share 50% of their samples

# Sampling rate, estimated from the time gap between the first two samples only.
first_gap = df['time'].iloc[1] - df['time'].iloc[0]
sampling_rate = int(1 / first_gap)

# Window length and hop, both expressed in samples.
samples_per_window = int(window_size * sampling_rate)
step_size = int(samples_per_window * (1 - overlap))

# Feature extraction
def extract_features(window):
    """Compute per-axis summary features for one accelerometer window.

    Parameters
    ----------
    window : pandas.DataFrame
        Rows of the window; must contain ``x-axis``, ``y-axis`` and ``z-axis``.

    Returns
    -------
    dict
        For each axis: ``<axis>_mean``, ``<axis>_std``, ``<axis>_max``,
        ``<axis>_min``, plus ``<axis>_wavelet_<i>_mean`` / ``_std`` for every
        coefficient array returned by a level-2 'db1' wavelet decomposition.
    """
    features = {}
    for axis in ('x-axis', 'y-axis', 'z-axis'):
        signal = window[axis]

        # Time-domain features.
        features.update({
            f'{axis}_mean': np.mean(signal),
            f'{axis}_std': np.std(signal),
            f'{axis}_max': np.max(signal),
            f'{axis}_min': np.min(signal),
        })

        # NOTE: frequency-domain features via Welch's method (mean/std of the
        # PSD) were tried here and are currently disabled.

        # Wavelet-transform features: mean and std of each coefficient array.
        for band_idx, band in enumerate(pywt.wavedec(signal, 'db1', level=2)):
            features[f'{axis}_wavelet_{band_idx}_mean'] = np.mean(band)
            features[f'{axis}_wavelet_{band_idx}_std'] = np.std(band)

    return features

# Prepare the windowed dataset
X = []
y = []

# The `+ 1` lets the range include the last start position that still fits a
# full window; without it the final complete window is dropped whenever
# (len(df) - samples_per_window) is a multiple of step_size. Every slice is
# therefore exactly samples_per_window rows long, so no length check is needed.
for start in range(0, len(df) - samples_per_window + 1, step_size):
    window = df.iloc[start:start + samples_per_window]
    X.append(extract_features(window))
    # Label the window with its most frequent activity.
    y.append(window['activity'].mode()[0])

X = pd.DataFrame(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest
# Run time on the author's PC: 1:40
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')


              precision    recall  f1-score   support

  Downstairs       0.82      0.63      0.71      2027
     Jogging       0.96      0.99      0.97      6429
     Sitting       1.00      0.98      0.99      1163
    Standing       0.98      0.98      0.98       963
    Upstairs       0.81      0.67      0.73      2514
     Walking       0.89      0.98      0.93      8377

    accuracy                           0.91     21473
   macro avg       0.91      0.87      0.89     21473
weighted avg       0.91      0.91      0.91     21473

Accuracy: 0.91


In [7]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Build time windows
def create_windows(df, samples_per_window, step_size):
    """Slice ``df`` into fixed-length, possibly overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x-axis``, ``y-axis``, ``z-axis`` and ``activity`` columns.
    samples_per_window : int
        Number of consecutive rows in each window.
    step_size : int
        Number of rows between the starts of consecutive windows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks the three axis columns of every window; ``y`` holds each
        window's most frequent ``activity``. Both are empty when ``df`` has
        fewer than ``samples_per_window`` rows.
    """
    axis_columns = ['x-axis', 'y-axis', 'z-axis']
    X = []
    y = []
    # `+ 1` keeps the last start position that still fits a full window; the
    # previous stop value silently dropped it when the lengths lined up exactly.
    for start in range(0, len(df) - samples_per_window + 1, step_size):
        window = df.iloc[start:start + samples_per_window]
        X.append(window[axis_columns].values)
        y.append(window['activity'].mode()[0])
    return np.array(X), np.array(y)

# Seed the Python, NumPy and TensorFlow RNGs so the stochastic parts of LSTM
# training can be repeated from run to run (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

X, y = create_windows(df, samples_per_window, step_size)

# Encode labels as integers, then one-hot for the softmax output.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# LSTM model: one recurrent layer over (samples_per_window, 3 axes), softmax head.
# Run time on the author's PC: 2:05
model = Sequential()
model.add(LSTM(100, input_shape=(samples_per_window, 3)))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Turn softmax outputs / one-hot targets back into class ids for the metrics.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

print(classification_report(y_true_classes, y_pred_classes))
print(f'Accuracy: {accuracy_score(y_true_classes, y_pred_classes):.2f}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      2027
           1       0.98      0.99      0.99      6429
           2       0.99      0.98      0.98      1163
           3       0.97      0.98      0.98       963
           4       0.86      0.86      0.86      2514
           5       0.98      0.97      0.98      8377

    accuracy                           0.96     21473
   macro avg       0.94      0.94      0.94     21473
weighted avg       0.96      0.96      0.96     21473

Accuracy: 0.96
