# Step 1: getting and transforming the data
### Reading the data

In [35]:
import pandas as pd

# Read the raw sensor readings; the 'timestamp' column is parsed as datetimes
df = pd.read_csv(
    'sensor.csv',
    parse_dates=['timestamp'],
)


### Split the data into training, validation, and testing sets

In [36]:

# Chronological split on 'timestamp':
#   train -> strictly before 2018-07-01
#   valid -> from 2018-07-01 up to (not including) 2018-08-01
#   test  -> from 2018-08-01 onwards
timestamps = df['timestamp']
is_train = timestamps < '2018-07-01'
is_valid = (timestamps >= '2018-07-01') & (timestamps < '2018-08-01')
is_test = timestamps >= '2018-08-01'

df_train = df[is_train]
df_valid = df[is_valid]
df_test = df[is_test]


### Write the data

In [38]:
# Persist each split to its own CSV file, without writing the DataFrame index
for split_frame, csv_name in (
    (df_train, 'train.csv'),
    (df_valid, 'valid.csv'),
    (df_test, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

------------

-----

# Step 2: create the model and the drawer

### Given this is anomaly detection on time-series data, one possible method is to use an Isolation Forest. This is an unsupervised learning algorithm that works well for anomaly detection.

### Loading the training part

In [53]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the training split back from disk and keep only the columns used for
# modelling: 'Unnamed: 0', 'timestamp' and 'machine_status' are discarded.
unused_columns = ['Unnamed: 0', 'timestamp', 'machine_status']
df_train = (
    pd.read_csv('train.csv', parse_dates=['timestamp'])
    .drop(columns=unused_columns)
)

# Replace missing values with the mean of their own column.
# A column whose mean is itself NaN is left unfilled by this step.
column_means = df_train.mean()
df_train = df_train.fillna(column_means)




### Data transformations

In [54]:
# Per-column variance of the training features
column_variances = df_train.var()
print(column_variances)

sensor_00         0.167686
sensor_01         6.860247
sensor_02        10.313073
sensor_03         4.790412
sensor_04     19691.077418
sensor_05       365.869523
sensor_06         4.568557
sensor_07         5.175857
sensor_08         4.069469
sensor_09         4.671920
sensor_10       146.821045
sensor_11       146.540876
sensor_12       100.854835
sensor_13        20.465962
sensor_14     19273.190478
sensor_15              NaN
sensor_16     24060.658960
sensor_17     25932.878453
sensor_18         0.905067
sensor_19     59759.872045
sensor_20     15597.601355
sensor_21     77427.332346
sensor_22     33563.133204
sensor_23    109951.805812
sensor_24     49951.411656
sensor_25     72442.435995
sensor_26     81863.037311
sensor_27     29013.023414
sensor_28    139018.796364
sensor_29     82985.104657
sensor_30     53066.943340
sensor_31    122393.236814
sensor_32     88870.048974
sensor_33     36159.822098
sensor_34      7439.922658
sensor_35     19362.868357
sensor_36     95662.522637
s

In [55]:
# Number of missing values remaining in each column
missing_per_column = df_train.isna().sum()
print(missing_per_column)


sensor_00         0
sensor_01         0
sensor_02         0
sensor_03         0
sensor_04         0
sensor_05         0
sensor_06         0
sensor_07         0
sensor_08         0
sensor_09         0
sensor_10         0
sensor_11         0
sensor_12         0
sensor_13         0
sensor_14         0
sensor_15    131040
sensor_16         0
sensor_17         0
sensor_18         0
sensor_19         0
sensor_20         0
sensor_21         0
sensor_22         0
sensor_23         0
sensor_24         0
sensor_25         0
sensor_26         0
sensor_27         0
sensor_28         0
sensor_29         0
sensor_30         0
sensor_31         0
sensor_32         0
sensor_33         0
sensor_34         0
sensor_35         0
sensor_36         0
sensor_37         0
sensor_38         0
sensor_39         0
sensor_40         0
sensor_41         0
sensor_42         0
sensor_43         0
sensor_44         0
sensor_45         0
sensor_46         0
sensor_47         0
sensor_48         0
sensor_49         0


In [56]:
# Drop 'sensor_15': mean-imputation could not fill it (131040 values were
# still missing afterwards and its variance is NaN).
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# once the column is gone, a second execution is a no-op instead of a KeyError.
df_train = df_train.drop(columns='sensor_15', errors='ignore')

# Fill any NaN values that remain with the mean of their column
df_train = df_train.fillna(df_train.mean())

# Then, check again if any NaN values still exist
print(df_train.isna().sum())

sensor_00    0
sensor_01    0
sensor_02    0
sensor_03    0
sensor_04    0
sensor_05    0
sensor_06    0
sensor_07    0
sensor_08    0
sensor_09    0
sensor_10    0
sensor_11    0
sensor_12    0
sensor_13    0
sensor_14    0
sensor_16    0
sensor_17    0
sensor_18    0
sensor_19    0
sensor_20    0
sensor_21    0
sensor_22    0
sensor_23    0
sensor_24    0
sensor_25    0
sensor_26    0
sensor_27    0
sensor_28    0
sensor_29    0
sensor_30    0
sensor_31    0
sensor_32    0
sensor_33    0
sensor_34    0
sensor_35    0
sensor_36    0
sensor_37    0
sensor_38    0
sensor_39    0
sensor_40    0
sensor_41    0
sensor_42    0
sensor_43    0
sensor_44    0
sensor_45    0
sensor_46    0
sensor_47    0
sensor_48    0
sensor_49    0
sensor_50    0
sensor_51    0
dtype: int64


### Train the model

In [57]:
# Scale the features: the raw sensor variances span several orders of
# magnitude, so every column is standardized before fitting.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_train),
    columns=df_train.columns,
    index=df_train.index,  # keep row labels aligned with df_train
)

# Define the model.
#   contamination: proportion of the training rows treated as anomalies when
#                  the decision threshold is set.
#   random_state:  IsolationForest builds randomized trees; a fixed seed makes
#                  the fitted model (and the saved artifact) reproducible.
model = IsolationForest(contamination=0.05, random_state=42)

# Fit the model
model.fit(df_scaled)

# Apply the trained model to the data.
# decision_function returns one score per row; negative scores are outliers.
scores = model.decision_function(df_scaled)




### Save the model and the scaler for future use

In [58]:
import joblib

# Persist the fitted model and the fitted scaler to disk with joblib
model_path = 'model.joblib'
scaler_path = 'scaler.joblib'
joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)


['scaler.joblib']