# Step 1: getting and transforming the data
### Reading the data

In [None]:
import pandas as pd

# Load the raw sensor readings, parsing the 'timestamp' column as datetimes
df = pd.read_csv('sensor.csv', parse_dates=['timestamp'])


### Split the data into training, validation, and testing sets

In [None]:

# Chronological split: rows before the first cut-off are used for training,
# rows between the two cut-offs for validation, and the rest for testing.
valid_start, test_start = '2018-07-01', '2018-08-01'
ts = df['timestamp']

df_train = df.loc[ts < valid_start]
df_valid = df.loc[(ts >= valid_start) & (ts < test_start)]
df_test = df.loc[ts >= test_start]


### Write the data

In [None]:
# Persist each split to its own CSV file, leaving the row index out
for split, path in (
    (df_train, 'train.csv'),
    (df_valid, 'valid.csv'),
    (df_test, 'test.csv'),
):
    split.to_csv(path, index=False)

------------

-----

# Step 2: create the model and the drawer

### Given this is anomaly detection on time-series data, one possible method is to use an Isolation Forest. This is an unsupervised learning algorithm that works well for anomaly detection.

### Loading the training part

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the training split written in step 1
raw_train = pd.read_csv('train.csv', parse_dates=['timestamp'])

# Keep only the sensor readings: the row counter, the timestamp and the
# machine status label are not used as model features
df_train = raw_train.drop(columns=['Unnamed: 0', 'timestamp', 'machine_status'])

# Replace missing readings with the mean of their column
df_train = df_train.fillna(df_train.mean())

### Data transformations

In [None]:
# Show the variance of every remaining column
print(df_train.var())

In [None]:
# Show how many missing values each column still contains
print(df_train.isna().sum())


In [None]:
# Remove the 'sensor_15' column from the feature set
df_train = df_train.drop(columns=['sensor_15'])

# Impute whatever is still missing with the per-column mean
column_means = df_train.mean()
df_train = df_train.fillna(column_means)

# Confirm that no missing values are left
remaining_missing = df_train.isna().sum()
print(remaining_missing)

### Train the model

In [None]:
# Standardise every feature and keep the column names on the scaled frame
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_train), columns=df_train.columns)

# Isolation Forest with an assumed anomaly share of 5% in the training data.
# A fixed random_state makes the fitted (and later saved) model reproducible
# from one notebook run to the next.
model = IsolationForest(contamination=0.05, random_state=42)

# Fit the model
model.fit(df_scaled)

# Anomaly score of every training row (the lower, the more abnormal)
scores = model.decision_function(df_scaled)


### Save the model and the scaler for future use

In [None]:
import joblib

# Persist the fitted model and scaler so the file watcher can load them later
for artifact, path in ((model, 'model.joblib'), (scaler, 'scaler.joblib')):
    joblib.dump(artifact, path)


------

-----

In [None]:
import pandas as pd
import joblib
import os
import logging
import json
import time
import matplotlib.pyplot as plt
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from sklearn.preprocessing import StandardScaler


class DataHandler(FileSystemEventHandler):
    '''Scores new CSV files as they appear in the input directory.

    For every file created in the watched directory the handler reads it as
    CSV, prepares the features with the saved scaler, runs the saved model's
    ``predict`` on them, writes the scaled features plus a ``prediction``
    column to the output directory and saves one plot per configured sensor.

    The ``config`` mapping must provide the keys ``model_path``,
    ``scaler_path``, ``log_path``, ``output_directory``, ``image_directory``
    and ``sensors_to_draw``.
    '''

    # Columns removed from the training data before the scaler and the model
    # were fitted; incoming files must lose them too or the scaler rejects
    # the frame because the features do not match.
    NON_FEATURE_COLUMNS = ('Unnamed: 0', 'timestamp', 'machine_status', 'sensor_15')

    def __init__(self, config):
        '''Initializes the DataHandler with the given configuration.

        Loads the model and the scaler from the configured paths and sets up
        logging.

        Args:
            config: Mapping with the settings listed in the class docstring.
        '''
        super().__init__()
        self.config = config
        self.model = joblib.load(config['model_path'])
        self.scaler = joblib.load(config['scaler_path'])
        self.setup_logging()

    def setup_logging(self):
        '''Sets up logging to write to the log file.'''
        logging.basicConfig(filename=self.config['log_path'], level=logging.INFO)
        logging.info('Application started.')

    def on_created(self, event):
        '''Called when a new file is created in the input directory.

        Errors raised while processing a file are logged with their traceback
        and swallowed so that one bad file does not stop the watcher.

        Args:
            event: The watchdog event describing the created path.
        '''
        # A new sub-directory is not a data file; there is nothing to score.
        if event.is_directory:
            return

        filename = event.src_path
        logging.info(f'New file detected: {filename}')

        # NOTE(review): the event may fire before the writer has finished the
        # file; a partially written CSV would end up in the error log below.
        try:
            data = pd.read_csv(filename)
            transformed_data = self.transform_data(data)
            predictions = self.model.predict(transformed_data)

            enriched_predictions = transformed_data.copy()
            enriched_predictions['prediction'] = predictions

            output_path = os.path.join(
                self.config['output_directory'], os.path.basename(filename)
            )
            # index=False matches how the other CSV files of this project are
            # written and avoids a meaningless unnamed index column.
            enriched_predictions.to_csv(output_path, index=False)

            for sensor in self.config['sensors_to_draw']:
                self.plot_sensor(enriched_predictions, sensor)

            logging.info('Processing complete.')
        except Exception as e:
            # Top-level boundary of the watcher thread: keep running, but
            # record the full traceback instead of only the message.
            logging.exception(f'Error processing file: {e}')

    def transform_data(self, data):
        '''Transforms the data into a format that can be used by the model.

        Drops the non-feature columns, aligns the remaining columns with the
        features the scaler was fitted on (when the scaler recorded them),
        fills missing values with the column mean of the incoming data and
        applies the scaler.

        Args:
            data: DataFrame read from a new input file.

        Returns:
            A DataFrame of scaled features with the feature column names.

        Raises:
            ValueError: If the input lacks a feature the scaler was fitted on.
        '''
        features = data.drop(columns=list(self.NON_FEATURE_COLUMNS), errors='ignore')

        # Scalers fitted on a DataFrame remember the feature names; use them
        # to enforce the same columns in the same order as during training.
        expected = getattr(self.scaler, 'feature_names_in_', None)
        if expected is not None:
            expected = list(expected)
            missing = [name for name in expected if name not in features.columns]
            if missing:
                raise ValueError(f'Input is missing expected feature columns: {missing}')
            features = features[expected]

        features = features.fillna(features.mean(numeric_only=True))
        scaled = self.scaler.transform(features)
        return pd.DataFrame(scaled, columns=features.columns)

    def plot_sensor(self, data, sensor):
        '''Plots the given sensor data and saves it to a file.

        The image is written to ``<image_directory>/<sensor>.png``.

        Args:
            data: DataFrame that contains a column named ``sensor``.
            sensor: Name of the column to plot.
        '''
        fig, ax = plt.subplots()
        try:
            data[sensor].plot(ax=ax)
            fig.savefig(os.path.join(self.config['image_directory'], f'{sensor}.png'))
        finally:
            # Figures stay registered with pyplot until closed; without this
            # a long-running watcher accumulates one open figure per plot.
            plt.close(fig)


def load_config(config_file):
    '''Loads the configuration from the given JSON file.

    Args:
        config_file: Path of the JSON configuration file.

    Returns:
        The parsed JSON content.
    '''
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(config_file, encoding='utf-8') as f:
        return json.load(f)


def main(config_file='config.json'):
    '''Watches the configured input directory until interrupted.

    Loads the configuration, creates the DataHandler and keeps a watchdog
    observer running on ``config['input_directory']`` (non-recursively) until
    the process receives a keyboard interrupt.

    Args:
        config_file: Path of the JSON configuration file.
    '''
    config = load_config(config_file)
    event_handler = DataHandler(config)

    observer = Observer()
    observer.schedule(event_handler, config['input_directory'], recursive=False)
    observer.start()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to end the watcher; shut down below.
        pass
    finally:
        # Stop and wait for the observer thread on every exit path, not only
        # after a keyboard interrupt.
        observer.stop()
        observer.join()


if __name__ == "__main__":
    main()
