In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import joblib 
import os
import logging
import json
import time
import matplotlib.pyplot as plt
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler  

In [None]:
# Read the raw sensor log; the 'timestamp' column is parsed into datetimes on load
df = pd.read_csv('sensor.csv', parse_dates=['timestamp'])

# Chronological split on 'timestamp':
#   train -> strictly before 2018-07-01
#   valid -> from 2018-07-01 (inclusive) up to 2018-08-01 (exclusive)
#   test  -> from 2018-08-01 (inclusive) onwards
stamps = df['timestamp']
in_train = stamps < '2018-07-01'
in_valid = (stamps >= '2018-07-01') & (stamps < '2018-08-01')
in_test = stamps >= '2018-08-01'
train_df, valid_df, test_df = df[in_train], df[in_valid], df[in_test]

In [None]:
# Persist each split to its own CSV file; the row index is not written
for csv_name, split_frame in (('train.csv', train_df),
                              ('valid.csv', valid_df),
                              ('test.csv', test_df)):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Inspect the training data: read 'train.csv' back from disk (parsing
# 'timestamp' as datetimes) and display the resulting frame as the cell output.
df_train = pd.read_csv('train.csv', parse_dates=['timestamp'])
df_train

In [None]:
# Optional diagnostics (disabled): per-column NaN counts and per-column variance.
# print(df_train.isna().sum())
# print(df_train.var())

In [None]:
# Percentage of missing values per column in the TRAINING split, largest first.
# The denominator must be the row count of df_train itself; dividing by the
# length of the full data set (df) would understate every percentage.
percentage_missing = df_train.isnull().sum().sort_values(ascending=False) / len(df_train) * 100
percentage_missing.head()  # show 5 largest missing %

In [None]:
# Load the test split and compute the percentage of missing values per column,
# largest first.
df_test = pd.read_csv('test.csv', parse_dates=['timestamp'])
# Use the test split's own row count as the denominator; dividing by the
# length of the full data set (df) would understate every percentage.
percentage_missing = df_test.isnull().sum().sort_values(ascending=False) / len(df_test) * 100
percentage_missing.head()  # show 5 largest missing %

In [None]:
# Remove the columns that are not used as model features: the CSV row counter
# ('Unnamed: 0'), the timestamp, the 'machine_status' label and 'sensor_15'
# (presumably dropped because it carries no usable readings -- confirm against
# the missing-value report).
# A new frame is bound instead of mutating in place, and errors='ignore' skips
# columns that are already gone, so re-running this cell no longer raises
# KeyError. This also matches the clean-up done in FileProcessor.process_file.
df_train = df_train.drop(
    columns=['Unnamed: 0', 'timestamp', 'machine_status', 'sensor_15'],
    errors='ignore',
)

# Fill the remaining NaN values with the mean of their own column
df_train = df_train.fillna(df_train.mean())

In [None]:
# Display the training frame as the cell output
df_train

In [None]:
# Feature scaling: fit a StandardScaler on the training features and keep the
# scaled values in a DataFrame with the original column names
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_train), columns=df_train.columns)

# Isolation Forest with contamination=0.05.
# random_state is pinned because the forest is built from random sub-samples;
# without a seed every run would fit (and save) a different model.
model = IsolationForest(contamination=0.05, random_state=42)

# Fit the model on the scaled training data
model.fit(df_scaled)

# Anomaly scores of the training data under the fitted model
scores = model.decision_function(df_scaled)

# Persist the fitted model and scaler so FileProcessor can load them later
joblib.dump(model, 'model.joblib')
joblib.dump(scaler, 'scaler.joblib')

In [None]:
class FileProcessor:
    """Poll a directory for sensor CSV files and score each one with a saved model.

    The JSON configuration file must provide the keys ``input_directory``,
    ``output_directory``, ``image_directory``, ``model_path``, ``scaler_path``,
    ``sensors_to_draw`` and ``check_interval``.
    """

    def __init__(self, config_path):
        """Read the configuration, load model and scaler, and configure logging.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            OSError: If the configuration file cannot be opened.
            ValueError: If the configuration file is not valid JSON.
            KeyError: If a required configuration key is missing.
        """
        #Load configuration
        with open(config_path, 'r') as file:
            self.config = json.load(file)

        self.input_directory = self.config['input_directory']
        self.output_directory = self.config['output_directory']
        self.image_directory = self.config['image_directory']
        self.model_path = self.config['model_path']
        self.scaler_path = self.config['scaler_path']
        self.sensors_to_draw = self.config['sensors_to_draw']
        self.check_interval = self.config['check_interval']

        #Load model and scaler
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code -- only point the configuration at trusted files.
        self.model = joblib.load(self.model_path)
        self.scaler = joblib.load(self.scaler_path)

        #set up logging
        logging.basicConfig(filename='file_processor.log', level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

    def process_files(self):
        """Poll the input directory forever and process every regular file in it.

        Each cycle lists the input directory, hands every regular file to
        :meth:`process_file`, then sleeps for ``check_interval`` seconds.
        Errors raised while listing or processing are logged and the loop
        continues; the method never returns normally.

        Raises:
            Exception: Whatever ``time.sleep`` raises for an invalid
                ``check_interval`` (for example a negative value).
        """
        while True:
            try:
                files = [f for f in os.listdir(self.input_directory)
                         if os.path.isfile(os.path.join(self.input_directory, f))]
                for file in files:
                    try:
                        logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Found new data file: {file}")
                        self.process_file(file)
                    except Exception as e:
                        logging.error(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Error processing file {file}: {str(e)}")
            except Exception as e:
                logging.error(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Error processing files: {str(e)}")

            # Sleep on EVERY cycle, including failed ones. With the sleep inside
            # the try block, a failing os.listdir (e.g. missing directory)
            # skipped it and the loop spun at full speed, flooding the log.
            time.sleep(self.check_interval)

    def process_file(self, file):
        """Clean, score, export and plot one input file, then delete it.

        Steps: read the CSV from the input directory with ``timestamp`` as the
        index, drop the non-feature columns, fill NaNs with the column means of
        this file, scale with the loaded scaler, predict with the loaded model,
        write the frame plus an ``anomaly`` column to the output directory,
        save one plot per configured sensor, and remove the input file.

        All exceptions are caught and logged. On failure the input file is left
        in place, so the next polling cycle will pick it up again.

        Args:
            file: Name of the file, relative to ``input_directory``.
        """
        try:
            #Load data
            file_path = os.path.join(self.input_directory, file)
            df = pd.read_csv(file_path, parse_dates=['timestamp'])
            df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert 'timestamp' to datetime
            df = df.set_index('timestamp')  # Set 'timestamp' as index
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Loaded the file: {file}")

            #data cleaning and preprocessing
            # Columns that are absent from this file are skipped silently.
            df = df.drop(columns=['Unnamed: 0', 'machine_status', 'sensor_15'], errors='ignore')
            df = df.fillna(df.mean())

            df_scaled = pd.DataFrame(self.scaler.transform(df), columns=df.columns)
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Received transformed data")

            #Predictions
            predictions = self.model.predict(df_scaled)
            df['anomaly'] = predictions
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Received predictions")

            #Save predictions
            # An empty directory setting means "current directory"; os.makedirs('')
            # would raise, so only create a directory that is actually named.
            if self.output_directory:
                os.makedirs(self.output_directory, exist_ok=True)
            output_file_path = os.path.join(self.output_directory, file)
            # 'timestamp' is the index at this point, so the index must be
            # written; index=False silently dropped the timestamps and left the
            # predictions impossible to line up with the readings.
            df.to_csv(output_file_path, index=True)
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Saving predictions to {output_file_path}")

            #save images
            self._save_sensor_plots(df, file)

            #remove the original file
            os.remove(file_path)
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Removed original file {file}")
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Resuming listening")
        except Exception as e:
            logging.error(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Error processing file {file}: {str(e)}")

    def _save_sensor_plots(self, df, file):
        """Save one PNG line plot per configured sensor that is a column of ``df``.

        Args:
            df: Frame indexed by timestamp; sensors not among its columns are skipped.
            file: Input file name, used as the prefix of each image name.
        """
        if self.image_directory:
            os.makedirs(self.image_directory, exist_ok=True)

        for sensor in self.sensors_to_draw:
            if sensor not in df.columns:
                continue
            fig = plt.figure()
            try:
                plt.plot(df.index.to_numpy(), df[sensor].to_numpy())  #Convert DataFrame columns to numpy arrays
                plt.title(f"{sensor} over time")
                image_path = os.path.join(self.image_directory, f"{file}-{sensor}.png")
                plt.savefig(image_path)
            finally:
                # Close even when plotting/saving fails: this runs in an endless
                # polling loop, so leaked figures would accumulate in memory.
                plt.close(fig)
            logging.info(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  Saving image {image_path}")


#Example usage: build a processor from the 'application.json' configuration
# and start polling. process_files() runs an unconditional `while True` loop,
# so this call blocks until it is interrupted.
if __name__ == "__main__":
    processor = FileProcessor('application.json')
    processor.process_files()
