# Installation

In [1]:
pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade tensorflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip install keras


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install tensorflow h5py

Note: you may need to restart the kernel to use updated packages.


# Imports

In [8]:
import pandas as pd
import numpy as np
import boto3
from io import StringIO
from io import BytesIO

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import Model
from keras.layers import Conv1D, LSTM, Dense, Flatten, Dropout, BatchNormalization, Input, Reshape
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam

2025-03-21 06:02:33.360374: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 06:02:33.364531: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 06:02:33.376640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742536953.396940   12946 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742536953.403475   12946 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742536953.419172   12946 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

# Reading the cleaned data from S3

In [9]:
s3 = boto3.client('s3')
bucket_name = "459-team4-project"
object_key = "clean-data/dataset_merged.csv"

obj = s3.get_object(Bucket=bucket_name, Key=object_key)
df = pd.read_csv(obj['Body'])

print(df.head())

       LCLid         day  energy_median  energy_mean  energy_max  \
0  MAC004707  2013-01-01         0.1775     0.282438       1.195   
1  MAC004707  2013-01-02         0.2310     0.316854       1.224   
2  MAC004707  2013-01-03         0.2190     0.284896       0.914   
3  MAC004707  2013-01-04         0.3255     0.339187       0.673   
4  MAC004707  2013-01-05         0.2940     0.298854       0.660   

   energy_count  energy_std  energy_sum  energy_min stdorToU    Acorn  \
0            48    0.211532      13.557       0.113      Std  ACORN-E   
1            48    0.220660      15.209       0.113      Std  ACORN-E   
2            48    0.163766      13.675       0.119      Std  ACORN-E   
3            48    0.184414      16.281       0.100      Std  ACORN-E   
4            48    0.165935      14.345       0.096      Std  ACORN-E   

  Acorn_grouped      file  
0      Affluent  block_22  
1      Affluent  block_22  
2      Affluent  block_22  
3      Affluent  block_22  
4      Afflu

In [10]:
# Check for null values
df.isnull().sum()

LCLid            0
day              0
energy_median    0
energy_mean      0
energy_max       0
energy_count     0
energy_std       0
energy_sum       0
energy_min       0
stdorToU         0
Acorn            0
Acorn_grouped    0
file             0
dtype: int64

In [11]:
# Check for total rows count
# About 23k plus extra rows were removed as compared to the one on Google Drive
df.count()

LCLid            1915031
day              1915031
energy_median    1915031
energy_mean      1915031
energy_max       1915031
energy_count     1915031
energy_std       1915031
energy_sum       1915031
energy_min       1915031
stdorToU         1915031
Acorn            1915031
Acorn_grouped    1915031
file             1915031
dtype: int64

In [12]:
# Checking if a "LCLid" exist
df[df['LCLid'] == 'mac004553']

Unnamed: 0,LCLid,day,energy_median,energy_mean,energy_max,energy_count,energy_std,energy_sum,energy_min,stdorToU,Acorn,Acorn_grouped,file


# Using EDA (3Q + 3 IQR) to remove extreme outliers (as it affects model training and k means clustering)

In [13]:
# List of the unique acorn values
unique_acorn_values = [
    "ACORN-",
    "ACORN-A",
    "ACORN-B",
    "ACORN-C",
    "ACORN-D",
    "ACORN-E",
    "ACORN-F",
    "ACORN-G",
    "ACORN-H",
    "ACORN-I",
    "ACORN-J",
    "ACORN-K",
    "ACORN-L",
    "ACORN-M",
    "ACORN-N",
    "ACORN-O",
    "ACORN-P",
    "ACORN-Q",
    "ACORN-U"
]

# Dictionary to store outliers for each acorn group
outliers_dict = {}

for acorn in unique_acorn_values:
    # Filter df for the current acorn group
    acorn_data = df[df['Acorn'] == acorn]

    # Compute the average energy_sum per LCLid
    acorn_avg_energy = acorn_data.groupby('LCLid')['energy_sum'].mean()

    # Compute Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = np.percentile(acorn_avg_energy, 25)
    Q3 = np.percentile(acorn_avg_energy, 75)

    # Compute IQR (Interquartile Range)
    IQR = Q3 - Q1

    # Compute upper bound for outliers (Q3 + 3 * IQR)
    upper_bound = Q3 + 3 * IQR

    # Identify outlier LCLid values
    outliers = acorn_avg_energy[acorn_avg_energy > upper_bound].index.tolist()

    # Store results
    outliers_dict[acorn] = outliers

# Print outlier LCLid for each acorn group
for acorn, outliers in outliers_dict.items():
    print(f"acorn Group {acorn} - Outlier LCLid(s): {outliers}")

acorn Group ACORN- - Outlier LCLid(s): []
acorn Group ACORN-A - Outlier LCLid(s): ['MAC004319']
acorn Group ACORN-B - Outlier LCLid(s): []
acorn Group ACORN-C - Outlier LCLid(s): []
acorn Group ACORN-D - Outlier LCLid(s): []
acorn Group ACORN-E - Outlier LCLid(s): []
acorn Group ACORN-F - Outlier LCLid(s): []
acorn Group ACORN-G - Outlier LCLid(s): []
acorn Group ACORN-H - Outlier LCLid(s): []
acorn Group ACORN-I - Outlier LCLid(s): []
acorn Group ACORN-J - Outlier LCLid(s): []
acorn Group ACORN-K - Outlier LCLid(s): []
acorn Group ACORN-L - Outlier LCLid(s): []
acorn Group ACORN-M - Outlier LCLid(s): []
acorn Group ACORN-N - Outlier LCLid(s): []
acorn Group ACORN-O - Outlier LCLid(s): []
acorn Group ACORN-P - Outlier LCLid(s): ['MAC005340', 'MAC005402']
acorn Group ACORN-Q - Outlier LCLid(s): ['MAC003594', 'MAC004395']
acorn Group ACORN-U - Outlier LCLid(s): []


In [14]:
extreme_outliers = []

for acorn, outliers in outliers_dict.items():
    if len(outliers) >= 1:
        for extreme_outlier_LCLid in outliers:
            extreme_outliers.append(extreme_outlier_LCLid)

print(extreme_outliers)

['MAC004319', 'MAC005340', 'MAC005402', 'MAC003594', 'MAC004395']


In [15]:
# Removing extreme outliers to form Dataset A
dataset_A = df[~df["LCLid"].isin(extreme_outliers)]

# K-means clustering with Dataset A, output is k_means_outliers

In [16]:
# Preprocessing: Normalize the data (assuming the columns you want to use for clustering)
features = ['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']
scaler = StandardScaler()
dataset_A[features] = scaler.fit_transform(dataset_A[features])

# Define function to cluster data per LCLid
def cluster_per_LCLid(data):
    LCLid_clusters = {}
    k_means_outliers = []
    LCLid_outlier_percentages = {}

    # Loop through each unique LCLid
    for LCLid in data['LCLid'].unique():
        # Create a copy of the subset for each LCLid to avoid modification on a view
        LCLid_data = data[data['LCLid'] == LCLid].copy()

        # Apply KMeans clustering for each LCLid
        kmeans = KMeans(n_clusters=2)  # Cluster of 2 is chosen - 1 for winter months, and another for non-winter months
        LCLid_data['cluster'] = kmeans.fit_predict(LCLid_data[features])

        # Calculate the distance of each point to its assigned cluster center
        distances = kmeans.transform(LCLid_data[features])
        min_distances = np.min(distances, axis=1)

        # Set a threshold for identifying outliers (e.g., points that are farther than 1.5 times the average distance)
        threshold = 1.5 * np.mean(min_distances)

        # Flag points that are far from the centroids (outliers)
        outliers = LCLid_data[min_distances > threshold]

        # Calculate the percentage of outliers
        outlier_percentage = len(outliers) / len(LCLid_data) * 100

        # Record the percentage of outliers for the current LCLid
        LCLid_outlier_percentages[LCLid] = outlier_percentage

        # If more than 20% of the points are outliers, flag as fraud
        if outlier_percentage > 20:
            k_means_outliers.append(LCLid)

    return k_means_outliers, LCLid_outlier_percentages

# Apply the function
k_means_outliers, LCLid_outlier_percentages = cluster_per_LCLid(dataset_A)

# Print the results
print("Fraudulent LCLids:", k_means_outliers)
print("Percentage of Outliers for each LCLid:")
for LCLid, outlier_percentage in LCLid_outlier_percentages.items():
    print(f"LCLid {LCLid}: {outlier_percentage:.2f}% outliers")

Fraudulent LCLids: ['MAC000874', 'MAC002304', 'MAC003648', 'MAC004730', 'MAC004852', 'MAC002963', 'MAC004317', 'MAC000956', 'MAC003629', 'MAC001269', 'MAC000621', 'MAC003764', 'MAC005092', 'MAC005162', 'MAC004811', 'MAC005279', 'MAC001176', 'MAC004064', 'MAC005180', 'MAC001340', 'MAC000019', 'MAC004846', 'MAC004751', 'MAC002710', 'MAC004595', 'MAC004433', 'MAC000036', 'MAC000960', 'MAC001039', 'MAC003994', 'MAC004552', 'MAC002881', 'MAC000607', 'MAC005008', 'MAC005386', 'MAC003789', 'MAC004102', 'MAC005250', 'MAC001495', 'MAC004634', 'MAC002863', 'MAC000664', 'MAC000891', 'MAC000916', 'MAC004803', 'MAC000657', 'MAC000153', 'MAC002765', 'MAC002624', 'MAC004983', 'MAC003005', 'MAC002324', 'MAC002146', 'MAC002382', 'MAC001512', 'MAC004646', 'MAC000729', 'MAC003054', 'MAC003919', 'MAC005055', 'MAC003791', 'MAC002843', 'MAC002925', 'MAC002651', 'MAC002642', 'MAC002972', 'MAC005442', 'MAC005214', 'MAC004506', 'MAC002611', 'MAC001499', 'MAC000501', 'MAC003884', 'MAC003006', 'MAC004191', 'MAC0

# DBSCAN clustering with Dataset A, output is dbscan_outliers

In [17]:
# Preprocessing: Drop rows with NaN values in the specified features
features = ['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']

# Aggregate data by LCLid (mean/median of daily usage data for each LCLid)
df_agg = dataset_A.groupby('LCLid')[features].agg(['mean', 'std', 'max', 'min']).reset_index()

# Flatten multi-level column names
df_agg.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_agg.columns]

# Standardize AFTER aggregation
scaler = StandardScaler()
df_agg.iloc[:, 1:] = scaler.fit_transform(df_agg.iloc[:, 1:])  # Exclude 'LCLid'

# Apply DBSCAN clustering across all LCLids
dbscan = DBSCAN(eps=7, min_samples=5)  # Tune eps and min_samples
df_agg['cluster'] = dbscan.fit_predict(df_agg.iloc[:, 1:])  # Exclude 'LCLid'

# Identify outliers (those labeled as -1 by DBSCAN)
dbscan_fraud_LCLid = df_agg[df_agg['cluster'] == -1]

# List of fraudulent LCLids, extract from dbscan_fraud_LCLid and store as python list
dbscan_outliers = dbscan_fraud_LCLid['LCLid_'].tolist()

print("Fraudulent LCLids:", dbscan_outliers)


Fraudulent LCLids: ['MAC000134', 'MAC000252', 'MAC000569', 'MAC000730', 'MAC001836', 'MAC002613', 'MAC002628', 'MAC002755', 'MAC002952', 'MAC003298', 'MAC003353', 'MAC003394', 'MAC003428', 'MAC003449', 'MAC004087', 'MAC004330', 'MAC004639', 'MAC004716', 'MAC004778', 'MAC005009', 'MAC005041', 'MAC005308', 'MAC005318', 'MAC005361']


# Step 7: Isolation forest with Dataset A, output is isolation_forest_outliers

In [18]:
# Select relevant features
features = ['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']

# Ensure dataset_A contains both features and LCLid_ before splitting
df_combined = dataset_A.copy()  # Make a copy to avoid modifying the original DataFrame

# Split into train (80%) and test (20%)
train_data, test_data = train_test_split(df_combined, test_size=0.2, random_state=42, shuffle=True)

# Train Isolation Forest model
model = IsolationForest(contamination=0.05, random_state=42)
train_data['Anomaly'] = model.fit_predict(train_data[features])

# Predict anomalies on test data
test_data['Anomaly'] = model.predict(test_data[features])

# Mark anomalies (-1 indicates anomaly, 1 indicates normal)
train_data['Anomaly'] = train_data['Anomaly'].map({1: 'Normal', -1: 'Anomaly'})
test_data['Anomaly'] = test_data['Anomaly'].map({1: 'Normal', -1: 'Anomaly'})

# Set the threshold for anomaly percentage 30% (any LCLid with 30% or more rows that is flag as anomaly will be flag as fraudulent)
threshold = 0.30

# Count total rows per LCLid and anomalies per LCLid
anomaly_counts = test_data.groupby('LCLid').agg(
    total_rows=('LCLid', 'size'),
    anomalies=('Anomaly', lambda x: (x == 'Anomaly').sum())
)

# Calculate the percentage of anomalies for each LCLid
anomaly_counts['anomaly_percentage'] = anomaly_counts['anomalies'] / anomaly_counts['total_rows']

# Flag LCLids as fraudulent if the percentage of anomalies exceeds the threshold
isolation_forest_outliers = anomaly_counts[anomaly_counts['anomaly_percentage'] >= threshold].index.tolist()

# Display fraudulent LCLids
print(len(isolation_forest_outliers))

354


# Dataset B - Removing k_means_outliers, dbscan_outliers and  isolation_forest_outliers identified in earlier steps from Dataset A

In [19]:
# Combine all outliers into a single set for fast lookup
outliers_set = set(k_means_outliers + dbscan_outliers + isolation_forest_outliers)

# Drop rows from dataset_A where 'LCLid' is in the outliers set
dataset_B = dataset_A[~dataset_A['LCLid'].isin(outliers_set)]

# Display the filtered dataset
dataset_B.count()

LCLid            1699611
day              1699611
energy_median    1699611
energy_mean      1699611
energy_max       1699611
energy_count     1699611
energy_std       1699611
energy_sum       1699611
energy_min       1699611
stdorToU         1699611
Acorn            1699611
Acorn_grouped    1699611
file             1699611
dtype: int64

# Load trained Auto-Encoder model and test on dataset_A, output is autoencoder_outliers

In [20]:
# Function to load and preprocess data
def load_and_preprocess_data(df):
    # Normalize features
    scaler = MinMaxScaler()
    df[['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']] = \
        scaler.fit_transform(df[['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']])
   
    return df, scaler

# Function to create sequences for training
def create_sequences(df, sequence_length=30):
    sequences, lcl_ids = [], []
    grouped = df.groupby('LCLid')
    
    for lcl_id, group in grouped:
        group = group.sort_values('day')
        data = group[['energy_median', 'energy_mean', 'energy_max', 'energy_count', 'energy_std', 'energy_sum', 'energy_min']].values
        
        for i in range(len(data) - sequence_length):
            sequences.append(data[i:i + sequence_length])
            lcl_ids.append(lcl_id)
    
    return np.array(sequences), np.array(lcl_ids)

In [21]:
# Load and preprocess dataset_B (training data)
df_train, scaler_train = load_and_preprocess_data(dataset_B)
X_train, lcl_train = create_sequences(df_train)

# Split dataset_B into training (80%) and validation (20%)
X_train, X_val, lcl_train, lcl_val = train_test_split(X_train, lcl_train, test_size=0.2, random_state=42, shuffle=False)

In [22]:
# Define S3 bucket and file path
s3_bucket = "459-team4-project"
s3_model_path = "models/autoencoder_model.h5"
local_model_path = "/tmp/autoencoder_model.h5"

# Download model from S3
s3 = boto3.client("s3")
s3.download_file(s3_bucket, s3_model_path, local_model_path)

# Load the model
autoencoder = tf.keras.models.load_model(local_model_path)

# Print model summary to verify loading
autoencoder.summary()


2025-03-21 06:19:04.095246: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [23]:
# Load and preprocess dataset_A (test data)
df_test, scaler_test = load_and_preprocess_data(dataset_A)
X_test, lcl_test = create_sequences(df_test)

# Compute reconstruction error (MSE) on test data
reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructed, 2), axis=(1, 2))  # Compute MSE per sequence

# Reconstruct training data to calculate the threshold
train_reconstructed = autoencoder.predict(X_train)
train_mse = np.mean(np.power(X_train - train_reconstructed, 2), axis=(1, 2))

# Set the fraud detection threshold (99th percentile of training MSE)
threshold = np.percentile(train_mse, 99)

# Predict and detect fraudulent LCLids
test_reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - test_reconstructed, 2), axis=(1, 2))

# Identify fraudulent LCLids
fraud_mask = mse > threshold
autoencoder_outliers = set(lcl_test[fraud_mask])

print(f"Detected fraudulent LCLids: {autoencoder_outliers}")

2025-03-21 06:19:15.640523: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1469891640 exceeds 10% of free system memory.


[1m54684/54684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 6ms/step
[1m   17/38881[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:19[0m 7ms/step

2025-03-21 06:25:48.940739: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1045112880 exceeds 10% of free system memory.


[1m38881/38881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 6ms/step
[1m   18/54684[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:28[0m 6ms/step

2025-03-21 06:30:28.868297: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1469891640 exceeds 10% of free system memory.


[1m54684/54684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m344s[0m 6ms/step
Detected fraudulent LCLids: {'MAC005311', 'MAC001771', 'MAC002099', 'MAC001791', 'MAC002367', 'MAC004586', 'MAC003245', 'MAC001522', 'MAC002137', 'MAC001846', 'MAC000834', 'MAC003568', 'MAC001822', 'MAC004007', 'MAC003018', 'MAC004897', 'MAC003299', 'MAC001516', 'MAC005123', 'MAC002644', 'MAC005162', 'MAC002641', 'MAC001518', 'MAC004271', 'MAC003881', 'MAC005377', 'MAC001433', 'MAC004720', 'MAC001429', 'MAC002231', 'MAC004558', 'MAC004553', 'MAC003182', 'MAC003992', 'MAC002324', 'MAC002260', 'MAC000800', 'MAC000643', 'MAC003449', 'MAC004976', 'MAC000535', 'MAC005199', 'MAC005386', 'MAC003422', 'MAC003877', 'MAC000916', 'MAC004010', 'MAC002215', 'MAC001469', 'MAC004298', 'MAC004112', 'MAC002688', 'MAC001641', 'MAC004239', 'MAC005319', 'MAC002824', 'MAC003419', 'MAC005312', 'MAC001710', 'MAC001202', 'MAC004582', 'MAC000625', 'MAC003322', 'MAC005194', 'MAC004996', 'MAC001011', 'MAC002197', 'MAC001032', 'MAC00

# Combining all the list of outliers

In [24]:
k_means_outliers = set(k_means_outliers)  # Convert list to set to remove duplicates
dbscan_outliers = set(dbscan_outliers)  # Convert list to set to remove duplicates
isolation_forest_outliers = set(isolation_forest_outliers)  # Convert list to set to remove duplicates
autoencoder_outliers = set(autoencoder_outliers)  # Convert list to set to remove duplicates

all_outliers = k_means_outliers | dbscan_outliers | isolation_forest_outliers | autoencoder_outliers

In [25]:
print(len(all_outliers))

871


In [26]:
# Initialize dictionary for storing the results
outlier_dict = {}
unique_lclids_list = df['LCLid'].dropna().unique().tolist()

# Iterate through all LCLids in unique_lclids_list
for LCLid in unique_lclids_list:
    # Create an empty dictionary for this LCLid
    outlier_info = {}

    # Retrieve values from df based on LCLid
    LCLid_info = df[df['LCLid'] == LCLid][['Acorn', 'Acorn_grouped', 'file']].iloc[0]
    outlier_info['Acorn'] = LCLid_info['Acorn']
    outlier_info['Acorn_grouped'] = LCLid_info['Acorn_grouped']
    outlier_info['file'] = LCLid_info['file']

    # Check in which lists the LCLid is present
    flagged_by = []

    if LCLid in k_means_outliers:
        flagged_by.append('k_means')
    if LCLid in dbscan_outliers:
        flagged_by.append('dbscan')
    if LCLid in isolation_forest_outliers:
        flagged_by.append('isolation_forest')
    if LCLid in autoencoder_outliers:
        flagged_by.append('autoencoder')

    # If flagged by any method, add the list of flagging methods
    if flagged_by:
        outlier_info['flagged_by'] = flagged_by
    else:
        outlier_info['flagged_by'] = []

    # Add the LCLid entry to the outlier_dict
    outlier_dict[LCLid] = outlier_info

# Print the dictionary for a sample LCLid
print(outlier_dict.get(list(outlier_dict.keys())[0]))  # Printing details of the first LCLid

{'Acorn': 'ACORN-E', 'Acorn_grouped': 'Affluent', 'file': 'block_22', 'flagged_by': []}


# Exporting the Outliers info as a CSV

In [31]:
import pandas as pd
import boto3
from botocore.exceptions import NoCredentialsError

# Convert outlier_dict to a list of dictionaries for pandas
outlier_list = []

for LCLid, info in outlier_dict.items():
    # Create a dictionary for each LCLid with its values
    LCLid_info = {
        'LCLid': LCLid,
        'acorn': info['Acorn'],
        'acorn_grouped': info['Acorn_grouped'],
        'file': info['file'],
        'flagged_by': ', '.join(info['flagged_by'])  # Join list into a string
    }
    outlier_list.append(LCLid_info)

# Convert the list of dictionaries into a pandas DataFrame
outlier_df = pd.DataFrame(outlier_list)

# Export the DataFrame to a CSV file locally
outlier_df.to_csv('output_q1.csv', index=False)

print("Outlier dictionary exported to 'output_q1.csv'")

# Upload the file to S3
s3_bucket_name = '459-team4-project'
s3_folder_path = 'notebook_output_q1/output_q1.csv'

# Initialize a session using Amazon S3
s3_client = boto3.client('s3')

try:
    # Upload the file
    s3_client.upload_file('output_q1.csv', s3_bucket_name, s3_folder_path)
    print(f"File successfully uploaded to s3://{s3_bucket_name}/{s3_folder_path}")
except NoCredentialsError:
    print("Credentials not available.")
except Exception as e:
    print(f"Error uploading file: {e}")


Outlier dictionary exported to 'output_q1.csv'
File successfully uploaded to s3://459-team4-project/notebook_output_q1/output_q1.csv
