In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
import datetime
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [15]:
import os
import pandas as pd

# set the directory path where the csv files are located
dir_path = './'

# get a list of all csv files in the directory
csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]

# create an empty dictionary to store the dataframes
dfs = {}

# loop through each csv file and read it into a dataframe
for file in csv_files:
    # get the name of the file without the extension
    name = file.split('.')[0]
    # read the csv file into a dataframe
    df = pd.read_csv(os.path.join(dir_path, file))
    # add the dataframe to the dictionary with the file name as the key
    dfs[name] = df

In [16]:
# Print the shape of the dataset
weather_data = dfs['weather_data']
sensor_readings = dfs['sensor_readings']
leak_locations_and_rate = dfs['leak_locations_and_rate']
curr_df = weather_data
print("Shape of the dataset: ", curr_df.shape)

Shape of the dataset:  (86401, 6)


In [17]:
time_classification_map = {}
for index, row in leak_locations_and_rate.iterrows():
    time_classification_map[(row['tStart'], row['tEnd'])] = row["Latitude"], row["Longitude"], row["NumberSourcesLeaking"]

def get_leak_location_and_rate(time):
    for key in time_classification_map.keys():
        if key[0] <= time <= key[1]:
            return time_classification_map[key]
    return None

In [18]:
# convert timestamp to POSIX time format
weather_data['timestamp'] = pd.to_datetime(weather_data['timestamp'])
weather_data['timestamp'] = weather_data['timestamp'].apply(lambda x: x.timestamp())

# change the data type of the timestamp column to int
weather_data['timestamp'] = weather_data['timestamp'].astype(int)

weather_data.set_index('timestamp', inplace=True)

# average all rows with same timestamp
weather_data = weather_data.groupby('timestamp').mean()

  weather_data['timestamp'] = pd.to_datetime(weather_data['timestamp'])


In [19]:
# create a numpy array with the same shape as detection_input
data = np.empty((24*80000, 12), dtype=object)

# loop through each row in sensor_readings dataframe
outer_index = 0
for index, row in sensor_readings.iterrows():
    # loop through each sensor in the row
    for col in sensor_readings.columns:
        if col not in ['Unnamed: 0', 'time']:
            longitude = col.split('_')[1]
            latitude = col.split('_')[2]
            value = row[col]
            sack_of_goodies = get_leak_location_and_rate(row['time'])
            if sack_of_goodies is None:
                continue
            leak_latitude, leak_longitude, number_sources_leaking = sack_of_goodies 

            #find weather data based on the minute of the timestamp
            weather_data_row = weather_data.loc[int(row['time'] // 60 * 60)]
            
            #unpack weather data
            temperature = weather_data_row['Temperature']
            humidity = weather_data_row['Humidity']
            pressure = weather_data_row['Barometric_Pressure']
            wind_direction = weather_data_row['Wind_Direction']
            wind_speed = weather_data_row['Wind_Speed']
    
            data[outer_index] = [row['time'], latitude, longitude, value, temperature, humidity, pressure, wind_direction, wind_speed, leak_latitude, leak_longitude, number_sources_leaking]
            outer_index += 1
            
    if index % 10000 == 0:
        print(index)

#convert to dataframe
data = pd.DataFrame(data, columns=['time', 'latitude', 'longitude', 'value', 'temperature', 'humidity', 'pressure', 'wind_direction', 'wind_speed', 'leak_latitude', 'leak_longitude', 'number_sources_leaking'])

#drop rows with NaN values
data = data.dropna()

#convert to numpy array
data = data.to_numpy()

#normalize latitude and longitude
# max_latitude = np.max(data[:, 9] + data[:, 1].astype(float))
# min_latitude = np.min(data[:, 9] + data[:, 1].astype(float))
# max_longitude = np.max(data[:, 10] + data[:, 2].astype(float))
# min_longitude = np.min(data[:, 10] + data[:, 2].astype(float))

# data[:, 9] = (data[:, 9] - min_latitude) / (max_latitude - min_latitude)
# data[:, 10] = (data[:, 10] - min_longitude) / (max_longitude - min_longitude)
# data[:,1] = (data[:,1].astype(float) - min_latitude) / (max_latitude - min_latitude)
# data[:,2] = (data[:,2].astype(float) - min_longitude) / (max_longitude - min_longitude)

#convert to dataframe
location_input = pd.DataFrame(data, columns=['time', 'latitude', 'longitude', 'value', 'temperature', 'humidity', 'pressure', 'wind_direction', 'wind_speed', 'leak_latitude', 'leak_longitude', 'number_sources_leaking'])

0
10000
20000
30000
40000
50000
60000


In [None]:
location_input #just location data but from 

Unnamed: 0,time,latitude,longitude,value,temperature,humidity,pressure,wind_direction,wind_speed,leak_latitude,leak_longitude,number_sources_leaking
0,1681776035.0,-105.14055,40.595561,1054.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
1,1681776035.0,-105.140583,40.596108,1110.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
2,1681776035.0,-105.140069,40.595556,1242.5,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
3,1681776035.0,-105.140075,40.596114,973.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
4,1681776035.0,-105.140583,40.596108,1013.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
...,...,...,...,...,...,...,...,...,...,...,...,...
600187,1681853938.0,-105.139678,40.596097,1047.0,16.383054,17.506307,821.295929,296.573505,8.572772,40.595943,-105.13943,3
600188,1681853938.0,-105.139211,40.595542,1010.0,16.383054,17.506307,821.295929,296.573505,8.572772,40.595943,-105.13943,3
600189,1681853938.0,-105.139714,40.595547,1026.0,16.383054,17.506307,821.295929,296.573505,8.572772,40.595943,-105.13943,3
600190,1681853938.0,-105.139144,40.596089,1067.0,16.383054,17.506307,821.295929,296.573505,8.572772,40.595943,-105.13943,3


In [None]:
#sort the dataframe by time
location_input = location_input.sort_values(by=['time'])
location_input.head()

Unnamed: 0,time,latitude,longitude,value,temperature,humidity,pressure,wind_direction,wind_speed,leak_latitude,leak_longitude,number_sources_leaking
0,1681776035.0,-105.14055,40.595561,1054.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
23,1681776035.0,-105.139678,40.596097,1128.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
22,1681776035.0,-105.139144,40.596089,975.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
21,1681776035.0,-105.139714,40.595547,1138.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2
20,1681776035.0,-105.139211,40.595542,1048.0,10.737401,31.740852,827.06209,320.568991,1.784585,40.595924,-105.13939,2


In [None]:
# categoririze the targets
location_input['number_sources_leaking'] = location_input['number_sources_leaking'].astype(int)
location_input['leak_latitude'] = location_input['leak_latitude'].astype(float)
location_input['leak_longitude'] = location_input['leak_longitude'].astype(float)

In [None]:
#normalize the longitude and latitude columns using min-max normalization
location_input['latitude'] = location_input['latitude'].astype(float)
location_input['longitude'] = location_input['longitude'].astype(float)

latitudeMin = location_input['latitude'].min()
latitudeMax = location_input['latitude'].max()
latitudeMid = latitudeMax-latitudeMin
longitudeMin = location_input['longitude'].min()
longitudeMax = location_input['longitude'].max()
longitudeMid = longitudeMax-longitudeMin

for x in range(0, len(location_input)):
    location_input.loc[x, 'latitude'] = (location_input.loc[x, 'latitude'] - latitudeMin) 
    location_input.loc[x, 'longitude'] = (location_input.loc[x, 'longitude'] - longitudeMin) 
    location_input.loc[x, 'leak_latitude'] = (location_input.loc[x, 'leak_latitude'] - latitudeMin) 
    location_input.loc[x, 'leak_longitude'] = (location_input.loc[x, 'leak_longitude'] - longitudeMin)
location_input.head()

Unnamed: 0,time,latitude,longitude,value,temperature,humidity,pressure,wind_direction,wind_speed,leak_latitude,leak_longitude,number_sources_leaking
0,1681776035.0,3.3e-05,1.9e-05,1054.0,10.737401,31.740852,827.06209,320.568991,1.784585,145.736507,-145.734932,2
23,1681776035.0,0.000905,0.000555,1128.0,10.737401,31.740852,827.06209,320.568991,1.784585,145.736507,-145.734932,2
22,1681776035.0,0.001439,0.000547,975.0,10.737401,31.740852,827.06209,320.568991,1.784585,145.736507,-145.734932,2
21,1681776035.0,0.000869,5e-06,1138.0,10.737401,31.740852,827.06209,320.568991,1.784585,145.736507,-145.734932,2
20,1681776035.0,0.001372,0.0,1048.0,10.737401,31.740852,827.06209,320.568991,1.784585,145.736507,-145.734932,2


In [None]:
# To normalize longitude and latitude using Min-Max normalization, we will
# adjust the range to [0, 1] for both. The longitude will be adjusted from
# [0, 360] to [0, 1], and latitude from [-90, 90] to [0, 1].

def min_max_normalize_longitude(longitude):
    # Adjust the range of longitude from [-180, 180] to [0, 360] for normalization
    adjusted_longitude = (longitude + 180) % 360
    # Normalize to [0, 1]
    return adjusted_longitude / 360

def min_max_normalize_latitude(latitude):
    # Adjust the range of latitude from [-90, 90] to [0, 180] for normalization
    adjusted_latitude = latitude + 90
    # Normalize to [0, 1]
    return adjusted_latitude / 180

# Apply the normalization to the DataFrame
location_input['longitude'] = location_input['longitude'].apply(min_max_normalize_longitude)
location_input['latitude'] = location_input['latitude'].apply(min_max_normalize_latitude)
location_input['leak_longitude'] = location_input['longitude'].apply(min_max_normalize_longitude)
location_input['leak_latitude'] = location_input['latitude'].apply(min_max_normalize_latitude)

location_input.head()

Unnamed: 0,time,latitude,longitude,value,temperature,humidity,pressure,wind_direction,wind_speed,leak_latitude,leak_longitude,number_sources_leaking
0,1681776035.0,0.5,0.5,1054.0,10.737401,31.740852,827.06209,320.568991,1.784585,0.502778,0.501389,2
23,1681776035.0,0.500005,0.500002,1128.0,10.737401,31.740852,827.06209,320.568991,1.784585,0.502778,0.501389,2
22,1681776035.0,0.500008,0.500002,975.0,10.737401,31.740852,827.06209,320.568991,1.784585,0.502778,0.501389,2
21,1681776035.0,0.500005,0.5,1138.0,10.737401,31.740852,827.06209,320.568991,1.784585,0.502778,0.501389,2
20,1681776035.0,0.500008,0.5,1048.0,10.737401,31.740852,827.06209,320.568991,1.784585,0.502778,0.501389,2


In [None]:
#import the pickle models leak location, leakand run the information on it
import pickle
X = location_input.drop(columns=['leak_latitude', 'leak_longitude', 'number_sources_leaking'])
y1 = location_input['leak_latitude']
y2 = location_input['leak_longitude']
y3 = location_input['number_sources_leaking']

# import the models
leak_latitude_regressor = pickle.load(open('Leak_Latitude_regressor.sav', 'rb'))
leak_longitude_regressor = pickle.load(open('Leak_Longitude_regressor.sav', 'rb'))
num_leaks_regressor = pickle.load(open('Num_Leaks_regressor.sav', 'rb'))

# run the models
leak_latitude = leak_latitude_regressor.predict(X)
leak_longitude = leak_longitude_regressor.predict(X)
number_leaks = num_leaks_regressor.predict(X)

print(leak_latitude)




ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- time


In [13]:
import sys
import os
import pickle
import pandas as pd
import numpy as np
import sklearn

def open_csv(sensor_file_path):
    sensor_df = pd.read_csv(sensor_file_path)
    return sensor_df

def process_weather_data(weather_data):
    # convert timestamp to POSIX time format
    weather_data['timestamp'] = pd.to_datetime(weather_data['timestamp'])
    weather_data['timestamp'] = weather_data['timestamp'].apply(lambda x: x.timestamp())

    # change the data type of the timestamp column to int
    weather_data['timestamp'] = weather_data['timestamp'].astype(int)

    weather_data.set_index('timestamp', inplace=True)

    # average all rows with same timestamp
    weather_data = weather_data.groupby('timestamp').mean()
    return weather_data

def min_max_normalize_longitude(longitude):
    # Adjust the range of longitude from [-180, 180] to [0, 360] for normalization
    adjusted_longitude = (int(longitude) + 180) % 360
    # Normalize to [0, 1]
    return adjusted_longitude / 360

def min_max_normalize_latitude(latitude):
    # Adjust the range of latitude from [-90, 90] to [0, 180] for normalization
    adjusted_latitude = int(latitude) + 90
    # Normalize to [0, 1]
    return adjusted_latitude / 180

def process_sensor_data(sensor_df, weather_data):
   # create a numpy array with the same shape as detection_input
    data = np.empty((24*sensor_df.shape[0], 9), dtype=object)

    # loop through each row in sensor_readings dataframe
    outer_index = 0
    for index, row in sensor_df.iterrows():
        # loop through each sensor in the row
        for col in sensor_df.columns:
            if col not in ['Unnamed: 0', 'time']:
                longitude = col.split('_')[1]
                latitude = col.split('_')[2]
                value = row[col]

                #find weather data based on the minute of the timestamp
                weather_data_row = weather_data.loc[int(row['time'] // 60 * 60)]
                
                #unpack weather data
                temperature = weather_data_row['Temperature']
                humidity = weather_data_row['Humidity']
                pressure = weather_data_row['Barometric_Pressure']
                wind_direction = weather_data_row['Wind_Direction']
                wind_speed = weather_data_row['Wind_Speed']
        
                data[outer_index] = [row['time'], latitude, longitude, value, temperature, humidity, pressure, wind_direction, wind_speed]
                outer_index += 1
                
        if index % 10000 == 0:
            print(index)

    #convert to dataframe
    data = pd.DataFrame(data, columns=['time', 'latitude', 'longitude', 'value', 'temperature', 'humidity', 'pressure', 'wind_direction', 'wind_speed'])

    #drop rows with NaN values
    data = data.dropna()

    #convert to numpy array
    data = data.to_numpy()

    #convert to dataframe
    location_input = pd.DataFrame(data, columns=['time', 'latitude', 'longitude', 'value', 'temperature', 'humidity', 'pressure', 'wind_direction', 'wind_speed'])

    # To normalize longitude and latitude using Min-Max normalization, we will
    # adjust the range to [0, 1] for both. The longitude will be adjusted from
    # [0, 360] to [0, 1], and latitude from [-90, 90] to [0, 1].

    # Apply the normalization to the DataFrame
    location_input['longitude'] = location_input['longitude'].apply(min_max_normalize_longitude)
    location_input['latitude'] = location_input['latitude'].apply(min_max_normalize_latitude)

    return location_input

def load_model(model_name):
    # load the model from disk
    loaded_model = pickle.load(open(model_name, 'rb'))
    return loaded_model

def predict_location(location_input, loaded_model1, loaded_model2, loaded_model3):
#drop time column
    # make predictions on the input data
    time = location_input['time']
    location_input = location_input.drop(columns=['time'])
    
    leak_latitude = loaded_model1.predict(location_input)
    leak_longitude = loaded_model2.predict(location_input)
    number_leaks = loaded_model3.predict(location_input)

    print(leak_latitude)
    print(leak_longitude)
    print(number_leaks)
    
    return leak_latitude, leak_longitude, number_leaks


In [14]:
print("made it into here")

# get input file names
sensor_data = 'sensor_readings.csv'
model1_name = './models/Leak_Latitude_regressor.sav'
model2_name = './models/Leak_Longitude_regressor.sav'
model3_name = './models/Num_Leaks_regressor.sav'
weather = 'weather_data.csv'

# process inputs to fit models
sensor_df = open_csv(sensor_data)
weather_data = open_csv(weather)
print(weather_data)
weather_data = process_weather_data(weather_data)
print(weather_data)
location_input = process_sensor_data(sensor_df, weather_data)

# load model
loaded_model1 = load_model(model1_name)
loaded_model2 = load_model(model2_name)
loaded_model3 = load_model(model3_name)

# make predictions
list_of_leaks = predict_location(location_input, loaded_model1, loaded_model2, loaded_model3)

# print results
# print(f'List of leaks: {list_of_leaks}')

#calculate continuous intervals srtart times and end times of leaks
continuous_intervals = []
start_time = None
end_time = None
for time in list_of_leaks:
    if start_time == None:
        start_time = time
        end_time = time
    elif time == end_time + 1:
        end_time = time
    else:
        continuous_intervals.append((start_time, end_time))
        start_time = time
        end_time = time

# print results
print(f'Continuous intervals: {continuous_intervals}')

made it into here
           timestamp  Barometric_Pressure   Humidity  Temperature  \
0       4/18/23 0:00           827.064433  32.200298    10.971487   
1       4/18/23 0:00           827.073897  32.200297    11.116626   
2       4/18/23 0:00           827.301068  32.138772    10.861056   
3       4/18/23 0:00           827.073913  32.100913    11.012511   
4       4/18/23 0:00           827.092847  31.999946    10.969914   
...              ...                  ...        ...          ...   
86396  4/18/23 23:59           822.492586  36.295757    11.316986   
86397  4/18/23 23:59           822.464176  36.295757    11.122942   
86398  4/18/23 23:59           822.473646  36.295757    11.585178   
86399  4/18/23 23:59           822.568260  36.311528    11.722426   
86400  4/18/23 23:59           822.511463  36.298905    11.616725   

       Wind_Direction  Wind_Speed  
0          306.335774    1.836502  
1          311.523873    1.875021  
2          324.694315    1.966547  
3        

TypeError: can only concatenate str (not "int") to str