In [143]:
import requests
import pandas as pd
from tqdm import tqdm

base_url = "http://agbc-fe.pdn.ac.lk/api/v1/data/?sensor=10008&date="

start_date = pd.to_datetime("2020-10-22")
end_date = pd.to_datetime("2020-12-30")

date_range = pd.date_range(start=start_date, end=end_date, freq="D")

all_data = []

for date in tqdm(date_range, desc="Progress", unit="day"):
    date_str = date.strftime("%Y-%m-%d")
    url = base_url + date_str

    try:
        response = requests.get(url)
        data = response.json()
        all_data.extend(data['data'])
    except:
        print(f"Error: Could not retrieve data for date {date_str}")
        continue
    

df = pd.DataFrame(all_data, dtype=str)
df.to_csv('dataws.csv', index=False)


Progress:  14%|█▍        | 10/70 [00:05<00:26,  2.30day/s]

Error: Could not retrieve data for date 2020-10-31


Progress: 100%|██████████| 70/70 [00:43<00:00,  1.61day/s]


In [144]:
# check for missing values
print(df.isnull().sum())

siteId       0
seqNo        0
date         0
time         0
temp1        0
temp2        0
temp3        0
humidity1    0
humidity2    0
humidity3    0
light        0
dtype: int64


In [145]:
# drop rows with missing values
df.dropna(inplace=True)

In [146]:
# Drop duplicate rows
df=df.drop_duplicates(keep='first')

In [147]:
import numpy as np
# Replace '?' with NaN

df.replace(' ?', np.nan, inplace=True)



print(df.tail(10))


       siteId  seqNo         date       time  temp1  temp2  temp3 humidity1  \
156689      0   2869   2020-12-30   23:55:03    NaN     24   22.9       NaN   
156690      0   2870   2020-12-30   23:55:31   22.8   24.1   22.9      99.9   
156691      0   2871   2020-12-30   23:56:01   22.8     24   22.8      99.9   
156692      0   2872   2020-12-30   23:56:33    NaN     24   22.8       NaN   
156693      0   2873   2020-12-30   23:57:03    NaN   23.9   22.7       NaN   
156694      0   2874   2020-12-30   23:57:33    NaN     24   22.8       NaN   
156695      0   2875   2020-12-30   23:58:01   22.8     24   22.8      99.9   
156696      0   2876   2020-12-30   23:58:31   22.8     24   22.8      99.9   
156697      0   2877   2020-12-30   23:59:01   22.8     24   22.8      99.9   
156698      0   2878   2020-12-30   23:59:33    NaN   23.9   22.8       NaN   

       humidity2 humidity3   light  
156689        95        95   1.042  
156690        95        95   1.042  
156691        95   

In [148]:
# Drop rows containing missing values


In [149]:
# Convert temperature columns to numeric
df['temp1'] = pd.to_numeric(df['temp1'], errors='coerce')
df['temp2'] = pd.to_numeric(df['temp2'], errors='coerce')
df['temp3'] = pd.to_numeric(df['temp3'], errors='coerce')

# Convert temperature columns to numeric
df['humidity1'] = pd.to_numeric(df['humidity1'], errors='coerce')
df['humidity2'] = pd.to_numeric(df['humidity2'], errors='coerce')
df['humidity3'] = pd.to_numeric(df['humidity3'], errors='coerce')

df['seqNo'] = pd.to_numeric(df['seqNo'], errors='coerce')

# Calculate the average temperature
df['average_internal_temp'] = df[['temp1', 'temp2', 'temp3']].mean(axis=1,skipna=True)

# Calculate the average humidity
df['average_internal_humidity'] = df[['humidity1', 'humidity2', 'humidity3']].mean(axis=1,skipna=True)

# Create a new DataFrame with only the desired columns
new_df = df[['seqNo','date','time','average_internal_temp', 'average_internal_humidity', 'light']]


print(new_df.head())



   seqNo         date       time  average_internal_temp  \
0      1   2020-10-22   00:00:01              23.433333   
1      2   2020-10-22   00:00:18              23.550000   
2      3   2020-10-22   00:00:31              23.433333   
3      4   2020-10-22   00:00:46              23.466667   
4      5   2020-10-22   00:01:01              23.400000   

   average_internal_humidity   light  
0                  96.300000   1.042  
1                  95.000000   1.042  
2                  96.266667   1.042  
3                  96.200000   1.042  
4                  96.166667   1.042  


<h2> Create a Data frame for Internal Sensor 10008 data </h2>

In [150]:
# Combine the 'date' and 'time' columns into a single datetime column
new_df['datetime'] = pd.to_datetime(new_df['date'] + ' ' + new_df['time'])
# Set the 'time' column as the DataFrame index
new_df.set_index('datetime', inplace=True)
new_df.drop(['date', 'time','seqNo'], axis=1, inplace=True)
# Resample the DataFrame using 'H' offset alias and select the first entry from each hour
new_df_hourly = new_df.resample('H').first()

# new_df_hourly.reset_index(inplace=True)
# Print the resulting DataFrame
print(new_df_hourly.head())

new_df_hourly.to_csv('sensor10008.csv', index=False)

                     average_internal_temp  average_internal_humidity   light
datetime                                                                     
2020-10-22 00:00:00              23.433333                  96.300000   1.042
2020-10-22 01:00:00              23.300000                  96.366667   1.042
2020-10-22 02:00:00              23.000000                  95.000000   1.042
2020-10-22 03:00:00              22.600000                  96.633333   1.042
2020-10-22 04:00:00              22.900000                  95.000000   1.042


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['datetime'] = pd.to_datetime(new_df['date'] + ' ' + new_df['time'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.drop(['date', 'time','seqNo'], axis=1, inplace=True)


<h2> Create a Data frame for External Environmental data </h2>

In [151]:
# Load the CSV file into a DataFrame
external_weather = pd.read_csv('weather_data.csv')

# Combine the 'Date' and 'Time' columns into a single datetime column
external_weather['datetime'] = pd.to_datetime(external_weather['Date'] + ' ' + external_weather['Time'])

external_weather.drop(["Time","Date"],axis=1,inplace=True)

external_weather.set_index('datetime', inplace=True)

merged_df = pd.merge(external_weather, new_df_hourly, on='datetime')

# Drop rows with any null values
merged_df.dropna(inplace=True)

merged_df.to_csv('data_set.csv')

print(merged_df.head())




                     External Temperature  Feels Like  Pressure  \
datetime                                                          
2020-10-22 00:00:00                 22.69       23.21      1009   
2020-10-22 01:00:00                 22.71       23.46      1011   
2020-10-22 02:00:00                 25.06       25.89      1012   
2020-10-22 03:00:00                 25.66       26.24      1011   
2020-10-22 04:00:00                 25.83       26.77      1012   

                     External Humidity  Dew Point  Clouds  Wind Speed  \
datetime                                                                
2020-10-22 00:00:00                 84      19.85      40        0.00   
2020-10-22 01:00:00                 93      21.52      70        2.34   
2020-10-22 02:00:00                 87      22.74      69        2.51   
2020-10-22 03:00:00                 75      20.90      40        1.50   
2020-10-22 04:00:00                 88      23.69      65        4.38   

                  

In [152]:

from sklearn.model_selection import train_test_split

columns_to_drop = ['average_internal_temp', 'average_internal_humidity', 'light', 'Clouds', 'Wind Speed','Description']
X = merged_df.drop(columns_to_drop, axis=1)
print(X.dtypes)
y = merged_df[['average_internal_temp', 'average_internal_humidity', 'light']]




External Temperature    float64
Feels Like              float64
Pressure                  int64
External Humidity         int64
Dew Point               float64
dtype: object


In [153]:
# Splitting dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, )

In [154]:
# Training the linear regression model
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [155]:
model.fit(X_train, y_train)

In [156]:
model.score(X_test,y_test)

0.26777946278289905

In [159]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Assuming you have your input features in X and output features in y

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Create the decision tree regressor object
clf = DecisionTreeRegressor(max_depth=4, random_state=0)

# Fit the model to the training data
clf.fit(X_train, y_train)

In [161]:
from sklearn.metrics import r2_score
y_pred = clf.predict(X_test)
     
score = r2_score(y_test, y_pred)
print("Model Score:", score)


Model Score: 0.31102148585410255
