In [None]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Endpoint for sensor 10008; the day being requested is appended to this prefix.
base_url = "http://agbc-fe.pdn.ac.lk/api/v1/data/?sensor=10008&date="

# Inclusive bounds of the collection window.
start_date = pd.Timestamp("2020-10-22")
end_date = pd.Timestamp("2021-02-05")

# One timestamp per calendar day between the two bounds.
date_range = pd.date_range(start_date, end_date, freq="D")

# Accumulator that collects the records returned for every day.
all_data = list()

def fetch_data(date):
    """Download the records of a single day from the sensor API.

    Parameters
    ----------
    date : datetime-like
        Day to request; it is formatted as YYYY-MM-DD and appended to
        ``base_url``.

    Returns
    -------
    list
        The value stored under the ``'data'`` key of the JSON response, or an
        empty list when the request or the decoding of its payload fails.
    """
    date_str = date.strftime("%Y-%m-%d")
    url = base_url + date_str

    try:
        # Without a timeout a stalled connection would block its worker forever.
        response = requests.get(url, timeout=30)
        # Surface HTTP error statuses instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()['data']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Best effort: a failed day is reported and skipped. Only the expected
        # network / payload errors are caught, so KeyboardInterrupt and
        # programming errors still propagate.
        print(f"Error: Could not retrieve data for date {date_str} ({type(exc).__name__}: {exc})")
        return []

start_time = time.time()  # Wall-clock time just before the downloads start

# max_workers=None lets ThreadPoolExecutor pick its own default pool size.
# Using the executor as a context manager guarantees the pool is shut down
# (worker threads joined) even when one of the fetches raises.
with ThreadPoolExecutor(max_workers=None) as executor, \
        tqdm(total=len(date_range), desc="Progress", unit="day") as pbar:
    # Submit one fetch_data task per day; the tasks run concurrently
    futures = [executor.submit(fetch_data, date) for date in date_range]

    # Consume the futures in submission order so the collected records follow
    # the order of date_range
    for future in futures:
        all_data.extend(future.result())
        pbar.update(1)

end_time = time.time()  # Wall-clock time once every day has been collected
execution_time = end_time - start_time

print("Execution Time:", execution_time, "seconds")

# Build the DataFrame from the collected records (values are requested as
# strings) and persist the raw download
df = pd.DataFrame(all_data, dtype=str)
df.to_csv('dataws.csv', index=False)


In [None]:
# Count the missing entries in each column and show the tally
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Discard every row that holds at least one missing value (defaults spelled out)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Remove repeated rows; the first occurrence of each row is the one retained
df = df.drop_duplicates()

In [None]:
import numpy as np

# Treat any cell that consists solely of '?' as missing, whatever whitespace
# surrounds it. The previous exact match on ' ?' (leading space) missed a bare
# '?' even though that is what the comment described.
# NOTE(review): rows made incomplete here are not removed by the earlier
# dropna() cell - confirm the later dropna on the merged frame is sufficient.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

print(df.tail(10))


<h2>Create a new DataFrame with average temperature and average humidity values</h2>

In [None]:
# The frame was built with dtype=str, so every reading used numerically has to
# be converted. 'light' is included because it is later used as a regression
# target and was previously left as text. Values that cannot be parsed become NaN.
numeric_columns = [
    'temp1', 'temp2', 'temp3',              # temperature probes
    'humidity1', 'humidity2', 'humidity3',  # humidity probes
    'light',
    'seqNo',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Row-wise mean of the three temperature probes (NaN readings are ignored)
df['average_internal_temp'] = df[['temp1', 'temp2', 'temp3']].mean(axis=1, skipna=True)

# Row-wise mean of the three humidity probes (NaN readings are ignored)
df['average_internal_humidity'] = df[['humidity1', 'humidity2', 'humidity3']].mean(axis=1, skipna=True)

# Keep only the columns needed downstream. The explicit copy makes new_df an
# independent frame, so later column assignments on it neither touch df nor
# raise SettingWithCopyWarning.
new_df = df[['seqNo', 'date', 'time', 'average_internal_temp', 'average_internal_humidity', 'light']].copy()


print(new_df.head())



<h2>Create a DataFrame for internal sensor 10008 data</h2>

In [None]:
# Combine the 'date' and 'time' columns into a single timestamp per row
timestamps = pd.to_datetime(new_df['date'] + ' ' + new_df['time'])

# Index the readings by timestamp and drop the columns that are no longer
# needed. assign/set_index/drop build a new frame instead of writing into a
# slice, which avoids SettingWithCopyWarning.
new_df = (
    new_df
    .assign(datetime=timestamps)
    .set_index('datetime')
    .drop(columns=['date', 'time', 'seqNo'])
)

# Downsample to one row per hour, keeping the first reading of each hour.
# The lowercase 'h' alias replaces 'H', which newer pandas releases deprecate.
new_df_hourly = new_df.resample('h').first()

# Write the datetime index as well: with index=False the exported file had no
# timestamps, so its rows could not be placed in time.
new_df_hourly.to_csv('sensor10008.csv')

<h2>Create a DataFrame for external environmental data</h2>

In [None]:
# Read the external weather observations from disk
external_weather = pd.read_csv('weather_data.csv')

# Build one timestamp per observation from its 'Date' and 'Time' text columns
weather_timestamps = pd.to_datetime(external_weather['Date'] + ' ' + external_weather['Time'])

# Index the observations by that timestamp and discard the two source columns
external_weather = (
    external_weather
    .assign(datetime=weather_timestamps)
    .drop(columns=["Time", "Date"])
    .set_index('datetime')
)

# Pair each weather observation with the hourly sensor reading that shares its
# timestamp, then discard every pair that has a missing value
merged_df = pd.merge(external_weather, new_df_hourly, on='datetime').dropna()

merged_df.to_csv('data_set.csv')






In [None]:
# from pandas_profiling import ProfileReport

# # Create a profile report for your DataFrame
# profile = ProfileReport(merged_df)

# # Generate the report and save it as an HTML file
# profile.to_file('profile_report.html')

In [None]:
# Columns whose distributions are compared side by side
columns_to_plot = ['External Temperature', 'average_internal_temp', 'Feels Like']

# Draw the box plot on an explicitly created Axes
fig, ax = plt.subplots()
sns.boxplot(data=merged_df[columns_to_plot], ax=ax)

# Axis labels and title
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
ax.set_title('Box Plot')

# Render the figure
plt.show()

<h2>Extract features and target variables</h2>

In [None]:

from sklearn.model_selection import train_test_split


# Targets: the three internal readings the models have to predict
target_columns = ['average_internal_temp', 'average_internal_humidity', 'light']
# Features: the external weather measurements used as predictors
feature_columns = ['Feels Like', 'Pressure', 'External Humidity', 'Dew Point', 'Clouds', 'Wind Speed']

y = merged_df.loc[:, target_columns]
X = merged_df.loc[:, feature_columns]



<h2>Model trained with linear regression</h2>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12, test_size=0.2)

# Fit an ordinary least-squares model on the training rows
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the rows it was trained on and on the held-out rows
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

for label, value in (("Training Accuracy:", train_accuracy), ("Test Accuracy:", test_accuracy)):
    print(label, value)




<h2>Model trained with a decision tree regressor</h2>

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
import optuna

# Hold out 30% of the rows; they are used only for the final evaluation below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)


def objective(trial):
    """Score one hyperparameter candidate without looking at the test split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``max_depth`` and ``min_samples_leaf``
        (both integers in [2, 10]).

    Returns
    -------
    float
        Mean 5-fold cross-validated score of the candidate tree on the
        training split; the study maximizes this value.
    """
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10)

    candidate = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=10,
    )

    # Tuning against X_test would leak the held-out rows into model selection
    # and make the reported test score optimistic, so candidates are compared
    # by cross-validation inside the training split instead.
    return cross_val_score(candidate, X_train, y_train, cv=5).mean()


# Seeded sampler so that the search is repeatable from run to run
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=10))
study.optimize(objective, n_trials=100)

# Best hyperparameters found by the study
best_params = study.best_params


# Refit on the whole training split with the selected hyperparameters
clf = DecisionTreeRegressor(
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=10,
)
clf.fit(X_train, y_train)


print("Best Max Depth is : ",best_params['max_depth'])

print("Best Min Samples Leaf is : ",best_params['min_samples_leaf'])
train_accuracy = clf.score(X_train, y_train)
# The test split has not been used for tuning, so this score is an unbiased estimate
test_accuracy = clf.score(X_test, y_test)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)



<h2>Model trained with Lasso regression</h2>

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

# L1-regularised linear model; alpha sets the strength of the penalty
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)

# Predict the held-out rows and report the squared error of those predictions
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Coefficient of determination on the held-out rows
score = lasso.score(X_test, y_test)
print("Score:", score)

<h2>Model trained with Ridge regression</h2>

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the held-out rows
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# L2-regularised linear model; alpha sets the strength of the penalty
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Predict the held-out rows
y_pred = ridge.predict(X_test)

# Compare the predictions with the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("Mean Squared Error:", mse), ("R2 Score:", r2)):
    print(label, value)