In [None]:
!pip install sweetviz
!pip install h2o

Collecting sweetviz
  Downloading sweetviz-2.2.1-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.2.1
Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=cea3e2d0aebb1704647397b248ec7c5f7364e694d835d462b95cda375277144b
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [None]:
from google.colab import drive

# Directory under which Google Drive is exposed inside the Colab runtime;
# every data path used later in the notebook starts from here.
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

Mounted at /content/gdrive


**Inspecting the Dataset**

In [None]:
import pandas as pd

# Full path of the source CSV on the mounted Google Drive
csv_path = '/content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-3/london_merged.csv'

# Read the raw table; `timestamp` is left as plain text because no date
# parsing is requested here.
data = pd.read_csv(csv_path)

# Preview the first five rows as a quick sanity check of columns and values
data.head(5)


Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


**Data Preprocessing and Cleaning**

In [None]:
# Count the null entries in every column (`isna` is the same check as `isnull`)
missing_data = data.isna().sum()

# Show the per-column totals; all zeros means no imputation is needed
missing_data


timestamp       0
cnt             0
t1              0
t2              0
hum             0
wind_speed      0
weather_code    0
is_holiday      0
is_weekend      0
season          0
dtype: int64

In [None]:
import sweetviz as sv

# Destination of the standalone HTML report on the mounted Google Drive
report_path = "/content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-3/london_eda_report.html"

# Profile every column of the loaded frame
report = sv.analyze(data)

# Write the report to disk only; open_browser=False because there is no
# browser to launch from inside the hosted runtime.
report.show_html(filepath=report_path, open_browser=False)


                                             |          | [  0%]   00:00 -> (? left)

Report /content/gdrive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-3/london_eda_report.html was generated.


**Clustering and Anomaly Detection**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
import numpy as np

# Build the feature matrix used for both clustering and anomaly detection.
# - 'timestamp' is a unique identifier, so it is excluded.
# - 'cluster' and 'anomaly' are the label columns this very cell writes back
#   into `data` below. Dropping them here (errors='ignore' covers the first
#   run, when they do not exist yet) keeps the cell idempotent: re-running it
#   can no longer feed its own previous labels back in as features.
# Note that the target column 'cnt' is part of this matrix.
data_for_clustering = data.drop(
    columns=['timestamp', 'cluster', 'anomaly'], errors='ignore'
)

# Standardize the data so every column contributes on a comparable scale
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_for_clustering)

# Use KMeans for clustering.
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# in scikit-learn 1.4, so relying on it makes the cluster labels depend on the
# installed version (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
data['cluster'] = kmeans.fit_predict(data_scaled)

# Use Isolation Forest for anomaly detection; contamination=0.05 asks the
# model to flag roughly 5% of the rows.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
data['anomaly'] = iso_forest.fit_predict(data_scaled)

# The anomalies are denoted as -1 by the Isolation Forest model
anomalies = data[data['anomaly'] == -1]

# Number of anomalies detected
num_anomalies = anomalies.shape[0]
num_anomalies




871

**Feature Processing and Selection**

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Columns that must not be used as predictors: the identifier ('timestamp'),
# the labels derived in the previous section ('anomaly', 'cluster') and the
# target itself ('cnt').
non_feature_cols = ['timestamp', 'anomaly', 'cluster', 'cnt']
X = data.drop(columns=non_feature_cols)
y = data['cnt']

# k='all' keeps every feature: the selector is used here only to obtain the
# univariate f_regression score of each predictor against the target.
selector = SelectKBest(score_func=f_regression, k='all').fit(X, y)

# Pair every feature name with its score, then rank from strongest to weakest
score_table = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
feature_scores = score_table.sort_values(by='Score', ascending=False)

feature_scores


Unnamed: 0,Feature,Score
2,hum,4748.490046
0,t1,3100.802043
1,t2,2745.133007
4,weather_code,497.277531
3,wind_speed,238.71857
7,season,238.240235
6,is_weekend,163.666007
5,is_holiday,46.662016


**Running AutoML with H2O**

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Start a local H2O cluster (or attach to one already running) with a 4 GB
# memory cap.
h2o.init(max_mem_size="4G", nthreads=-1)

# Hand H2O only the modelling columns: the identifier and the labels derived
# in the clustering section are removed before the conversion.
modelling_df = data.drop(columns=['timestamp', 'anomaly', 'cluster'])
h2o_frame = h2o.H2OFrame(modelling_df)

# 80/20 train/test split, seeded for repeatability
train, test = h2o_frame.split_frame(ratios=[0.8], seed=42)

# Response is the count column; every other column is a predictor
y = "cnt"
x = [col for col in train.columns if col != y]

# AutoML search, capped at 20 models and 300 seconds, whichever is hit first
aml = H2OAutoML(max_models=20, seed=42, max_runtime_secs=300)
aml.train(x=x, y=y, training_frame=train)

# Print the complete leaderboard (all rows, not just the default preview)
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmprx50mtg3
  JVM stdout: /tmp/tmprx50mtg3/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmprx50mtg3/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,13 days
H2O_cluster_name:,H2O_from_python_unknownUser_mntme9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                            rmse              mse      mae      rmsle    mean_residual_deviance
GBM_3_AutoML_1_20231030_83512                    862.058  743144           614.473  nan                 743144
GBM_4_AutoML_1_20231030_83512                    862.264  743499           610.837  nan                 743499
GBM_5_AutoML_1_20231030_83512                    862.803  744430           618.57     1.13544           744430
GBM_2_AutoML_1_20231030_83512                    863.111  744961           617.062  nan                 744961
GBM_1_AutoML_1_20231030_83512                    865.387  748895           620.323    1.1333            748895
GBM_grid_1_AutoML_1_20231030_83512_model_1       867.195  752027           621.855  nan                 752027
GBM_grid_1_AutoML_1_2023