<a href="https://colab.research.google.com/github/sheddie00/learn-git/blob/main/SupervisedCheckpoint_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Working on the '5G-Energy consumption' dataset

The objective is to build and train a ML model to estimate the energy consumed by different 5G base stations taking into consideration the impact of various engineering configurations, traffic conditions, and energy-saving methods.

In [None]:
# Load the dataset and perform EDA
from google.colab import files
uploaded = files.upload()

Saving 5G_energy_consumption_dataset.csv to 5G_energy_consumption_dataset.csv


In [None]:
import pandas as pd

df = pd.read_csv('5G_energy_consumption_dataset.csv')
display(df.head())

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


 # Display General Information About the Dataset

In [None]:
import pandas as pd

# Show basic structure and data types
df.info()

# Show summary statistics
df.describe()

# Show first few rows
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


 Step 2: Create a Pandas Profiling Report

In [None]:
!pip install ydata-profiling
from ydata_profiling import ProfileReport

# Create the report
profile = ProfileReport(df, title="5G Energy Consumption Profiling Report", explorative=True)

# Display it inside the notebook
profile.to_notebook_iframe()

Collecting ydata-profiling
  Downloading ydata_profiling-4.16.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting scipy<1.16,>=1.4.1 (from ydata-profiling)
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:00<00:02,  2.33it/s][A
100%|██████████| 6/6 [00:00<00:00,  8.21it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Step 3: Handle Missing and Corrupted Values

In [None]:
# Check for missing values
df.isnull().sum()

# Option 1: Drop rows with any missing values (if few)
# df = df.dropna()

# Option 2: Fill missing values (if many)
# df.fillna(df.median(), inplace=True)  # or use df.mean(), or a constant


Unnamed: 0,0
Time,0
BS,0
Energy,0
load,0
ESMODE,0
TXpower,0


Step 4: Remove Duplicates

In [None]:
# Check for duplicates
print(f"Duplicates: {df.duplicated().sum()}")

# Remove them
df = df.drop_duplicates()


Duplicates: 0


Step 5: Handle Outliers

In [None]:
# Remove outliers using IQR method for all numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]


Step 6: Encode Categorical Features

In [None]:
# Identify categorical columns
cat_cols = df.select_dtypes(include='object').columns

# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)


# Select Target and Features

Assuming the dataset includes a column like Energy_Consumption (or similar), we’ll define:

Target variable → what we’re trying to predict

Features → all other columns that influence the prediction

In [None]:
# Select target and features
target = 'Energy'
features = [col for col in df_encoded.columns if col != target]

In [None]:
# The 'Time' column was already dropped during one-hot encoding.
# We can directly define the target and features from the df_encoded DataFrame.

# Step 1: Define target and features
target = 'Energy'
features = [col for col in df_encoded.columns if col != target]

# Step 2: Split into X (features) and y (target)
X = df_encoded[features]
y = df_encoded[target]

print("Features and target have been successfully created.")

Features and target have been successfully created.


# Split the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (64982, 1052)
Test set size: (16246, 1052)


# Selected a ML regression algorithm and trained it on the training set

From earlier steps (profiling, checking types, and columns like load, TXpower, etc.), your dataset:

Has numeric features

Shows no strong nonlinearity at this stage

Is a good fit for a linear model to start with

So a good first choice is:  Linear Regression



In [None]:
from sklearn.linear_model import LinearRegression

# Step 1: Create the model
model = LinearRegression()

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: View coefficients (optional)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)


Intercept: -374.74641084332575
Coefficients: [ 1.99150020e+01 -8.33466629e-12  5.99121681e+01 ... -4.79384814e+01
 -4.71190700e+01 -3.20781904e+01]


# Assess the model performance on the test set using relevant evaluation metrics

In [None]:
# Step 1: # Predict using the trained model
y_pred = model.predict(X_test)


In [None]:
# step 2: Calculate evaluation metrics

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f" Mean Absolute Error (MAE): {mae:.2f}")
print(f" Mean Squared Error (MSE): {mse:.2f}")
print(f" Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f" R² Score: {r2:.2f}")


 Mean Absolute Error (MAE): 1.74
 Mean Squared Error (MSE): 5.92
 Root Mean Squared Error (RMSE): 2.43
 R² Score: 0.96


Proposed alternative ways to improve my model performance

 Tried Regularized Models

In [None]:
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)


In [None]:
# Make prediction after fiting the lasso
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
# Print them out
print(f"Ridge R² Score: {r2_score(y_test, y_pred_ridge):.2f}")
print(f"Lasso R² Score: {r2_score(y_test, y_pred_lasso):.2f}")

Ridge R² Score: 0.96
Lasso R² Score: 0.46
