In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
data = pd.read_csv("Steel_industry_data.csv")
print(data.isnull().sum())  # Count of missing values in each column
# data['date'] = pd.to_datetime(data['date'])
# y_train=data['Usage_kWh'].values
data = data.drop(columns=['date'])
X_train=data.values

data.head()


date                                    0
Usage_kWh                               0
Lagging_Current_Reactive.Power_kVarh    0
Leading_Current_Reactive_Power_kVarh    0
CO2(tCO2)                               0
Lagging_Current_Power_Factor            0
Leading_Current_Power_Factor            0
NSM                                     0
WeekStatus                              0
Day_of_week                             0
Load_Type                               0
dtype: int64


Unnamed: 0,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load


In [3]:
print(X_train)

[[3.17 2.95 0.0 ... 'Weekday' 'Monday' 'Light_Load']
 [4.0 4.46 0.0 ... 'Weekday' 'Monday' 'Light_Load']
 [3.24 3.28 0.0 ... 'Weekday' 'Monday' 'Light_Load']
 ...
 [3.78 3.17 0.07 ... 'Weekday' 'Monday' 'Light_Load']
 [3.78 3.06 0.11 ... 'Weekday' 'Monday' 'Light_Load']
 [3.67 3.02 0.07 ... 'Weekday' 'Monday' 'Light_Load']]


In [4]:
scaler = StandardScaler()

# Perform scaling and assign it after casting the columns to float
X_train[:, 1:7] = scaler.fit_transform(X_train[:, 1:7]).astype('float64')


print(X_train)
X_train.shape
# df=pd.DataFrame(X_train)
# df.to_csv('scaled_Steel_industry_data.csv', index=False, header=True)

[[3.17 -0.6185163432975694 -0.5213850478056785 ... 'Weekday' 'Monday'
  'Light_Load']
 [4.0 -0.5259110722465691 -0.5213850478056785 ... 'Weekday' 'Monday'
  'Light_Load']
 [3.24 -0.598278105253311 -0.5213850478056785 ... 'Weekday' 'Monday'
  'Light_Load']
 ...
 [3.78 -0.6050241846013972 -0.5119566216410291 ... 'Weekday' 'Monday'
  'Light_Load']
 [3.78 -0.6117702639494832 -0.5065689495469436 ... 'Weekday' 'Monday'
  'Light_Load']
 [3.67 -0.6142233837124237 -0.5119566216410291 ... 'Weekday' 'Monday'
  'Light_Load']]


(35040, 10)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[7,8])],remainder='passthrough')
X_train=ct.fit_transform(X_train)
print(X_train)
print(X_train.shape)
# df=pd.DataFrame(X_train)
# df.to_csv('scaled_Steel_industry_data.csv', index=False, header=True)


[[1.0 0.0 0.0 ... 0.5132676163619359 -1.6780152604563736 'Light_Load']
 [1.0 0.0 0.0 ... 0.5132676163619359 -1.6419289107691397 'Light_Load']
 [1.0 0.0 0.0 ... 0.5132676163619359 -1.6058425610819058 'Light_Load']
 ...
 [1.0 0.0 0.0 ... 0.5126109334762843 1.6780152604563736 'Light_Load']
 [1.0 0.0 0.0 ... 0.5119542505906323 1.7141016101436073 'Light_Load']
 [1.0 0.0 0.0 ... 0.5126109334762843 -1.7141016101436073 'Light_Load']]
(35040, 17)


In [6]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X_train[:,-1]=le.fit_transform(X_train[:,-1])
print(X_train)
type(X_train)
df=pd.DataFrame(X_train)
df.to_csv('scaled_Steel_industry_data.csv', index=False, header=True)

[[1.0 0.0 0.0 ... 0.5132676163619359 -1.6780152604563736 0]
 [1.0 0.0 0.0 ... 0.5132676163619359 -1.6419289107691397 0]
 [1.0 0.0 0.0 ... 0.5132676163619359 -1.6058425610819058 0]
 ...
 [1.0 0.0 0.0 ... 0.5126109334762843 1.6780152604563736 0]
 [1.0 0.0 0.0 ... 0.5119542505906323 1.7141016101436073 0]
 [1.0 0.0 0.0 ... 0.5126109334762843 -1.7141016101436073 0]]


In [7]:
# Ensure data_scaled is converted to a NumPy array before splitting
node_data = np.array_split(X_train, 8)


## SVR Implementation


In [8]:


# Function to perform Federated Averaging for SVR models
def federated_averaging(models):
    """Aggregate SVR model weights using the average."""
    # Aggregate intercepts
    intercept_avg = np.mean([model.intercept_ for model in models])

    # Collect and stack support vectors and dual coefficients
    support_vectors = np.vstack([model.support_vectors_ for model in models])
    dual_coefs = np.hstack([model.dual_coef_.ravel() for model in models])  # Flatten for consistency

    # Ensure support vectors and dual coefficients match in size
    min_size = min(support_vectors.shape[0], dual_coefs.shape[0])
    support_vectors = support_vectors[:min_size]
    dual_coefs = dual_coefs[:min_size]

    # Normalize dual coefficients
    dual_coefs_avg = dual_coefs / len(models)

    return support_vectors, dual_coefs_avg, intercept_avg

# Function to train the local SVR model on each node's data
def train_local_model(data):
    """Train the local SVR model."""
    y = data[:, 11]  # Target (Usage_kWh)
    X = np.concatenate((data[:, :11], data[:, 12:]), axis=1)  # Features

    # Initialize and train SVR model
    model = SVR(kernel='rbf')
    model.fit(X, y)
    return model

# Federated Learning Process
global_model = SVR(kernel='rbf')

for iteration in range(3):
    print(f"Iteration {iteration + 1}:")

    local_models = []

    # Train local models for each node
    for i, node in enumerate(node_data):
        start = iteration * 1460
        end = start + 1460
        node_subset = node[start:end]

        local_model = train_local_model(node_subset)
        local_models.append(local_model)

        # Evaluate local model
        X_eval = np.concatenate((node_subset[:, :11], node_subset[:, 12:]), axis=1)
        y_eval = node_subset[:, 11]
        y_pred = local_model.predict(X_eval)
        mse = mean_squared_error(y_eval, y_pred)
        print(f"  Node {i + 1} - MSE: {mse:.4f}")

    # Aggregate local model updates
    support_vectors, dual_coefs_avg, intercept_avg = federated_averaging(local_models)

    # Re-train global model using aggregated data
    global_model.fit(support_vectors, dual_coefs_avg)  # Fit with matching sizes
    global_model.intercept_ = intercept_avg

    print(f"Global model updated after iteration {iteration + 1}.\n")

# Evaluate final global model
y_pred = global_model.predict(X_train)
final_mse = mean_squared_error(y_train, y_pred)
print(f"Final Global Model MSE on Test Data: {final_mse:.4f}")


Iteration 1:
  Node 1 - MSE: 0.3363
  Node 2 - MSE: 0.1602
  Node 3 - MSE: 0.1396
  Node 4 - MSE: 0.1053
  Node 5 - MSE: 0.1872
  Node 6 - MSE: 0.0412
  Node 7 - MSE: 0.0419
  Node 8 - MSE: 0.1017
Global model updated after iteration 1.

Iteration 2:
  Node 1 - MSE: 0.5122
  Node 2 - MSE: 0.2017
  Node 3 - MSE: 0.1401
  Node 4 - MSE: 0.0777
  Node 5 - MSE: 0.1319
  Node 6 - MSE: 0.0424
  Node 7 - MSE: 0.0409
  Node 8 - MSE: 0.0454
Global model updated after iteration 2.

Iteration 3:
  Node 1 - MSE: 0.3128
  Node 2 - MSE: 0.1106
  Node 3 - MSE: 0.1305
  Node 4 - MSE: 0.0900
  Node 5 - MSE: 0.0613
  Node 6 - MSE: 0.0357
  Node 7 - MSE: 0.0520
  Node 8 - MSE: 0.0579
Global model updated after iteration 3.



ValueError: X has 17 features, but SVR is expecting 16 features as input.

## FL using SVR

In [None]:
# Initialize the global SVR model
global_model = SVR(kernel='rbf')

for iteration in range(3):
    print(f"Iteration {iteration + 1}:")

    local_models = []

    # Train each node's local model on a subset of 1460 rows
    for i, node in enumerate(node_data):
        start = iteration * 1460
        end = start + 1460
        node_subset = node[start:end]

        # Train the local SVR model
        local_model = train_local_model(node_subset)
        local_models.append(local_model)

        # Evaluate the local model
        X_eval = np.concatenate((node_subset[:, :11], node_subset[:, 12:]), axis=1)
        y_eval = node_subset[:, 11]
        y_pred = local_model.predict(X_eval)
        mse = mean_squared_error(y_eval, y_pred)
        print(f"  Node {i + 1} - MSE: {mse:.4f}")

    # Aggregate local model updates using Federated Averaging
    support_vectors, dual_coefs_avg, intercept_avg = federated_averaging(local_models)

    # Re-train the global model using aggregated data
    global_model.fit(support_vectors, dual_coefs_avg[0])  # Fit with aggregated data
    global_model.intercept_ = intercept_avg  # Set the aggregated intercept

    print(f"Global model updated after iteration {iteration + 1}.\n")

# Final evaluation on the test dataset
y_pred = global_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_pred)
print(f"Final Global Model MSE on Test Data: {final_mse:.4f}")


In [None]:
# Final Evaluation (Optional): Use some test data for evaluation
test_data = node_data[0][:1460]  # Example: Using first 1460 rows from node 1 as test data
# X_test = test_data[:, 1:]
# y_test = test_data[:, 1]
test_data

y_test = test_data[:, 11]   # Target (Usage_kWh)
    # Select all columns except the 12th column using slicing
X_test= np.concatenate((test_data[:, :11], test_data[:, 12:]), axis=1)

df=pd.DataFrame(y_test)
df.to_csv('test_data.csv', index=False, header=True)

y_pred = global_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_pred)
print(f"Final Global Model MSE on Test Data: {final_mse:.4f}")

In [13]:
output=np.concatenate((y_pred.reshape(-1,1),y_test.reshape(-1,1)),1)
type(output)
df=pd.DataFrame(output)
df.to_csv("output.csv", index=False, header=True)

In [None]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(r2)
#0.8983988342396683 for Linear Reg


In [15]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(y_pred,y_test,color='red')

plt.title("FL using Linear reg")
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()

## Without FL

In [30]:

without_FL=X_train

X_withoutFL= np.concatenate((without_FL[:, :11], without_FL[:, 12:]), axis=1)
y_withoutFL = without_FL[:, 11]   # Target (Usage_kWh)


In [31]:
from sklearn.model_selection import train_test_split

X_withoutFL_train, X_withoutFL_test,y_withoutFL_train,y_withoutFL_test = train_test_split(X_withoutFL,y_withoutFL,test_size=0.25,random_state=0)

In [None]:
X_withoutFL_test.shape

In [None]:
X_withoutFL_train.shape

In [None]:
y_withoutFL_test.shape

In [None]:
y_withoutFL_train.shape

In [36]:
#SVR
SVRregressor = SVR(kernel='rbf')

In [None]:
# regressor.fit(X_withoutFL_train,y_withoutFL_train)
SVRregressor.fit(X_withoutFL_train,y_withoutFL_train)

In [None]:
pred=SVRregressor.predict(X_withoutFL_test)

print(pred)

In [None]:
plt.scatter(pred,y_withoutFL_test,color='red')

plt.title("Training the same dataset without FL (SVR)")
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()

## without FL


In [None]:
plt.scatter(pred,y_withoutFL_test,color='red')

plt.title("Training the same dataset without FL (regular LR)")
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()

In [None]:
pred.shape

In [None]:
from sklearn.metrics import r2_score
r2 = r2_score(y_withoutFL_test, pred)
print(r2)