In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# mutual_info_classif, mutual_info_regression: Functions for calculating Mutual Information Between classes and the target
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load Dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATASET.csv', nrows=650)
df.shape

(650, 8)

In [None]:
df['Time slot'].fillna(method='ffill', inplace=True)
print(df)
df.head()

    Time slot  User ID  Generation     Demand  Shiftable load  Base load  \
0       07:00      252    3.026980  -9.214290        0.000100  12.266109   
1       07:00      379    2.374124   0.247903        0.000289   0.324715   
2       07:00      434    3.015627 -13.828098        0.000105  16.843621   
3       07:00     1718    3.337699  -8.378214        0.000102  11.233431   
4       07:00     1792    2.877441  -8.902859        3.742589   8.184614   
..        ...      ...         ...        ...             ...        ...   
645     19:00     3778    0.000000  -0.348429        0.152263   0.196166   
646     19:00     3831    0.000000  -0.379027        0.063883   0.315145   
647     19:00     3893    0.000000  -1.187785        0.623232   0.564553   
648     19:00     4213    0.000000  -1.721717        0.276747   1.444970   
649     19:00     4298    0.000000  -0.497921        0.000100   0.497821   

     Consumption  Class label  
0      12.266209            2  
1       0.325004       

Unnamed: 0,Time slot,User ID,Generation,Demand,Shiftable load,Base load,Consumption,Class label
0,07:00,252,3.02698,-9.21429,0.0001,12.266109,12.266209,2
1,07:00,379,2.374124,0.247903,0.000289,0.324715,0.325004,2
2,07:00,434,3.015627,-13.828098,0.000105,16.843621,16.843725,2
3,07:00,1718,3.337699,-8.378214,0.000102,11.233431,11.233533,2
4,07:00,1792,2.877441,-8.902859,3.742589,8.184614,11.927203,2


In [None]:

numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

In [None]:
data = df[numerical_features]

In [None]:
data.head()

Unnamed: 0,User ID,Generation,Demand,Shiftable load,Base load,Consumption,Class label
0,252,3.02698,-9.21429,0.0001,12.266109,12.266209,2
1,379,2.374124,0.247903,0.000289,0.324715,0.325004,2
2,434,3.015627,-13.828098,0.000105,16.843621,16.843725,2
3,1718,3.337699,-8.378214,0.000102,11.233431,11.233533,2
4,1792,2.877441,-8.902859,3.742589,8.184614,11.927203,2


In [None]:
# X = data
X = data.drop('Class label', axis=1)
# X = data.drop(['target','ID'], axis=1)
X.head()

Unnamed: 0,User ID,Generation,Demand,Shiftable load,Base load,Consumption
0,252,3.02698,-9.21429,0.0001,12.266109,12.266209
1,379,2.374124,0.247903,0.000289,0.324715,0.325004
2,434,3.015627,-13.828098,0.000105,16.843621,16.843725
3,1718,3.337699,-8.378214,0.000102,11.233431,11.233533
4,1792,2.877441,-8.902859,3.742589,8.184614,11.927203


In [None]:
y = data['Class label']
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class label, dtype: int64

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((455, 6), (455,), (195, 6), (195,))

In [None]:
# Calculate Mutual Information between each feature and the target
mutual_info = mutual_info_classif(X_train.fillna(0), y_train)
mutual_info

array([0.43625281, 0.28830756, 0.19991799, 0.11108722, 0.1499729 ,
       0.13633108])

In [None]:
# Create Feature Target Mutual Information Series
mi_series = pd.Series(mutual_info)
mi_series.index = X_train.columns
mi_series.sort_values(ascending=False)

User ID           0.436253
Generation        0.288308
Demand            0.199918
Base load         0.149973
Consumption       0.136331
Shiftable load    0.111087
dtype: float64

In [None]:
#mi_series.sort_values(ascending=False).plot.bar(figsize=(20,8))

As we can see in the plot above from left to right, the features which have the most amount of mutual information is on the left and the features with the least amount of mutual information are on the right. Some of the features contribute a lot to the mutual information whereas some of them do not contribute anything. So, to select the important features from this list, we can set a threshold like maybe select the first 10 percentile of features or first 20 features etc.

To do this we can use a combination of "SelectKbest" or "SelectPercentile".

Mutual Information using Regression




In [None]:
df.head()

Unnamed: 0,Time slot,User ID,Generation,Demand,Shiftable load,Base load,Consumption,Class label
0,07:00,252,3.02698,-9.21429,0.0001,12.266109,12.266209,2
1,07:00,379,2.374124,0.247903,0.000289,0.324715,0.325004,2
2,07:00,434,3.015627,-13.828098,0.000105,16.843621,16.843725,2
3,07:00,1718,3.337699,-8.378214,0.000102,11.233431,11.233533,2
4,07:00,1792,2.877441,-8.902859,3.742589,8.184614,11.927203,2


In [None]:
# Get Numerical features from dataset# Get N
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

In [None]:
data = df[numerical_features]

In [None]:
data.head()

Unnamed: 0,User ID,Generation,Demand,Shiftable load,Base load,Consumption,Class label
0,252,3.02698,-9.21429,0.0001,12.266109,12.266209,2
1,379,2.374124,0.247903,0.000289,0.324715,0.325004,2
2,434,3.015627,-13.828098,0.000105,16.843621,16.843725,2
3,1718,3.337699,-8.378214,0.000102,11.233431,11.233533,2
4,1792,2.877441,-8.902859,3.742589,8.184614,11.927203,2


In [None]:
# X = data.drop(['SalePrice'], axis=1)
# X = data
X = data.drop('Class label', axis=1)
X.head()

Unnamed: 0,User ID,Generation,Demand,Shiftable load,Base load,Consumption
0,252,3.02698,-9.21429,0.0001,12.266109,12.266209
1,379,2.374124,0.247903,0.000289,0.324715,0.325004
2,434,3.015627,-13.828098,0.000105,16.843621,16.843725
3,1718,3.337699,-8.378214,0.000102,11.233431,11.233533
4,1792,2.877441,-8.902859,3.742589,8.184614,11.927203


In [None]:
y = data['Class label']
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class label, dtype: int64

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((455, 6), (455,), (195, 6), (195,))

In [None]:
# Mutual Information Regressor
# Calculate Mutual Information between each feature and the target
mutual_info = mutual_info_regression(X_train.fillna(0), y_train)
mutual_info

array([0.49961575, 0.24445169, 0.19991799, 0.11108722, 0.1499729 ,
       0.13633108])

In [None]:
# Select K best features
k_percentile_features = SelectPercentile(mutual_info_classif, percentile=80).fit(X_train.fillna(0), y_train)
print('Selected top 10 percentile features: {}'.format(X_train.columns[k_percentile_features.get_support()]))

Selected top 10 percentile features: Index(['User ID', 'Generation', 'Demand', 'Base load'], dtype='object')


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Step 3: Model Training
# Initialize the machine learning model (Logistic Regression)
model = LogisticRegression(random_state=42)

In [None]:
# Train the model using the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Step 4: Model Evaluation
# Predict the labels for the testing data
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8615384615384616


In [None]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       0.91      0.92      0.91       157
           2       0.65      0.63      0.64        38

    accuracy                           0.86       195
   macro avg       0.78      0.77      0.78       195
weighted avg       0.86      0.86      0.86       195



In [None]:
# Make predictions for new data
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/new data.csv')  # Replace 'new_data.csv' with your new data file

new_data = new_data.drop('Time slot', axis=1)

# Assuming 'new_data' has the same format as your training data (same features, same preprocessing)
# You can preprocess the new data similarly to the training data

# Separate the features from the new data
X_new = new_data.drop('Class label', axis=1)

# Predict the labels for the new data
y_new_pred = model.predict(X_new)

# Interpret the predictions
for i, prediction in enumerate(y_new_pred):
    if prediction == 2:
        print(f"Instance {i+1}: Attack")
    elif prediction == 1:
        print(f"Instance {i+1}: Normal")
    else:
        print(f"Instance {i+1}: Unknown class ({prediction})")

Instance 1: Attack
Instance 2: Normal
Instance 3: Normal
Instance 4: Normal
Instance 5: Normal
Instance 6: Normal
Instance 7: Normal
Instance 8: Normal
Instance 9: Normal
Instance 10: Normal
Instance 11: Normal
Instance 12: Normal
Instance 13: Attack
Instance 14: Normal
Instance 15: Normal
Instance 16: Normal
Instance 17: Normal
Instance 18: Normal
Instance 19: Normal
Instance 20: Normal
Instance 21: Normal
Instance 22: Normal
Instance 23: Normal
Instance 24: Attack
Instance 25: Normal
Instance 26: Normal
Instance 27: Normal
Instance 28: Normal
Instance 29: Normal
Instance 30: Normal
Instance 31: Normal
Instance 32: Normal
Instance 33: Normal
Instance 34: Normal
Instance 35: Normal
Instance 36: Normal
Instance 37: Normal
Instance 38: Normal
Instance 39: Normal
Instance 40: Normal
Instance 41: Normal
Instance 42: Normal
Instance 43: Normal
Instance 44: Normal
Instance 45: Normal
Instance 46: Normal
Instance 47: Normal
Instance 48: Attack
Instance 49: Normal
Instance 50: Normal
Instance 

Model creation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

In [None]:
def run_randomForest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy : ')
    print(accuracy_score(y_test, y_pred))

In [None]:
X_train_mi = k_percentile_features.transform(X_train)
X_test_mi = k_percentile_features.transform(X_test)

In [None]:
X_train_mi.shape

(455, 4)

In [None]:
%%time
run_randomForest(X_train_mi, X_test_mi, y_train, y_test)

Accuracy : 
0.958974358974359
CPU times: user 201 ms, sys: 6.74 ms, total: 208 ms
Wall time: 223 ms


In [None]:
(0.634-0.384)*100/0.634

39.43217665615142

In [None]:
model1 = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

In [None]:
model1.fit(X_train, y_train)

In [None]:
# Make predictions for new data
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/new data.csv')  # Replace 'new_data.csv' with your new data file

new_data = new_data.drop('Time slot', axis=1)

# Assuming 'new_data' has the same format as your training data (same features, same preprocessing)
# You can preprocess the new data similarly to the training data

# Separate the features from the new data
X_new = new_data.drop('Class label', axis=1)

# Predict the labels for the new data
y_new_pred = model1.predict(X_new)

# Interpret the predictions
for i, prediction in enumerate(y_new_pred):
    if prediction == 2:
        print(f"Instance {i+1}: Attack")
    elif prediction == 1:
        print(f"Instance {i+1}: Normal")
    else:
        print(f"Instance {i+1}: Unknown class ({prediction})")

Instance 1: Normal
Instance 2: Normal
Instance 3: Attack
Instance 4: Normal
Instance 5: Attack
Instance 6: Normal
Instance 7: Attack
Instance 8: Normal
Instance 9: Normal
Instance 10: Normal
Instance 11: Normal
Instance 12: Attack
Instance 13: Attack
Instance 14: Normal
Instance 15: Normal
Instance 16: Normal
Instance 17: Normal
Instance 18: Normal
Instance 19: Normal
Instance 20: Normal
Instance 21: Normal
Instance 22: Normal
Instance 23: Normal
Instance 24: Normal
Instance 25: Normal
Instance 26: Normal
Instance 27: Attack
Instance 28: Attack
Instance 29: Attack
Instance 30: Normal
Instance 31: Attack
Instance 32: Normal
Instance 33: Normal
Instance 34: Attack
Instance 35: Normal
Instance 36: Normal
Instance 37: Normal
Instance 38: Normal
Instance 39: Normal
Instance 40: Normal
Instance 41: Normal
Instance 42: Normal
Instance 43: Normal
Instance 44: Normal
Instance 45: Normal
Instance 46: Normal
Instance 47: Normal
Instance 48: Normal
Instance 49: Normal
Instance 50: Normal
Instance 