In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn import preprocessing
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(0)
import sklearn.model_selection
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Apply matplotlib's 'ggplot' style sheet to the figures drawn from here on.
plt.style.use('ggplot')

In [None]:
# Load the two CSV files of the dataset: power usage into df, weather into df1.
power_csv = '/kaggle/input/residential-power-usage-3years-data-timeseries/power_usage_2016_to_2020.csv'
weather_csv = '/kaggle/input/residential-power-usage-3years-data-timeseries/weather_2016_2020_daily.csv'
df, df1 = (pd.read_csv(path) for path in (power_csv, weather_csv))

In [None]:
# Preview the first rows of the power-usage frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the power-usage frame.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Convert StartDate to pandas datetimes so the .dt accessor used in the next cell works.
df['StartDate'] = pd.to_datetime(df['StartDate'])

In [None]:
# Derive calendar features from the StartDate timestamp.
start = df['StartDate']
df['year'] = start.dt.year
df['month'] = start.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is its replacement. isocalendar() returns a nullable
# UInt32 column, so cast back to int64 to keep the dtype the old accessor produced.
df['week'] = start.dt.isocalendar().week.astype('int64')
df['day'] = start.dt.day
df['hour'] = start.dt.hour
#df['minute'] = df['StartDate'].dt.minute
#df['seconds']= df['StartDate'].dt.second

In [None]:
# The raw timestamp column is no longer needed once the calendar features exist.
df = df.drop(columns=['StartDate'])

In [None]:
# Target is the 'notes' column; the features are every other column of df.
y = df['notes']
X = df.drop(columns='notes')

In [None]:
# Peek at the first rows of the features and of the target (displayed together as a tuple).
X.head(),y.head()

### Label Encoding of Target Variable

In [None]:
# Feature matrix and target rebuilt from df (the same split as in the earlier cell).
X, y = df.drop(columns='notes'), df['notes']

In [None]:
from sklearn import preprocessing

# Map the labels in y to integer codes; LabelEncoder numbers the classes in sorted order.
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(y)
y_true = pd.Series(encoded_labels)

In [None]:
# Relative frequency of each label in the 'notes' column.
df['notes'].value_counts(normalize=True)

In [None]:
# Tally the encoded labels and report each class's count and its share of all samples.
counter = Counter(y_true)
n_samples = len(y_true)
for label, count in counter.items():
    share = count / n_samples * 100
    print('Class=%d, n=%d (%.3f%%)' % (label, count, share))

# Bar chart of the number of samples per class.
plt.bar(counter.keys(), counter.values())
plt.show()

From the plot above, it is clear that the data is imbalanced.

### Random Forest Classification

Random Forest is an ensemble of decision trees. A single decision tree is very sensitive to variations in the data and can easily overfit to noise. A Random Forest with only one tree will overfit as well, because it is the same as a single decision tree. As we add trees to the Random Forest, the tendency to overfit should decrease (thanks to bagging and random feature selection). However, the generalization error will not go to zero: the variance of the generalization error approaches zero as more trees are added, but the bias does not. This is a useful property, because it tells us that the more trees in the RF, the better.

In [None]:
# Hold out 40% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratify=y keeps the class proportions equal in both parts.
split_options = dict(test_size=0.4, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, **split_options
)

There are two important things I want to point out in the code above. The first is that I set a random_state; this ensures that if I have to rerun my code, I’ll get the exact same train-test split, so my results won’t change.
The second is stratify=y. This tells train_test_split to make sure that the training and test datasets contain examples of each class in the same proportions as in the original dataset.

In [None]:
# Relative class frequencies in the training target, to check the stratified split.
y_train.value_counts(normalize=True)

#### Encoding the labels

In [None]:
# Fit the encoder on the training labels only and reuse that mapping for the test labels.
# Re-fitting on y_test would build an independent mapping: if the test split ever lacked
# a class, the integer codes would shift and no longer match the model's predictions.
le = preprocessing.LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train))
y_test = pd.Series(le.transform(y_test))

#### Evaluate Model with cross validation

In [None]:
def evaluate_model(X, y, model):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold cross-validation.

    The procedure uses 5 splits repeated 3 times (``random_state=1``), accuracy as
    the metric, and ``n_jobs=-1`` for the cross-validation run.

    Returns whatever ``cross_val_score`` returns: one accuracy score per fold.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

In [None]:
# Baseline: a random forest with default settings and no class weighting.
model = RandomForestClassifier()
# Cross-validated accuracy on the training split.
scores = evaluate_model(X_train, y_train, model)
# Report the mean and the standard deviation over all folds.
mean_acc, std_acc = np.mean(scores), np.std(scores)
print('Mean Accuracy: %.3f (%.3f)' % (mean_acc, std_acc))

In [None]:
# Fit on the whole training split, then predict labels for the held-out test split.
y_pred_test = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Accuracy of the unweighted baseline model on the held-out test split.
accuracy_score(y_test, y_pred_test)

The accuracy from cross-validation and the accuracy on the test set are the same. This model has an accuracy score of 99.95% on the test data. That seems pretty impressive, but remember that accuracy is not a great measure of classifier performance when the classes are imbalanced.

### Confusion matrix

In [None]:
# Raw confusion matrix (counts) of the true test labels against the predicted labels.
confusion_matrix(y_test, y_pred_test)

This confusion matrix would be a lot easier to read if it had some labels and even a color scale to help us spot the biggest and smallest values

In [None]:
# LabelEncoder assigns integer codes in sorted label order, so the tick labels are taken
# from the fitted encoder. A hand-written list in any other order would attach the
# wrong class name to each row and column.
class_names = list(le.classes_)

# Request one row/column per encoded class so the matrix always lines up with class_names,
# then normalise each row so it shows how one true class was spread over the predictions.
matrix = confusion_matrix(y_test, y_pred_test, labels=np.arange(len(class_names)))
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot. Passing the labels to heatmap centres them on the cells.
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names)

# Rotate the tick labels for readability and add axis labels and a title.
plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

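Now it’s easy to see that our classifier struggled at predicting the weekend label. (Note that LabelEncoder numbers the classes in sorted order, so make sure the tick labels follow that order before reading a class name off the plot.)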

### Classification report

In [None]:
# Text summary of the per-class metrics for the test-set predictions.
report = classification_report(y_test, y_pred_test)
print(report)

### Balancing Data for Classification

The purpose is to avoid SMOTE.
SMOTE does not work very well on high-dimensional data.
The synthetic samples can make classes overlap and introduce more noise into the data.
To avoid this problem, we can assign weights to the classes manually with the ‘class_weight’ parameter.
The approaches below all do this.

### Sklearn Utils

We can use sklearn to compute the class weights. Giving the minority classes larger weights while training the model can improve its performance on those classes.

In [None]:
# Import the function itself rather than the `class_weight` module: the result below is
# stored under the name `class_weight` (later cells read it), and rebinding a module
# imported under that same name makes the cell fail when it is run a second time.
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' gives each class the weight n_samples / (n_classes * class_count), so rarer
# classes weigh more. `classes` and `y` must be passed by keyword in current scikit-learn.
# The returned weights follow the order of `classes`, i.e. the sorted encoded class ids.
class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

In [None]:
# Inspect the computed per-class weights.
class_weight

In [None]:
# Pair each encoded class id with its weight to get the {class: weight} mapping
# that is passed to RandomForestClassifier(class_weight=...) in the next cell.
class_list = [0,1,2,3]
a_dictionary = {label: w for label, w in zip(class_list, class_weight)}
print(a_dictionary)

In [None]:
# Random forest that uses the {class: weight} mapping held in a_dictionary.
model = RandomForestClassifier(class_weight=a_dictionary)
scores = evaluate_model(X_train, y_train, model)
# Mean and spread of the cross-validated accuracy on the training split.
print('Mean Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:
# Train the weighted model on the full training split and predict the test split.
fitted_model = model.fit(X_train, y_train)
y_pred_test = fitted_model.predict(X_test)

In [None]:
# Test-split accuracy of the most recently fitted model (here: trained with class weights).
accuracy_score(y_test, y_pred_test)

### Counts to Length Ratio:

Divide the number of samples in each class by the total number of rows.

In [None]:
# Weight of a class = its share of all samples.
# value_counts() orders its result by frequency, not by class id, so zipping it with a
# fixed [0, 1, 2, 3] list would hand each class another class's weight. Sort by the
# index (the encoded class id) and build the mapping straight from the Series.
# NOTE(review): weights proportional to class frequency favour the majority classes,
# the opposite of the 'balanced' scheme — confirm this is the intended experiment.
weights = (y_true.value_counts() / len(y_true)).sort_index()
class_list = weights.index.tolist()
a_dictionary = weights.to_dict()
print(a_dictionary)

In [None]:
# Random forest weighted by the current contents of a_dictionary.
model = RandomForestClassifier(class_weight=a_dictionary)
# Repeated stratified cross-validation on the training split.
scores = evaluate_model(X_train, y_train, model)
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('Mean Accuracy: %.3f (%.3f)' % (cv_mean, cv_std))

In [None]:
# Refit on the complete training split and label the held-out rows.
y_pred_test = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-split accuracy of the most recently fitted model (here: counts-to-length weights).
accuracy_score(y_test, y_pred_test)

### Smoothen Weights Technique

This is one of the preferred methods of choosing weights.
labels_dict is a dictionary that contains the number of samples in each class.
The log function smooths the weights of the imbalanced classes.

In [None]:
# Log-smoothed class weights: weight = ln(mu * total / count), with a floor of 1.
mu = 0.15
# Number of samples per encoded class.
labels_dict = y_true.value_counts().to_dict()
total = sum(labels_dict.values())
keys = labels_dict.keys()

weight = dict()
for label, count in labels_dict.items():
    score = np.log(mu * total / float(count))
    # Any class whose smoothed score does not exceed 1 gets the neutral weight 1.
    weight[label] = score if score > 1 else 1

weight

In [None]:
# Random forest that uses the log-smoothed weights stored in `weight`.
model = RandomForestClassifier(class_weight=weight)
scores = evaluate_model(X_train, y_train, model)
# Summarise the cross-validated accuracy as mean (standard deviation).
summary = (np.mean(scores), np.std(scores))
print('Mean Accuracy: %.3f (%.3f)' % summary)

In [None]:
# Fit the smoothed-weight model on the training split, then predict the test split.
trained = model.fit(X_train, y_train)
y_pred_test = trained.predict(X_test)

In [None]:
# Test-split accuracy of the most recently fitted model (here: log-smoothed weights).
accuracy_score(y_test, y_pred_test)