In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read both CSV files in a fixed order: the table that carries the 'class'
# column used for training, then the second table that later cells pass to
# `predict` (presumably unlabeled -- confirm against the data files).
database, database_test = (
    pd.read_csv(csv_path)
    for csv_path in ('electricity_train.csv', 'electricity_reserved.csv')
)

In [None]:
# Quick data-quality summary, computed first and printed afterwards.
# `missing_values`: number of null entries per column.
# `class_distribution`: share of rows per value of the 'class' column
# (normalize=True yields fractions instead of raw counts).
missing_values = database.isna().sum()
class_distribution = database['class'].value_counts(normalize=True)

print(missing_values)

print(class_distribution)

In [None]:
# Pairwise correlation heatmap.
# Restrict to numeric/bool columns explicitly before calling corr():
# pandas >= 2.0 no longer drops non-numeric columns silently and raises if any
# column (e.g. a string-valued label) cannot be converted to float. If every
# column is already numeric this selects the whole frame and changes nothing.
corr_matrix = database.select_dtypes(include=['number', 'bool']).corr()

# Explicit figure/axes objects so the heatmap is drawn on a known Axes and
# the figure is self-describing when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix of numeric columns')
plt.show()

In [None]:
# One histogram per column that DataFrame.hist can plot, 30 bins each,
# laid out on a single 15x10 inch figure.
database.hist(
    figsize=(15, 10),
    bins=30,
)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Target is the 'class' column; every other column is used as a feature.
y = database['class']
X = database.drop(columns='class')

# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)


In [None]:
# Baseline random forest with default hyper-parameters (only the seed is set).
# `fit` returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(random_state=11).fit(X_train, y_train)

# NOTE: predictions are made on the training split, so the numbers below
# describe fit quality on seen data, not generalisation.
y_pred = model.predict(X_train)

train_report = classification_report(y_train, y_pred, digits=3)
print(train_report)

# The dict form of the same report exposes overall accuracy under 'accuracy'.
train_report_dict = classification_report(y_train, y_pred, output_dict=True)
accuracy = train_report_dict['accuracy']
print(accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates explored by the grid search
# (3 x 4 x 3 combinations, each cross-validated with cv=3).
params_grid = {
    'n_estimators': [100, 300, 500],
    'max_leaf_nodes': [6, 7, 8, 9],
    'min_samples_leaf': [1, 2, 3],
}

# Fixed (non-searched) forest settings shared by every candidate.
base_forest = RandomForestClassifier(
    bootstrap=False,
    class_weight='balanced',
    n_jobs=-1,
    max_features='sqrt',
    random_state=11,
)

grid_search = GridSearchCV(
    base_forest,
    param_grid=params_grid,
    verbose=1,
    cv=3,
)
grid_search.fit(X_train, y_train)

# Parameter combination selected by the search.
best_params = grid_search.best_params_
print(best_params)

In [None]:
# Print each selected hyper-parameter on its own line, in a fixed order.
for param_name in ('max_leaf_nodes', 'min_samples_leaf', 'n_estimators'):
    print(best_params[param_name])

In [None]:
# Evaluate the estimator selected by the grid search on both splits.
#
# Train and test results are kept under distinct names. Previously the test
# report was stored in `train_report_best` and the train accuracy was silently
# overwritten by the test accuracy without ever being displayed.
# `y_pred_best` and `accuracy_best` still end up holding the test-split values,
# as before; `train_report_best` now really holds the training report.
best_model = grid_search.best_estimator_

# --- Training split (fit quality on data the model has seen) ---
y_pred_best_train = best_model.predict(X_train)

train_report_best = classification_report(y_train, y_pred_best_train, digits=3)
print(train_report_best)

accuracy_best_train = classification_report(
    y_train, y_pred_best_train, output_dict=True
)['accuracy']
print(accuracy_best_train)

# --- Hold-out split (the generalisation estimate) ---
y_pred_best = best_model.predict(X_test)

test_report_best = classification_report(y_test, y_pred_best, digits=3)
print(test_report_best)

accuracy_best = classification_report(
    y_test, y_pred_best, output_dict=True
)['accuracy']
print(accuracy_best)

In [None]:
# Name of the feature with the largest importance score in the tuned forest.
# Column labels come from X, which has the same columns as X_train.
feature_names = X.columns
importances = best_model.feature_importances_

top_feature_position = importances.argmax()
most_important_feature = feature_names[top_feature_position]
print(most_important_feature)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier


# Three base classifiers, each seeded for reproducibility.
log_clf = LogisticRegression(solver='liblinear', random_state=11)
svc_clf = SVC(random_state=11)
sgd_clf = SGDClassifier(random_state=11)

# Combine them with voting='hard' (votes are cast on predicted labels).
voting_members = [
    ('lr', log_clf),
    ('svc', svc_clf),
    ('sgd', sgd_clf),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')
voting_clf.fit(X_train, y_train)

# Score the ensemble on the hold-out split.
y_pred = voting_clf.predict(X_test)

test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

voting_report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy = voting_report_dict['accuracy']
print(accuracy)

# Predicted labels for every row of the reserved table, shown as a plain list.
voting_reserved_pred = voting_clf.predict(database_test)
print(list(voting_reserved_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

# Bagging ensemble over class-weighted decision trees.
# Each member is fit on half of the rows and half of the columns
# (max_samples=0.5, max_features=0.5); bootstrap=False requests row sampling
# without replacement.
dt_clf = DecisionTreeClassifier(class_weight='balanced', random_state=11)

bagging_settings = {
    'estimator': dt_clf,
    'max_samples': 0.5,
    'max_features': 0.5,
    'bootstrap': False,
    'random_state': 11,
}
bagging_clf = BaggingClassifier(**bagging_settings)
bagging_clf.fit(X_train, y_train)

# Score the ensemble on the hold-out split.
y_pred = bagging_clf.predict(X_test)

test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

bagging_report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy = bagging_report_dict['accuracy']
print(accuracy)

# Predicted labels for every row of the reserved table, shown as a plain list.
bagging_reserved_pred = bagging_clf.predict(database_test)
print(list(bagging_reserved_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient boosting with 500 shallow trees (max_depth=2) and learning_rate=0.8.
gb_clf = GradientBoostingClassifier(
    max_depth=2,
    learning_rate=0.8,
    n_estimators=500,
    random_state=11,
)
gb_clf.fit(X_train, y_train)

# Score the model on the hold-out split.
y_pred = gb_clf.predict(X_test)

test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

gb_report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy_test = gb_report_dict['accuracy']
print(accuracy_test)

# Predicted labels for every row of the reserved table, shown as a plain list.
gb_reserved_pred = gb_clf.predict(database_test)
print(list(gb_reserved_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# AdaBoost over depth-3, class-weighted decision trees
# (300 boosting rounds, learning_rate=0.5).
dt_clf = DecisionTreeClassifier(
    max_depth=3,
    class_weight='balanced',
    random_state=11,
)

ada_clf = AdaBoostClassifier(
    estimator=dt_clf,
    learning_rate=0.5,
    n_estimators=300,
    random_state=11,
)
ada_clf.fit(X_train, y_train)

# Score the model on the hold-out split.
y_pred = ada_clf.predict(X_test)

test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

ada_report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy_test = ada_report_dict['accuracy']
print(accuracy_test)

# Predicted labels for every row of the reserved table, shown as a plain list.
ada_reserved_pred = ada_clf.predict(database_test)
print(list(ada_reserved_pred))

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Stacked ensemble: a random forest and an SVC feed a logistic-regression
# final estimator. All three are seeded for reproducibility.
rf_clf = RandomForestClassifier(random_state=11)
svc_clf = SVC(random_state=11)

stacking_members = [('rf', rf_clf), ('svc', svc_clf)]
meta_clf = LogisticRegression(random_state=11)

stacking_clf = StackingClassifier(
    estimators=stacking_members,
    final_estimator=meta_clf,
)
stacking_clf.fit(X_train, y_train)

# Score the ensemble on the hold-out split.
y_pred = stacking_clf.predict(X_test)

test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

stacking_report_dict = classification_report(y_test, y_pred, output_dict=True)
accuracy_test = stacking_report_dict['accuracy']
print(accuracy_test)

# Predicted labels for every row of the reserved table, shown as a plain list.
stacking_reserved_pred = stacking_clf.predict(database_test)
print(list(stacking_reserved_pred))