In [1]:
%%time
import time

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None
import logging
from sklearn.base import clone
import category_encoders as ce
import importlib
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from IPython.display import display, HTML
import random
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.stats import beta
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

random_state= 50


CPU times: user 2.39 s, sys: 7.82 s, total: 10.2 s
Wall time: 521 ms


In [2]:
#global constants 
n_splits = 5

In [3]:
def get_types(X):
    categorical = []
    ordinal = []
    ordinal = X.select_dtypes('number').columns.to_list()
    categorical = set(X.columns.to_list()) - set(ordinal)
    return categorical,ordinal


def normalize(X1):
    result = X1.copy()
    for feature_name in X1.columns:
        max_value = X1[feature_name].max()
        min_value = X1[feature_name].min()
        result[feature_name] = X1[feature_name] / (max_value - min_value)
    return result


def preprocess_data(X_train, X_test, y_train, kappa, ordinal_columns):
    X_train, X_test = handle_categorical(X_train, X_test, y_train, ordinal_columns)
    X_train, X_test = handle_ordinal(X_train, X_test, y_train, kappa, ordinal_columns)
    return X_train, X_test


def handle_categorical(X_train, X_test, y_train, ordinal_columns):
    means = {}

    for col in X_train.columns:
        if col not in ordinal_columns:  # Categorical variable
            unique_values_train = X_train[col].unique()
            unique_values_test = X_test[col].unique()

            for val in unique_values_train:
                mean = y_train[X_train[col] == val].mean()
                means[(col, val)] = mean
#                 print(f"Column: {col}, Value: {val}, Mean: {mean}")

            for val in unique_values_test:
                if val not in unique_values_train:
                    print(f"Warning: Value {val} in column {col} of X_test is not in X_train.")

    for col, val in means:
        X_train.loc[X_train[col] == val, col] = means[(col, val)]
        X_test.loc[X_test[col] == val, col] = means[(col, val)]

    return X_train, X_test


import numpy as np

def handle_ordinal(X_train, X_test, y_train, kappa, ordinal_columns):
    missing_values = {}

    for col in ordinal_columns:
        unique_values_train = X_train[col].unique()
        unique_values_test = X_test[col].unique()

        y_train_np = y_train.to_numpy()
        X_train_col_np = X_train[col].to_numpy()

        for val in unique_values_test:
            X_test_col_val = val
            distances = np.abs(X_test_col_val - X_train_col_np)
            weights = 1 / ((1 + distances) ** kappa)
            l = np.sum(y_train_np * weights)
            v = np.sum(weights)
            imputed_value = l / v
            missing_values[(col, val)] = imputed_value

        for val in set(unique_values_train) - set(unique_values_test):
            X_train_col_val = val
            distances = np.abs(X_train_col_val - X_train_col_np)
            weights = 1 / ((1 + distances) ** kappa)
            l = np.sum(y_train_np * weights)
            v = np.sum(weights)
            imputed_value = l / v
            missing_values[(col, val)] = imputed_value

    for (col, val), imputed_value in missing_values.items():
        X_train.loc[X_train[col] == val, col] = imputed_value
        X_test.loc[X_test[col] == val, col] = imputed_value



    return X_train, X_test




def prop_reg(X_test,X_train,y_train,y_test,kappa):


    d = np.linalg.norm(X_test[:, None] - X_train, axis=2)
    weights = 1 / (1 + d) ** kappa
    c = np.sum(y_train * weights, axis=1) / np.sum(weights, axis=1)

    return(c)


def kfold_cv(X, y, ordinal_columns, kappa_values_preprocessing, kappa_values_prop_reg, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    results = {}
    mae_values_per_kappa = {}

    for kappa_preprocessing in kappa_values_preprocessing:
        for kappa_prop_reg in kappa_values_prop_reg:
           # kappa_prop_reg = kappa_preprocessing #short curcuit kapper_preprocess
            mae_values = []

            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                X_train_preprocessed, X_test_preprocessed = preprocess_data(X_train, X_test, y_train, kappa_preprocessing, ordinal_columns)
                y_pred = prop_reg(X_test_preprocessed.values, X_train_preprocessed.values, y_train.values, y_test.values, kappa_prop_reg)
                mae = mean_absolute_error(y_test, y_pred)
                mae_values.append(mae)

            results[(kappa_preprocessing, kappa_prop_reg)] = np.mean(mae_values)
            mae_values_per_kappa[(kappa_preprocessing, kappa_prop_reg)] = mae_values

    return results, mae_values_per_kappa

def plots(kappa_values_prop_reg,kappa_values_preprocessing,mae_values_per_kappa):
# Plot the MAE vs kappa curve for preprocessing
    plt.figure(figsize=(10, 5))
    for kappa_prop_reg in kappa_values_prop_reg:
        mae_values = [np.mean(mae_values_per_kappa[(kappa_preprocessing, kappa_prop_reg)]) for kappa_preprocessing in kappa_values_preprocessing]
        plt.plot(kappa_values_preprocessing, mae_values, '-o', label=f"Prop_reg kappa: {kappa_prop_reg}")

    plt.xlabel('Preprocessing kappa')
    plt.ylabel('MAE')
    plt.title('MAE vs Preprocessing kappa')
    plt.legend()
    plt.show()

    # Plot the MAE vs kappa curve for prop_reg
    plt.figure(figsize=(10, 5))
    for kappa_preprocessing in kappa_values_preprocessing:
        mae_values = [np.mean(mae_values_per_kappa[(kappa_preprocessing, kappa_prop_reg)]) for kappa_prop_reg in kappa_values_prop_reg]
        plt.plot(kappa_values_prop_reg, mae_values, '-o', label=f"Preprocessing kappa: {kappa_preprocessing}")

    plt.xlabel('Prop_reg kappa')
    plt.ylabel('MAE')
    plt.title('MAE vs Prop_reg kappa')
    plt.legend()
    plt.show()
    return

    
def optimal_kappa(cv_results):
    # Find the optimal combination of kappa values

    '''
    {(kappa_inner,kapper_outer : score),}
    '''
    optimal_kappa_preprocessing, optimal_kappa_prop_reg = min(cv_results, key=cv_results.get)
    optimal_mae = cv_results[(optimal_kappa_preprocessing, optimal_kappa_prop_reg)]

    print(f"Optimal Preprocessing kappa: {optimal_kappa_preprocessing}")
    print(f"Optimal Prop_reg kappa: {optimal_kappa_prop_reg}")
    print(f"Minimum MAE: {optimal_mae}")
    print('completed')
    return optimal_kappa_preprocessing,optimal_kappa_prop_reg,optimal_mae


def optimal_kappa_early_stopping(cv_results):
    # Find the optimal combination of kappa values using early stopping
    '''
    {(kappa_inner,kapper_outer : score),}
    '''
    early_stopping = []
    optimal_kappa_preprocessing, optimal_kappa_prop_reg = min(cv_results, key=cv_results.get)
    optimal_mae = cv_results[(optimal_kappa_preprocessing, optimal_kappa_prop_reg)]
    min_score = 0
    last_item = 100000
    for item in cv_results.items():
        if item[0][0] == optimal_kappa_preprocessing:
            if (last_item - item[1]) < 0.1:
                    optimal_kappa_prop_reg =item[0][1]
                    break
            last_item = item[1]
    print(f"Optimal Preprocessing kappa: {optimal_kappa_preprocessing}")
    print(f"Optimal Prop_reg kappa: {optimal_kappa_prop_reg}")
    print(f"Minimum MAE: {optimal_mae}")
    print('completed')
    return optimal_kappa_preprocessing,optimal_kappa_prop_reg,optimal_mae



def find_kappa(X, y, ordinal_columns, kappa_values_preprocessing, kappa_values_prop_reg, n_splits=10):
    cv_results, mae_values_per_kappa = kfold_cv(X, y, ordinal_columns, kappa_values_preprocessing, kappa_values_prop_reg, n_splits=n_splits)
    a,b,c = optimal_kappa(cv_results)
    plots(kappa_values_prop_reg,kappa_values_preprocessing,mae_values_per_kappa)
    return a,b,c


    
def remove_outliers_zscore(df, threshold=3):

    z_scores = np.abs((df - df.mean()) / df.std())
    outlier_rows = z_scores.apply(lambda row: any(row > threshold), axis=1)
    cleaned_df = df[~outlier_rows]

    return cleaned_df



def pearson_second_skewness(df, column_name):
    column = df[column_name]
    skewness = skew(column)
    n = len(column)
    skewness_p2 = (3 * skewness * (n - 1)**0.5) / (n - 2)
    return skewness_p2

def label_encode_columns(data, columns):
   

    for column_name in columns:
        le = LabelEncoder()
        data[column_name] = le.fit_transform(data[column_name])
        

    return data

In [4]:
'''
===============config================
'''
data = pd.read_csv('energy_data.csv')
data = data.dropna()
drop_data = [] # use this to drop any values
drop_columns = ['Y2'] #use this to drop columns
target_column = 'Y1'
prescaling = True


'''
===============cleaning================
'''

for drop in drop_data:
    data = data[~(data == drop).any(axis=1)]
    
if drop_columns != []: 
    data.drop(drop_columns, axis=1, inplace=True)
    
if target_column == -1:
    y = data.iloc[:,-1]
else:
    y = data[target_column]

'''
===============modeling================
'''

X = data.drop(y.name, axis = 1)
categorical_columns, ordinal_columns = get_types(X)

X = label_encode_columns(X, categorical_columns)
print(X.head())


     X1     X2     X3      X4   X5  X6   X7  X8
0  0.98  514.5  294.0  110.25  7.0   2  0.0   0
1  0.98  514.5  294.0  110.25  7.0   3  0.0   0
2  0.98  514.5  294.0  110.25  7.0   4  0.0   0
3  0.98  514.5  294.0  110.25  7.0   5  0.0   0
4  0.90  563.5  318.5  122.50  7.0   2  0.0   0


In [5]:
n_splits=5
kf = KFold(n_splits=n_splits, random_state=0, shuffle=True)

In [6]:
models = {
    'Random Forest': RandomForestRegressor(random_state=50),
    'Decision Tree': DecisionTreeRegressor(random_state=50),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=10),
    'XGBoost': xgb.XGBRegressor(random_state=50),
    'Multiple Linear Regression': LinearRegression(),
    'SVM': SVR()
}

kf = KFold(n_splits=n_splits) 

for model_name, model in models.items():
    mae_list = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae_list.append(mean_absolute_error(y_test, y_pred))
    
    print(f"{model_name} kf MAE:", np.mean(mae_list))


Random Forest kf MAE: 0.7901079416008827
Decision Tree kf MAE: 0.8200913504795858
K-Nearest Neighbors kf MAE: 2.995339653679654
XGBoost kf MAE: 0.7181546597808379
Multiple Linear Regression kf MAE: 2.281574055976061
SVM kf MAE: 4.167284237372681


In [7]:
models = {
    'Random Forest': RandomForestRegressor(random_state=50),
    'Decision Tree': DecisionTreeRegressor(random_state=50),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=10),
    'XGBoost': xgb.XGBRegressor(random_state=50),
    'Multiple Linear Regression': LinearRegression(),
    'SVM': SVR()
}

kf = KFold(n_splits=n_splits)

scaler = MinMaxScaler()

for model_name, model in models.items():
    mae_list = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        
        X_train_normalized = scaler.fit_transform(X_train)

        X_test_normalized = scaler.transform(X_test)

        model.fit(X_train_normalized, y_train)
        y_pred = model.predict(X_test_normalized)

        mae_list.append(mean_absolute_error(y_test, y_pred))
    
    print(f"{model_name} kf MAE:", np.mean(mae_list))







Random Forest kf MAE: 0.7833948238689413
Decision Tree kf MAE: 0.816221220609456
K-Nearest Neighbors kf MAE: 2.28728253968254
XGBoost kf MAE: 0.7181546597808379
Multiple Linear Regression kf MAE: 2.9784328831999347
SVM kf MAE: 2.2886252254775563


In [8]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


pca = PCA()
pca.fit(X_scaled)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
num_components = np.where(cumulative_variance >= 0.95)[0][0] + 1  

print(f"Number of components that explain 95% variance: {num_components}")

Number of components that explain 95% variance: 5


In [9]:
models = {
    'Random Forest': RandomForestRegressor(random_state=50),
    'Decision Tree': DecisionTreeRegressor(random_state=50),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=10),
    'XGBoost': xgb.XGBRegressor(random_state=50),
    'Multiple Linear Regression': LinearRegression(),
    'SVM': SVR()
}

kf = KFold(n_splits=5) 



pca = PCA(n_components=5)

# Apply PCA on the scaled data
X_pca = pca.fit_transform(X_scaled)
for model_name, model in models.items():
    mae_list = []
    for train_index, test_index in kf.split(X_pca):
        X_train, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae_list.append(mean_absolute_error(y_test, y_pred))
    
    print(f"{model_name} kf MAE:", np.mean(mae_list))


Random Forest kf MAE: 2.1798008904167725
Decision Tree kf MAE: 2.722109175791529
K-Nearest Neighbors kf MAE: 2.1340282998047706
XGBoost kf MAE: 1.918178590916782
Multiple Linear Regression kf MAE: 2.7105793896339967
SVM kf MAE: 2.333242799936622


In [None]:
kappa_values = np.arange(0,10,0.05)  
k_values = range(1,15,1)


n_splits=5
kf = KFold(n_splits=n_splits, random_state=0, shuffle=True)

best_mae = float('inf')
best_k = None
best_kappa = None

for kappa in kappa_values:
    for k in k_values:
        mae_list = []
        
        for train_index, val_index in kf.split(X):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
            
            X_train_fold, X_val_fold = preprocess_data(X_train_fold, X_val_fold, y_train_fold, kappa, ordinal_columns)
            #print("Preprocessed X_train_fold:\n", X_train_fold.head())
            #print("\nPreprocessed X_val_fold:\n", X_val_fold.head())
            #print("NaN values in X_train_fold:\n", X_train_fold.isnull().sum())
            #print("\nNaN values in X_val_fold:\n", X_val_fold.isnull().sum())

            
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
            
            mae_list.append(mean_absolute_error(y_val_fold, y_pred))
        
        avg_mae = np.mean(mae_list)
        if avg_mae < best_mae:
            best_mae = avg_mae
            best_k = k
            best_kappa = kappa

print(f"Best Average MAE: {best_mae}, Best k: {best_k}, Best kappa: {best_kappa}")


In [None]:


models = [
    (RandomForestRegressor(), "Random Forest"),
    (DecisionTreeRegressor(), "Decision Tree"),
    (SVR(kernel='linear'), "SVM"),
    (LinearRegression(), "Linear Regression"),
    (xgb.XGBRegressor(), "XGBoost")
]

kappa_values = np.arange(0, 10, 0.05)

kf = KFold(n_splits=n_splits, random_state=0, shuffle=True)

for model, model_name in models:
    best_mae = float('inf')
    best_kappa = None
    
    for kappa in kappa_values:
        mae_list = []
        
        for train_index, val_index in kf.split(X):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
            
            X_train_fold, X_val_fold = preprocess_data(X_train_fold, X_val_fold, y_train_fold, kappa, ordinal_columns)
            
            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
            
            mae_list.append(mean_absolute_error(y_val_fold, y_pred))
        
        avg_mae = np.mean(mae_list)
        if avg_mae < best_mae:
            best_mae = avg_mae
            best_kappa = kappa

    print(f"{model_name} - Best Average MAE: {best_mae}, Best kappa: {best_kappa}")
