In [1]:
# Libraries for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Linear Regression 

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

# Neural Networks

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras import regularizers
from itertools import product


---

<h1><center>Data Pre-Processing

---

In [2]:
# Download data
df = pd.read_csv('ev_battery_charging_data.csv')

In [3]:
# Drop rows with any NaN values
df_clean = df.dropna()

# Shape after cleaning
print("\nAfter removing rows with NaN:")
print(f"Rows: {df_clean.shape[0]}, Columns: {df_clean.shape[1]}")


After removing rows with NaN:
Rows: 1000, Columns: 13


In [4]:
# Drop inputs calculated from the column we are predicting
drop_cols = ['Degradation Rate (%)', 'Efficiency (%)', 'Optimal Charging Duration Class']
df_clean = df_clean.drop(columns=drop_cols)

In [5]:
# List the categorical columns and numerical columns for one-hot encoding later
cat_col = ['Charging Mode', 'Battery Type', 'EV Model']
num_col = ['SOC (%)', 'Voltage (V)', 'Current (A)', 'Battery Temp (°C)', 'Ambient Temp (°C)', 'Charging Cycles']
target = 'Charging Duration (min)'

In [6]:
# Move the column we are predicting to the end
df_clean[target] = df_clean.pop(target)

In [7]:
# Split the data into test and train
from sklearn.model_selection import train_test_split
X = df_clean.drop(columns=target)
y = df_clean[target]
Xtrain_valid, Xtest, ytrain_valid, ytest = train_test_split(X, y, test_size=0.2,random_state=69)

# Split the data into train, validate, test

Xtrain, Xvalid, ytrain, yvalid = train_test_split(Xtrain_valid, ytrain_valid, test_size=0.125,random_state=69)

In [8]:
df_clean.head()

Unnamed: 0,SOC (%),Voltage (V),Current (A),Battery Temp (°C),Ambient Temp (°C),Charging Mode,Battery Type,Charging Cycles,EV Model,Charging Duration (min)
0,43.708611,3.629593,33.553512,33.45406,26.439918,Fast,Li-ion,112,Model B,59.363552
1,95.564288,3.879331,32.228092,35.933628,31.108647,Fast,LiFePO4,398,Model A,67.343566
2,75.879455,4.111062,91.562912,25.009358,30.203219,Slow,LiFePO4,175,Model A,105.454739
3,63.879264,4.012557,32.459158,32.497482,18.077998,Fast,LiFePO4,150,Model B,54.000439
4,24.041678,4.064593,34.475475,31.43492,17.984989,Fast,Li-ion,886,Model C,106.964968


In [9]:
df_clean.describe(include='all')

Unnamed: 0,SOC (%),Voltage (V),Current (A),Battery Temp (°C),Ambient Temp (°C),Charging Mode,Battery Type,Charging Cycles,EV Model,Charging Duration (min)
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000,1000,1000.0,1000,1000.0
unique,,,,,,3,2,,3,
top,,,,,,Slow,LiFePO4,,Model C,
freq,,,,,,341,503,,343,
mean,54.12309,3.854912,55.216515,29.807501,24.882106,,,556.56,,69.846449
std,26.292363,0.204533,26.160678,5.729787,5.736199,,,263.763052,,28.919481
min,10.416882,3.502253,10.001047,20.013068,15.000614,,,101.0,,20.618382
25%,31.237594,3.668752,33.521588,24.838015,19.89948,,,317.75,,44.938
50%,54.712664,3.863114,55.055253,29.685586,24.891962,,,571.0,,69.040282
75%,76.988763,4.032326,78.319317,34.750816,29.799869,,,786.0,,93.985178


In [10]:
# df_clean[target].plot(title='Charge Duration', xlabel='Index', ylabel='Time (min)')


In [11]:
# Preprocessor for numerical columns
num_transformer = Pipeline(
    steps=[("scaler", StandardScaler())])

# Preprocessor for categorical columns
cat_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(drop='first'))])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_col),
        ('cat', cat_transformer, cat_col)])

In [12]:
Xtrain_norm = preprocessor.fit_transform(Xtrain)
Xvalid_norm = preprocessor.transform(Xvalid)
Xtest_norm = preprocessor.transform(Xtest)
feature_names_x = preprocessor.get_feature_names_out()
Xtrain_norm_df = pd.DataFrame(Xtrain_norm, columns=feature_names_x, index=Xtrain.index)
Xvalid_norm_df = pd.DataFrame(Xvalid_norm, columns=feature_names_x, index=Xvalid.index)
Xtest_norm_df = pd.DataFrame(Xtest_norm, columns=feature_names_x, index=Xtest.index)

ytrain_df = pd.DataFrame(ytrain)
yvalid_df = pd.DataFrame(yvalid)
ytest_df = pd.DataFrame(ytest)

---

<h1><center>Linear Regression Models

---

In [13]:
# Linear regression with feature selection
linreg_pipe = Pipeline([('preprocessor', preprocessor), 
                     ('linreg', LinearRegression())])

# Fit the model
linreg_pipe.fit(Xtrain, ytrain)

0,1,2
,steps,"[('preprocessor', ...), ('linreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
# Linear Regression R2
pred = linreg_pipe.predict(Xtest)
r2_lasso = np.max(r2_score(ytest, pred))
print(r2_lasso)

-0.025478521235293483


In [15]:
# Lasso regression
alphas = np.logspace(-5, 5, 10)
lasso_models = []

for alpha in alphas:
    lasso_pipe = Pipeline([('preprocessor', preprocessor), 
                         ('lasso', Lasso(alpha=alpha))])
    lasso_pipe.fit(Xtrain, ytrain)
    lasso_models.append(lasso_pipe)

In [16]:
# Lasso Regression R2
pred_lasso = []
r2_lasso = []

for i in range(len(alphas)):
    pred_lasso.append(lasso_models[i].predict(Xtest))
    r2_lasso.append(r2_score(ytest, pred_lasso[i]))

r2_lasso_max = max(r2_lasso)
print(r2_lasso_max)
print(np.argmax(r2_lasso))

-0.007678774539534805
5


<h2><center>Logistic Regression Comparison

In [20]:
print("Before running models, note that a random guess would\nresult in about 33% accuracy for 3 classes.")
# 1. Prepare Data for Logistic Regression
# We go back to the original df to get the 'Optimal Charging Duration Class' column
df_log = df.dropna().copy()
target_class = 'Optimal Charging Duration Class'

# Drop columns we don't want as features
drop_cols_log = ['Degradation Rate (%)', 'Efficiency (%)', 'Charging Duration (min)']
X_log = df_log.drop(columns=drop_cols_log + [target_class])
y_log = df_log[target_class]

# Split the data
Xtrain_log, Xtest_log, ytrain_log, ytest_log = train_test_split(X_log, y_log, test_size=0.2, random_state=69)

# 2. Create the Logistic Regression Pipeline
logreg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000)) # Increased max_iter to ensure convergence
])

# 3. Fit the model
logreg_pipe.fit(Xtrain_log, ytrain_log)

# 4. Evaluate
# For classification, we use Accuracy (percentage of correct guesses)
train_score = logreg_pipe.score(Xtrain_log, ytrain_log)
test_score = logreg_pipe.score(Xtest_log, ytest_log)

print(f"\nStandard Logistic - Train: {train_score:.2%} | Test: {test_score:.2%}")

print("\n--- Advanced Logistic Models Comparison ---")

# Model 1: Balanced Logistic Regression
logreg_balanced = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=2000, class_weight='balanced'))
])
logreg_balanced.fit(Xtrain_log, ytrain_log)
train_acc_balanced = logreg_balanced.score(Xtrain_log, ytrain_log)
acc_balanced = logreg_balanced.score(Xtest_log, ytest_log)
print(f"1. Balanced Logistic - Train: {train_acc_balanced:.2%} | Test: {acc_balanced:.2%}")

# Model 2: Polynomial Logistic Regression (Degrees 2-5)
print("\n2. Testing Polynomial Degrees (2-5)...")
degrees_to_test = sorted({2, 3, 4, 5})

for d in degrees_to_test:
    poly_log_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('poly', PolynomialFeatures(degree=d)),
        ('logreg', LogisticRegression(max_iter=2000))
    ])
    
    try:
        poly_log_pipe.fit(Xtrain_log, ytrain_log)
        train_acc = poly_log_pipe.score(Xtrain_log, ytrain_log)
        acc = poly_log_pipe.score(Xtest_log, ytest_log)
        print(f"   Degree {d} - Train: {train_acc:.2%} | Test: {acc:.2%}")
        
    except Exception as e:
        print(f"   Degree {d} Failed: {e}")

Before running models, note that a random guess would
result in about 33% accuracy for 3 classes.

Standard Logistic - Train: 43.25% | Test: 39.00%

--- Advanced Logistic Models Comparison ---
1. Balanced Logistic - Train: 40.38% | Test: 33.50%

2. Testing Polynomial Degrees (2-5)...
   Degree 2 - Train: 49.88% | Test: 41.00%
   Degree 3 - Train: 75.62% | Test: 39.50%
   Degree 4 - Train: 100.00% | Test: 40.00%
   Degree 5 - Train: 100.00% | Test: 42.50%


---

<h1><center>Neural Networks

---

Plan:
1. Build a basic NN to get results
    a. Build pipeline
    b. Build NN model
    c. Compile NN w/ pipeline
    d. Calculate R2 score and compare with linreg
2. Build function to build pipelines 

In [None]:
# Build an MLP

ki = GlorotUniform(seed=2434)

model_mlp = Sequential([
    Dense(64, input_shape=(Xtrain_norm_df.shape[1],), activation='relu', kernel_initializer=ki),
    Dense(32, activation='relu', kernel_initializer=ki),
    Dense(16, activation='relu', kernel_initializer=ki),
    Dense(1, kernel_initializer=ki)
])

model_mlp.compile( optimizer="rmsprop", loss="mse", metrics=["r2_score"] )

In [None]:
# Train the MLP

history_mlp = model_mlp.fit(x=Xtrain_norm_df, y=ytrain_df, epochs=50, validation_data=(Xvalid_norm_df,yvalid_df))

In [None]:
mlp_perf = r2_score(ytrain_df, model_mlp.predict(Xtrain_norm_df))
mlp_perf_valid = r2_score(yvalid_df, model_mlp.predict(Xvalid_norm_df))
print(mlp_perf)
print(mlp_perf_valid)

In [None]:
def build_model(num_layers, lr, x_data, last_reg_layer):

    num_units = np.zeros(num_layers, dtype=int)
    num_units[-1] = 1
    if num_layers > 1:
        for i in range(num_layers-1):
            num_units[i] = 2**(num_layers+1-i)

    layers = []
    ki = GlorotUniform(seed=2434)
    layers.append(Dense(num_units[0], input_shape=(x_data.shape[1],), activation='relu', kernel_initializer=ki))
    for i in range(1, num_layers-1):
        layers.append(Dense(num_units[i], activation='relu', kernel_initializer=ki))
    layers.append(Dense(1, kernel_initializer=ki, kernel_regularizer=last_reg_layer))


    model = Sequential(layers)

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)

    model.compile( optimizer=optimizer, loss="mse", metrics=["r2_score"] )

    return model

In [None]:
param_grid = {
    'num_layers': [2, 3, 4, 5],
    'lr': [1e-3, 1e-2, 1e-1],
    'last_reg_layer': [None, regularizers.l1(1e-2),
                       regularizers.l1(1e-1),
                       regularizers.l2(1e-2), regularizers.l2(1e-1)],
    'epochs': [10, 25, 50]
}


In [None]:
all_combos = list(product(
    param_grid['num_layers'],
    param_grid['lr'],
    param_grid['last_reg_layer'],
    param_grid['epochs']   
))

In [None]:
def reg_to_str(reg):
    if reg is None:
        return "None"
    elif hasattr(reg, 'l1'):
        return f"L1({reg.l1:.3f})"   # rounds to 3 decimals
    elif hasattr(reg, 'l2'):
        return f"L2({reg.l2:.3f})"
    else:
        return str(reg)


In [None]:
results = []

for num_layers, lr, last_reg_layer, epochs in all_combos:

    model = build_model(num_layers, lr, Xtrain_norm_df, last_reg_layer)

    history = model.fit(
        x = Xtrain_norm_df,
        y = ytrain_df,
        epochs=epochs,
        validation_data = (Xvalid_norm_df, yvalid_df),
        verbose=0
    )

    y_hat_valid = model.predict(Xvalid_norm_df)
    perf_valid = r2_score(yvalid_df, y_hat_valid)

    y_hat_train = model.predict(Xtrain_norm_df)
    perf_train = r2_score(ytrain_df, y_hat_train)

    results.append({
    'num_layers': num_layers,
    'lr': lr,
    'last_reg_layer': reg_to_str(last_reg_layer),
    'epochs': epochs,
    'r2_train': perf_train,
    'r2_valid': perf_valid,
    'model': model,
    })

results_df = pd.DataFrame(results)

print(results_df.head())


In [None]:
results_df['last_reg_layer'] = results_df['last_reg_layer'].fillna("None")
results_df

In [None]:
best_model_NN_idx = results_df['r2_valid'].idxmax()
best_model_NN = results_df.loc[best_model_NN_idx]
best_model_NN

In [None]:
best_model_NN_idx_train = results_df['r2_train'].idxmax()
worst_model_NN_idx = results_df['r2_valid'].idxmin()
worst_model_NN_idx_train = results_df['r2_train'].idxmin()

best_model_NN_train = results_df.loc[best_model_NN_idx_train]
worst_model_NN = results_df.loc[worst_model_NN_idx]
worst_model_NN_train = results_df.loc[worst_model_NN_idx_train]

best_model_NN_train

In [None]:
worst_model_NN

In [None]:
worst_model_NN_train

In [None]:
param_names = list(param_grid.keys())
sweeps = []

for i in param_names:
    list_param_values = sorted(results_df[i].unique())
    param_names_copy = [p for p in param_names if p != i]

    sweep = results_df[
        (results_df[param_names_copy[0]] == best_model_NN[param_names_copy[0]]) &
        (results_df[param_names_copy[1]] == best_model_NN[param_names_copy[1]]) &
        (results_df[param_names_copy[2]] == best_model_NN[param_names_copy[2]]) 
    ]
    sweeps.append(sweep)


In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()
bar_width = 0.5
names = ['# of Layers', 'Learning Rate', 'Linear Actication Layer', '# of Epochs']

for idx, sweep in enumerate(sweeps):
    x_vals = sweep[param_names[idx]]
    y_vals = sweep['r2_valid']
    positions = np.arange(len(x_vals))
    
    axes[idx].bar(positions, y_vals, width=bar_width, color='red')
    axes[idx].set_xticks(positions)
    axes[idx].set_xticklabels(x_vals, rotation=45)
    axes[idx].set_title(names[idx])
    axes[idx].set_ylabel("R²")
    axes[idx].set_xlabel(param_names[idx])

plt.subplots_adjust(hspace=0.35)



---

<h1><center>Random Forest

---

In [None]:
results_df.head()

---

<h1><center>Extra Workspace

---