# Project 4

## Retrieve data Using PySpark

In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.0'
spark_version = 'spark-3.5.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [662 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,057 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease

In [2]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles

path = r'/content/drive/Othercomputers/My PC/Data Analytics Bootcamp/23-Project-4-Week-1/0_Project/'
filepath = path + "Project-4/Resources/winequality-red.csv"

print(filepath)

spark.sparkContext.addFile(filepath)
df = spark.read.csv(SparkFiles.get("winequality-red.csv"), header=True, inferSchema=True, quote="\"", escape="\"")

# Show DataFrame
df.show()

/content/drive/Othercomputers/My PC/Data Analytics Bootcamp/23-Project-4-Week-1/0_Project/Project-4/Resources/winequality-red.csv
+-------------+----------------+-----------+--------------+-------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|          chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+-------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|              0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|              0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|           

In [5]:
#Create a temporary view.
df.createOrReplaceTempView('wine_data')

In [6]:
import pandas as pd

pandas_df=spark.sql("""
SELECT * from wine_data
""").toPandas()

# pandas_df.to_csv('top5Gold.csv')

In [7]:
pandas_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Check data and clean if required

In [8]:
pandas_df["quality"].max()

8

In [9]:
pandas_df["quality"].min()

3

In [10]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int32  
dtypes: float64(11), int32(1)
memory usage: 143.8 KB


In [11]:
pandas_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [12]:
pandas_df["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [13]:
pandas_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### Everything looks good and clean. Let's proceed

## Define Target based on quality rating: 7 or above =  good. Otherwise bad.

In [14]:

target = []
for i in pandas_df['quality']:
    if i >= 7:
        target.append('1')
    else:
        target.append('0')
pandas_df['target'] = target

target_names = ['bad quality', 'good quality']

pandas_df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [16]:
pandas_df['target'].unique()

array(['0', '1'], dtype=object)

In [17]:
X = pandas_df.iloc[:,:11]
y = pandas_df['target']

In [18]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [19]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: object

In [20]:
#features
print(X)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [21]:
#target
y.value_counts()


0    1382
1     217
Name: target, dtype: int64

## Preprocessing: Split test and training data & Standard Scaler

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [23]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [24]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [66]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Run Models

## Some imports

In [113]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import tensorflow as tf

!pip install keras-tuner
import keras_tuner as kt


import numpy as np




## Logistic Regression - `solver='lbfgs'`

In [27]:
# instantiate the Logistic Regression model as classifier

classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [28]:
# Fit model using scaled training data

lr_model = classifier.fit(X_train_scaled, y_train)
lr_model


## Create predictded values for test and train

In [29]:
#Generate training predictions
# training_predictions = lr_model.predict(X_train.values)

#Generate testing predictions
testing_predictions = classifier.predict(X_test.values)

## Confusion Matrix for testing data

In [30]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[317  38]
 [ 32  13]]


## Classification report for testing

In [31]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

 bad quality       0.91      0.89      0.90       355
good quality       0.25      0.29      0.27        45

    accuracy                           0.82       400
   macro avg       0.58      0.59      0.59       400
weighted avg       0.83      0.82      0.83       400



## Logistic Regression - `solver='liblinear'`

In [32]:
# instantiate the Logistic Regression model as classifier

classifier = LogisticRegression(solver='liblinear',
                                max_iter=200,
                                random_state=1)
classifier

In [33]:
# Fit model using scaled training data

lr_model = classifier.fit(X_train_scaled, y_train)
lr_model


## Create predictded values for test and train

In [34]:
#Generate training predictions
# training_predictions = lr_model.predict(X_train.values)

#Generate testing predictions
testing_predictions = classifier.predict(X_test.values)

## Confusion Matrix for testing data

In [35]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[317  38]
 [ 31  14]]


## Classification report for testing

In [36]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

 bad quality       0.91      0.89      0.90       355
good quality       0.27      0.31      0.29        45

    accuracy                           0.83       400
   macro avg       0.59      0.60      0.60       400
weighted avg       0.84      0.83      0.83       400



## Logistic Regression with `RandomOverSampler` from imblearn

In [37]:
# Import the RandomOverSampler module form imbalanced-learn

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)
# Fit the original training data to the random_oversampler model
X_res, y_res = ros.fit_resample(X_train, y_train)

In [38]:
# Count the distinct values of the resampled labels data
y_res.value_counts()

0    1027
1    1027
Name: target, dtype: int64

## `LogisticRegression` classifier

In [39]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1,
                                               max_iter=1000)

# Fit the model using the resampled training data
lr_model_res = logistic_regression_model.fit(X_res, y_res)

# Make a prediction using the testing data
predictions_res = lr_model_res.predict(X_test)

## Confusion Matrix for testing data

In [40]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, predictions_res)

# Print the confusion matrix for the testing data
print(test_matrix)

[[274  81]
 [  7  38]]


## Classification report for testing

In [41]:
print(classification_report(y_test, predictions_res, target_names=target_names))


              precision    recall  f1-score   support

 bad quality       0.98      0.77      0.86       355
good quality       0.32      0.84      0.46        45

    accuracy                           0.78       400
   macro avg       0.65      0.81      0.66       400
weighted avg       0.90      0.78      0.82       400



## try different models other than Logistic Regression

## random forest

In [42]:
rf_classifier = RandomForestClassifier()
rf_model = rf_classifier.fit(X_train, y_train)
rf_predict= rf_classifier.predict(X_test)

In [43]:
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_acc_score = accuracy_score(y_test, rf_predict)
print(rf_conf_matrix)
print(rf_acc_score*100)

[[346   9]
 [ 23  22]]
92.0


In [44]:
#Generate training predictions
training_predictions = rf_model.predict(X_train)

#Generate testing predictions
testing_predictions = rf_classifier.predict(X_test)

## Confusion Matrix for testing data

In [45]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[346   9]
 [ 23  22]]


## Classification report for testing

In [46]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

 bad quality       0.94      0.97      0.96       355
good quality       0.71      0.49      0.58        45

    accuracy                           0.92       400
   macro avg       0.82      0.73      0.77       400
weighted avg       0.91      0.92      0.91       400



## SVM classifier

In [47]:

lin_svc_classifier = SVC()
lin_svc_model = lin_svc_classifier.fit(X_train, y_train)
lin_svc_predict = lin_svc_classifier.predict(X_test)

In [48]:
lin_svc_conf_matrix = confusion_matrix(y_test, rf_predict)
lin_svc_acc_score = accuracy_score(y_test, rf_predict)
print(lin_svc_conf_matrix)
print(lin_svc_acc_score*100)

[[346   9]
 [ 23  22]]
92.0


In [49]:
#Generate training predictions
training_predictions = lin_svc_model.predict(X_train)

#Generate testing predictions
testing_predictions = lin_svc_classifier.predict(X_test)

## Confusion Matrix for testing data

In [50]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[355   0]
 [ 44   1]]


## Classification report for testing

In [51]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

 bad quality       0.89      1.00      0.94       355
good quality       1.00      0.02      0.04        45

    accuracy                           0.89       400
   macro avg       0.94      0.51      0.49       400
weighted avg       0.90      0.89      0.84       400



In [69]:
type(y_train)

pandas.core.series.Series

In [70]:
type(y_test)

pandas.core.series.Series

In [71]:
# Convert string labels to integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

## Nueral Network Deep Learning with Keras Tuner

In [64]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=11))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=40,
            step=5),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [65]:
# Import the kerastuner library

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=40,
    hyperband_iterations=2)




Reloading Tuner from ./untitled_project/tuner0.json


In [72]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 174 Complete [00h 00m 08s]
val_accuracy: 0.8974999785423279

Best val_accuracy So Far: 0.9075000286102295
Total elapsed time: 00h 43m 47s


In [83]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 26, 'num_layers': 1, 'units_0': 21, 'units_1': 31, 'units_2': 11, 'units_3': 31, 'units_4': 6, 'tuner/epochs': 5, 'tuner/initial_epoch': 2, 'tuner/bracket': 3, 'tuner/round': 1, 'tuner/trial_id': '0013'}
{'activation': 'tanh', 'first_units': 1, 'num_layers': 3, 'units_0': 36, 'units_1': 36, 'units_2': 6, 'units_3': 36, 'units_4': 26, 'tuner/epochs': 40, 'tuner/initial_epoch': 14, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0072'}
{'activation': 'tanh', 'first_units': 16, 'num_layers': 4, 'units_0': 21, 'units_1': 21, 'units_2': 31, 'units_3': 1, 'units_4': 31, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 3, 'tuner/round': 0}


In [85]:
# Get top model hyperparameters and print the values
best_hyper = tuner.get_best_hyperparameters(1)
for param in best_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 26, 'num_layers': 1, 'units_0': 21, 'units_1': 31, 'units_2': 11, 'units_3': 31, 'units_4': 6, 'tuner/epochs': 5, 'tuner/initial_epoch': 2, 'tuner/bracket': 3, 'tuner/round': 1, 'tuner/trial_id': '0013'}


In [108]:
type(best_hyper)
best_hyper[0]['tuner/epochs']

5

In [117]:
# Evaluate the top 3 model against the test dataset
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

    testing_report = classification_report(y_test, testing_predictions, target_names=target_names)


13/13 - 0s - loss: 0.2997 - accuracy: 0.9075 - 237ms/epoch - 18ms/step
Loss: 0.29973483085632324, Accuracy: 0.9075000286102295


TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['0' '1']. Make sure that the predictions provided by the classifier coincides with the true labels.

## mr GPT

In [98]:
input_shape = X_train.shape[1]
input_shape

output_units = 2
output_activation = 'sigmoid'

In [None]:
# Evaluate the model on your test data
y_pred = model.predict(X_test)
y_pred_classes = np.round(y_pred).flatten()  # Convert probabilities to 0 or 1
y_true_classes = y_test.values  # Convert pandas Series to NumPy array

# Generate a classification report
print(classification_report(y_true_classes, y_pred_classes))


In [116]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report

# Define the best hyperparameters
best_hyperparameters = {
    'activation': 'relu',
    'first_units': 26,
    'num_layers': 1,
    'units_0': 21,
    'units_1': 31,
    'units_2': 11,
    'units_3': 31,
    'units_4': 6,
    'tuner/epochs': 5,
    'tuner/initial_epoch': 2,
}

# Assuming you have the correct input_shape and output_units
# input_shape = ...  # Replace with your input shape
output_units = 2  # Binary classification, so 2 output units
output_activation = 'sigmoid'  # Binary classification, so 'sigmoid' activation

# Create the model using the best hyperparameters
model = Sequential()
model.add(Dense(units=best_hyper[0]['first_units'], activation=best_hyper[0]['activation'], input_shape=(input_shape,)))

# For binary classification
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

for i in range(best_hyper[0]['num_layers']):
    model.add(Dense(units=best_hyper[0][f'units_{i}'], activation=best_hyper[0]['activation']))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with your training data
model.fit(X_train, y_train, epochs=best_hyper[0]['tuner/epochs'], initial_epoch=best_hyper[0]['tuner/initial_epoch'])

# Evaluate the model on your test data
y_pred = model.predict(X_test)
y_pred_classes = np.round(y_pred).flatten()  # Convert probabilities to 0 or 1

# Ensure y_test is correctly formatted and has the same number of samples as X_test
y_true_classes = y_test.values.squeeze()  # Squeeze to handle potential extra dimensions

# Generate a classification report
print(classification_report(y_true_classes, y_pred_classes))


Epoch 3/5
Epoch 4/5
Epoch 5/5


ValueError: Found input variables with inconsistent numbers of samples: [400, 8400]