In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow import keras
from tensorflow.keras import layers

# Load the dataset
data = pd.read_csv('reciepe_reviews.csv')

# Handle missing values
data['text'].fillna('', inplace=True)  # Filling missing text with empty string
data.dropna(subset=['stars'], inplace=True)  # Drop rows with missing stars

# Define features and target variable
X = data.drop(columns=['stars'])  # Features
y = data['stars']  # Target variable

# Encode categorical variables and scale continuous variables
categorical_features = ['recipe_name', 'comment_id', 'user_id', 'user_name', 'text']
numeric_features = ['recipe_number', 'user_reputation', 'created_at', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score']

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Preprocess the features
X_processed = preprocessor.fit_transform(X)

# Convert target variable to categorical (if necessary)
y = pd.factorize(y)[0]  # Convert to integer labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define the ANN model
num_classes = len(np.unique(y))  # Number of unique classes in the target variable
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(num_classes, activation='softmax')  # Use softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use sparse_categorical_crossentropy for integer labels
              metrics=['accuracy'])  # Use accuracy for classification tasks

# Display the model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('', inplace=True)  # Filling missing text with empty string
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 77ms/step - accuracy: 0.7587 - loss: 1.0412 - val_accuracy: 0.7601 - val_loss: 0.8181
Epoch 2/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.8454 - loss: 0.4187 - val_accuracy: 0.7078 - val_loss: 1.0137
Epoch 3/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.9614 - loss: 0.1097 - val_accuracy: 0.6538 - val_loss: 1.2212
Epoch 4/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.9857 - loss: 0.0376 - val_accuracy: 0.5270 - val_loss: 1.5391
Epoch 5/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.9988 - loss: 0.0096 - val_accuracy: 0.5799 - val_loss: 1.4870
Epoch 6/10
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.9999 - loss: 0.0013 - val_accuracy: 0.5325 - val_loss: 1.6535
Epoch 7/10
[1m3