In [1]:
# Code1.py (modified and extended for Colab)

# Step 0: Install/Update scikeras, scikit-learn, and tensorflow
#!pip install -U tensorflow scikit-learn scikeras
!pip install tensorflow==2.19.0 scikit-learn==1.4.2 scikeras==0.13.0 keras==3.9.2



In [2]:
# Ensure this cell is run AFTER restarting the runtime post-pip install

import tensorflow
import sklearn
import scikeras
import keras as keras_standalone

print(f"TensorFlow version: {tensorflow.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}") # Should now be 1.4.2
print(f"SciKeras version: {scikeras.__version__}")
print(f"Keras (standalone) version: {keras_standalone.__version__}")


import pandas as pd
import numpy as np
import re
import pickle

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from scikeras.wrappers import KerasClassifier as SciKerasClassifier

np.random.seed(42)
tf.random.set_seed(42)

try:
    data = pd.read_csv('Data.csv', encoding='latin1')
except FileNotFoundError:
    print("Error: Data.csv not found. Using dummy data.")
    data = pd.DataFrame({
        'text': ['this is a positive tweet', 'this is a neutral one', 'this is a bad negative tweet', 'another good one', 'just a tweet', 'very bad news indeed']*500,
        'sentiment': ['Positive', 'Neutral', 'Negative', 'Positive', 'Neutral', 'Negative']*500
    })

if 'text' not in data.columns or 'sentiment' not in data.columns:
    if 'tweet_text' in data.columns and 'sentiment_sentiment' in data.columns:
        data = data[['tweet_text', 'sentiment_sentiment']]
        data.rename(columns={'tweet_text': 'text', 'sentiment_sentiment': 'sentiment'}, inplace=True)
    else:
        raise ValueError("Required columns for text and sentiment not found.")
else:
    data = data[['text','sentiment']]

data.dropna(subset=['text', 'sentiment'], inplace=True)
data['sentiment'] = data['sentiment'].astype(str)
valid_sentiments = ['Positive', 'Negative', 'Neutral']
data = data[data['sentiment'].isin(valid_sentiments)]

if data.empty:
    raise ValueError("No valid data remaining after filtering.")

data['text'] = data['text'].astype(str).apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
for idx in data.index:
    data.loc[idx, 'text'] = data.loc[idx, 'text'].replace('rt', ' ')

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X_sequences = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X_sequences)
max_sequence_length = X.shape[1]
if max_sequence_length == 0 and len(X_sequences) > 0:
    max_sequence_length = 1
    X = pad_sequences(X_sequences, maxlen=max_sequence_length)

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
num_classes = y.shape[1]

if num_classes < 2:
    raise ValueError(f"Only {num_classes} sentiment class(es) found.")

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y if num_classes > 1 else None)

print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")
print(f"Max sequence length: {max_sequence_length}")
print(f"Number of classes: {num_classes}")
print(f"Sentiment classes: {labelencoder.classes_}")

print("\n--- Task 1: Execute, Save Model, and Predict ---")
embed_dim_orig = 128
lstm_out_orig = 196

def createmodel_original():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(max_fatures, embed_dim_orig))
    model.add(tf.keras.layers.LSTM(lstm_out_orig, dropout=0.2, recurrent_dropout=0.2))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model_orig = createmodel_original()
print("\nOriginal Model Summary (may be unbuilt until fit):")
model_orig.summary()

batch_size_orig = 32
epochs_orig = 1
print(f"\nTraining original model for {epochs_orig} epoch(s)...")
history_orig = model_orig.fit(X_train, Y_train, epochs=epochs_orig, batch_size=batch_size_orig, verbose=2, validation_data=(X_test, Y_test))
print("\nOriginal Model Summary (after training):")
model_orig.summary()

score_orig, acc_orig = model_orig.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size_orig)
print(f"\nOriginal Model Performance on Test Set:")
print(f"Score (Loss): {score_orig:.4f}")
print(f"Accuracy: {acc_orig:.4f}")

model_orig.save('sentiment_model_original.keras')
with open('tokenizer_original.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('labelencoder_original.pkl', 'wb') as f:
    pickle.dump(labelencoder, f)
print("\nOriginal model (as .keras), tokenizer, and label encoder saved.")

new_text_example = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"
loaded_model = tf.keras.models.load_model('sentiment_model_original.keras')
with open('tokenizer_original.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)
with open('labelencoder_original.pkl', 'rb') as f:
    loaded_labelencoder = pickle.load(f)

processed_text = new_text_example.lower()
processed_text = re.sub('[^a-zA-z0-9\s]', '', processed_text)
processed_text = processed_text.replace('rt', ' ')
sequences_new = loaded_tokenizer.texts_to_sequences([processed_text])
padded_sequences_new = pad_sequences(sequences_new, maxlen=max_sequence_length)
prediction_probs = loaded_model.predict(padded_sequences_new)
predicted_class_index = np.argmax(prediction_probs, axis=1)
predicted_sentiment = loaded_labelencoder.inverse_transform(predicted_class_index)
print(f"\nPrediction for new text: '{new_text_example}'")
print(f"Predicted Sentiment: {predicted_sentiment[0]}")
print(f"Prediction Probabilities (for classes {loaded_labelencoder.classes_}): {prediction_probs[0]}")

print("\n--- Task 2: Apply GridSearchCV ---")

class MyKerasClassifier(SciKerasClassifier):
    def __init__(self, model=None, **kwargs):
        super().__init__(model=model, **kwargs)

    @property
    def _estimator_type(self):
        return "classifier"

def createmodel_for_grid(embed_dim_param=128, lstm_out_param=196, dropout_param=0.2, optimizer_param='adam', meta=None):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(max_fatures, embed_dim_param))
    model.add(tf.keras.layers.LSTM(lstm_out_param, dropout=dropout_param, recurrent_dropout=dropout_param))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer_param, metrics=['accuracy'])
    return model

keras_clf = MyKerasClassifier(
    model=createmodel_for_grid,
    verbose=0,
)

param_grid = {
    'batch_size': [32, 64],
    'epochs': [1, 2],
    'model__embed_dim_param': [64, 128],
    'model__lstm_out_param': [100, 150],
    'model__dropout_param': [0.2, 0.3],
    'model__optimizer_param': ['adam', 'rmsprop']
}

grid = GridSearchCV(estimator=keras_clf, param_grid=param_grid, cv=2, verbose=2, n_jobs=1)

print("\nStarting GridSearchCV... This may take a while.")
num_combinations = 1
for p_list in param_grid.values():
    num_combinations *= len(p_list)
print(f"Parameter grid size: {num_combinations} combinations")
print(f"Cross-validation folds: {grid.cv}") # Access cv from the grid object
print(f"Total model fits for GridSearchCV: {num_combinations * grid.cv}") # Access cv from the grid object

try:
    grid_result = grid.fit(X_train, Y_train)

    print(f"\nBest Accuracy for GridSearchCV: {grid_result.best_score_:.4f} using {grid_result.best_params_}")

    print("\nAll combinations and their scores from GridSearchCV:")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param_combination in zip(means, stds, params):
        print(f"Mean Accuracy: {mean:.4f} (Std: {stdev:.4f}) with: {param_combination}")

except Exception as e:
    print(f"\nAn error occurred during GridSearchCV: {e}")
    import traceback
    traceback.print_exc()
    print("GridSearchCV was not completed.")

print("\n--- Script Finished ---")

TensorFlow version: 2.19.0
Scikit-learn version: 1.4.2
SciKeras version: 0.13.0
Keras (standalone) version: 3.9.2
X_train shape: (9293, 28), Y_train shape: (9293, 3)
X_test shape: (4578, 28), Y_test shape: (4578, 3)
Max sequence length: 28
Number of classes: 3
Sentiment classes: ['Negative' 'Neutral' 'Positive']

--- Task 1: Execute, Save Model, and Predict ---

Original Model Summary (may be unbuilt until fit):



Training original model for 1 epoch(s)...
291/291 - 58s - 199ms/step - accuracy: 0.6357 - loss: 0.8331 - val_accuracy: 0.6730 - val_loss: 0.7615

Original Model Summary (after training):


144/144 - 4s - 30ms/step - accuracy: 0.6730 - loss: 0.7615

Original Model Performance on Test Set:
Score (Loss): 0.7615
Accuracy: 0.6730

Original model (as .keras), tokenizer, and label encoder saved.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451ms/step

Prediction for new text: 'A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump'
Predicted Sentiment: Negative
Prediction Probabilities (for classes ['Negative' 'Neutral' 'Positive']): [0.47726592 0.22568074 0.2970533 ]

--- Task 2: Apply GridSearchCV ---

Starting GridSearchCV... This may take a while.
Parameter grid size: 64 combinations
Cross-validation folds: 2
Total model fits for GridSearchCV: 128
Fitting 2 folds for each of 64 candidates, totalling 128 fits
[CV] END batch_size=32, epochs=1, model__dropout_param=0.2, model__embed_dim_param=64, model__lstm_out_param=100, model__optimizer_param=adam; total time=   0.0s
[CV] END batch_size=32,

Traceback (most recent call last):
  File "<ipython-input-2-f7f17836bd50>", line 187, in <cell line: 0>
    grid_result = grid.fit(X_train, Y_train)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_search.py", line 970, in fit
    self._run_search(evaluate_candidates)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_search.py", line 1527, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_search.py", line 947, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 536, in _warn_or_raise_about_fit_f