# Set Up the SQLite Database
Create a SQLite database to store the IMDB dataset (review text and sentiment labels).



In [None]:
import sqlite3

import numpy as np
import pandas as pd
import tensorflow as tf
# The Keras dataset module is named `imdb`; the code below calls
# imdb.load_data() and imdb.get_word_index(), so that is the name to bind.
from tensorflow.keras.datasets import imdb

# Open the SQLite database file (created on first use) and obtain a cursor
# for the statements that follow.
conn = sqlite3.connect('imdb_reviews.db')
cursor = conn.cursor()

# Schema for the raw dataset: an auto-incrementing id, the review text, and
# its integer sentiment label. IF NOT EXISTS lets this statement be rerun
# against a database that already has the table.
REVIEWS_TABLE_DDL = '''
    CREATE TABLE IF NOT EXISTS reviews (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        review_text TEXT NOT NULL,
        sentiment INTEGER NOT NULL
    )
'''
cursor.execute(REVIEWS_TABLE_DDL)

# Fetch the index-encoded IMDB reviews and labels via Keras, with the
# vocabulary capped through num_words=10000.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# The published vocabulary maps word -> index; flip it to index -> word so
# encoded reviews can be turned back into text.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_review(encoded_review):
    """Return the space-separated text for a sequence of word indices.

    Every index is shifted down by 3 before being looked up in the
    module-level ``reverse_word_index``; a shifted index with no entry is
    rendered as ``'?'``.
    """
    # NOTE(review): the offset of 3 presumably mirrors the Keras IMDB
    # loader's default `index_from=3` - confirm against load_data().
    lookup = reverse_word_index.get
    shifted_ids = (word_id - 3 for word_id in encoded_review)
    return ' '.join(lookup(word_id, '?') for word_id in shifted_ids)

# Insert training data into SQLite.
# The database file persists between runs, so the table is populated only
# when it is still empty; otherwise every rerun of this cell would append
# another full copy of the dataset.
cursor.execute('SELECT COUNT(*) FROM reviews')
if cursor.fetchone()[0] == 0:
    # executemany accepts any iterable of parameter tuples, so a generator
    # avoids building an intermediate list of all decoded reviews.
    cursor.executemany(
        'INSERT INTO reviews (review_text, sentiment) VALUES (?, ?)',
        (
            (decode_review(review), int(label))
            for review, label in zip(x_train, y_train)
        ),
    )
conn.commit()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17464789/17464789 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1641221/1641221 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step


# Retrieve Data with SQL



In [None]:
# Fetch data from SQLite.
# ORDER BY id fixes both which rows LIMIT keeps and the order they arrive in;
# without it SQLite may return rows in any order, which would undermine the
# fixed random_state used for the train/validation split later on.
query = 'SELECT review_text, sentiment FROM reviews ORDER BY id LIMIT 25000'  # Limit for demo
df = pd.read_sql_query(query, conn)

# Split into features (review strings) and labels (integer sentiment)
reviews = df['review_text'].values
labels = df['sentiment'].values

# Preprocess the Text Data

Convert text reviews into numerical format for TensorFlow.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary cap and fixed sequence length; both are reused when the model's
# embedding layer is defined.
max_words = 10000
max_len = 200

# Learn the vocabulary from the review texts, then encode each review as a
# list of integer word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)

# Pad or truncate every encoded review to exactly max_len ids.
x_data = pad_sequences(sequences, maxlen=max_len)

# One-hot encode the labels into two classes.
y_data = tf.keras.utils.to_categorical(labels, num_classes=2)

# Split Data and Build the Model

Use a simple LSTM model for text classification.



In [None]:
from tensorflow.keras import models, layers
from sklearn.model_selection import train_test_split

# Split into training and validation sets (80/20; fixed seed so the split is
# repeatable).
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Build the model.
# The sequence length is declared with an explicit Input layer instead of the
# Embedding layer's `input_length` argument, which Keras 3 deprecates.
# Declaring the input shape up front also lets model.summary() report output
# shapes and parameter counts before any data has been passed through.
model = models.Sequential([
    layers.Input(shape=(max_len,)),
    layers.Embedding(max_words, 128),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(32, activation='relu'),
    # Two softmax outputs pair with the one-hot labels and the
    # categorical cross-entropy loss below.
    layers.Dense(2, activation='softmax'),
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Display model summary
model.summary()



# Train the Model

Train the model on the processed data.



In [None]:
# Fit for 5 epochs in mini-batches of 64, evaluating on the validation split
# after each epoch. The returned History object is kept so the accuracy
# curves can be plotted later.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(x_val, y_val),
)

Epoch 1/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 94s 287ms/step - accuracy: 0.6947 - loss: 0.5422 - val_accuracy: 0.8672 - val_loss: 0.3084
Epoch 2/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 142s 288ms/step - accuracy: 0.9074 - loss: 0.2397 - val_accuracy: 0.8532 - val_loss: 0.3297
Epoch 3/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 140s 281ms/step - accuracy: 0.9480 - loss: 0.1489 - val_accuracy: 0.8630 - val_loss: 0.3364
Epoch 4/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 137s 265ms/step - accuracy: 0.9647 - loss: 0.1009 - val_accuracy: 0.8594 - val_loss: 0.4267
Epoch 5/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 147s 282ms/step - accuracy: 0.9764 - loss: 0.0681 - val_accuracy: 0.8532 - val_loss: 0.5102


# Save Predictions to SQLite

Make predictions on a subset of data and store them back in the database.



In [None]:
# Create table for predictions.
# Column names must be single identifiers. Written with a space
# ("review text", "predicted sentiment"), SQLite takes only the first word as
# the column name and treats the rest as part of the type, so the later
# INSERT into review_text / predicted_sentiment would fail because those
# columns would not exist.
# NOTE: IF NOT EXISTS never alters an existing table; if `predictions` was
# already created with the old column names, drop it before rerunning.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS predictions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        review_text TEXT NOT NULL,
        sentiment INTEGER,
        predicted_sentiment INTEGER,
        confidence REAL
    )
''')

# Score the first 100 validation samples (kept small for the demo).
predictions = model.predict(x_val[:100])
# The most probable class and its probability, per sample.
predicted_labels = np.argmax(predictions, axis=1)
confidences = np.max(predictions, axis=1)
# Recover integer labels from the one-hot validation targets.
true_labels = np.argmax(y_val[:100], axis=1)

# Build one row per sample, then write them all in a single executemany call.
prediction_rows = [
    (
        # Rebuild readable text from the padded ids, skipping id 0 (padding).
        ' '.join(
            tokenizer.index_word.get(token_id, '?')
            for token_id in padded_ids
            if token_id != 0
        ),
        int(actual),
        int(predicted),
        float(score),
    )
    for padded_ids, actual, predicted, score in zip(
        x_val[:100], true_labels, predicted_labels, confidences
    )
]
cursor.executemany(
    'INSERT INTO predictions (review_text, sentiment, predicted_sentiment, confidence) VALUES (?, ?, ?, ?)',
    prediction_rows,
)
conn.commit()

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 92ms/step


# Evaluate and Visualize

Query the predictions and evaluate the model.



In [None]:
import matplotlib.pyplot as plt

# Read back every stored prediction and preview the first rows.
df_preds = pd.read_sql_query('SELECT * FROM predictions', conn)
print(df_preds.head())

# Draw the per-epoch training and validation accuracy curves on one chart.
for metric_key, legend_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=legend_label)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# The database is no longer needed in this session; release the connection.
conn.close()

# Database Interaction

To explore the database:



In [None]:
# Open a fresh connection and preview the first five stored reviews.
# try/finally guarantees the connection is closed even if the query raises.
conn = sqlite3.connect('imdb_reviews.db')
try:
    df = pd.read_sql_query('SELECT * FROM reviews LIMIT 5', conn)
    print(df)
finally:
    conn.close()