In [1]:
# Import necessary libraries
from datasets import load_dataset
from collections import Counter

# Step 1: Load the dataset
print("Downloading and loading the IMDB dataset from Hugging Face...")
imdb = load_dataset("imdb")

# Step 2: Confirm dataset splits and sizes
print("\nDataset Details:")
print(imdb)

train_size = len(imdb['train'])
test_size = len(imdb['test'])

print(f"\nTrain set rows: {train_size}")
print(f"Test set rows: {test_size}")

# Step 3: Check sentiment distribution
train_labels = imdb['train']['label']
test_labels = imdb['test']['label']

print("\nLabel Information:")
print("0 -> Negative Sentiment")
print("1 -> Positive Sentiment")

print("\nTrain sentiment distribution:")
train_distribution = Counter(train_labels)
print(f"Negative (0): {train_distribution[0]}")
print(f"Positive (1): {train_distribution[1]}")

print("\nTest sentiment distribution:")
test_distribution = Counter(test_labels)
print(f"Negative (0): {test_distribution[0]}")
print(f"Positive (1): {test_distribution[1]}")

# Final Verification
if train_size == 25000 and test_size == 25000:
    print("\n Dataset successfully loaded and verified!")
else:
    print("\n Dataset size does not match expectations. Please recheck the source.")


Downloading and loading the IMDB dataset from Hugging Face...

Dataset Details:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Train set rows: 25000
Test set rows: 25000

Label Information:
0 -> Negative Sentiment
1 -> Positive Sentiment

Train sentiment distribution:
Negative (0): 12500
Positive (1): 12500

Test sentiment distribution:
Negative (0): 12500
Positive (1): 12500

 Dataset successfully loaded and verified!


### Database Setup with SQLite

In [2]:
# Import necessary libraries
import sqlite3
# Combine datasets and mark splits
print("Preparing data for database insertion...")
train_data = [(review['text'], 'positive' if review['label'] == 1 else 'negative', 'train') for review in imdb['train']]
test_data = [(review['text'], 'positive' if review['label'] == 1 else 'negative', 'test') for review in imdb['test']]
all_data = train_data + test_data

Preparing data for database insertion...


In [3]:
# Step 1: Connect to SQLite database
conn = sqlite3.connect("imdb_reviews.db")
cursor = conn.cursor()

In [4]:
# Step 2: Create the table
print("Creating the table imdb_reviews...")
cursor.execute("""
CREATE TABLE IF NOT EXISTS imdb_reviews (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    review_text TEXT NOT NULL,
    sentiment TEXT NOT NULL,
    data_split TEXT NOT NULL
);
""")
conn.commit()
print("Table created successfully!")

Creating the table imdb_reviews...
Table created successfully!


In [5]:
# Step 3: Insert all data into the table
print("Inserting all data into the table...")
cursor.executemany("INSERT INTO imdb_reviews (review_text, sentiment, data_split) VALUES (?, ?, ?);", all_data)
conn.commit()
print(f"Inserted {len(all_data)} rows into the table.")

Inserting all data into the table...
Inserted 50000 rows into the table.


In [6]:
# Step 4: Verify the data in the table
print("Verifying the first 5 rows in the table...")
cursor.execute("SELECT * FROM imdb_reviews LIMIT 5;")
rows = cursor.fetchall()
for row in rows:
    print(row)

Verifying the first 5 rows in the table...
(1, 'i rented i am curiousyellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by us customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myselfthe plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married menwhat kills me about i am curiousyellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far betwee

In [7]:
# Step 5: Close the database connection
conn.close()
print("Database setup complete. Connection closed.")

Database setup complete. Connection closed.
