## **Creating and Populating a New SQLite Database from an Existing Schema**

### **Overview**
In this notebook, we will:
1. **Extract the Schema** from an existing SQLite database (`movies.db`).
2. **Create a New Database** (`movies.red.db`) with the same schema as the original.
3. **Populate the New Database** with sample data using `pandas`.

---

### **Step 1: Extracting the Schema from `movies.db`**
In this step, we connect to the existing `movies.db` database and retrieve its schema using SQLite’s system table `sqlite_master`. The schema will include the structure of all the tables, which we will later use to create a new database with the same structure.

---

### **Step 2: Creating `movies.red.db` with the Extracted Schema**
Using the schema extracted in Step 1, we create a new SQLite database (`movies.red.db`) and apply the schema to set up the necessary tables. This ensures the new database mirrors the structure of the original.

---

### **Step 3: Populating Tables in `movies.red.db`**
After creating the new database, we populate the tables with sample data. Using `pandas`, we insert entries into the tables (e.g., movies, people, ratings, and stars), ensuring that each table contains around 10 sample records.

---

### **Summary**
- **Step 1:** Extract the schema from `movies.db` and save it to a file.
- **Step 2:** Create a new database (`movies.red.db`) and apply the extracted schema.
- **Step 3:** Populate the new database with sample data.

This approach keeps the **schema extraction, database creation, and data population as separate steps**, providing a structured way to replicate the original database with new data.


### Exploration

In [1]:
import sqlite3

# Connect to the existing database
db_path = "movies.db"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Query the CREATE TABLE statement of every user table. SQLite's internal
    # tables (sqlite_sequence, sqlite_stat1, ...) are excluded: their names are
    # reserved, so replaying their CREATE statements in a new database fails.
    cursor.execute(
        "SELECT sql FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    tables_schema = cursor.fetchall()
finally:
    # Close the connection even if the query raises.
    conn.close()

# Save schema to a file, one ';'-terminated statement per table so the file
# can be replayed later with executescript().
with open("schema.sql", "w") as f:
    for (create_sql,) in tables_schema:
        if create_sql:  # Ensure it's not None
            f.write(create_sql + ";\n\n")

print("Schema extracted and saved to schema.sql")


Schema extracted and saved to schema.sql


Step 2: Create movies.red.db with the Same Schema


In [2]:
import sqlite3

# Read the schema statements previously saved to schema.sql
with open("schema.sql", "r") as f:
    schema_sql = f.read()

# Create new database and apply schema
conn = sqlite3.connect("movies.red.db")
try:
    cursor = conn.cursor()
    cursor.executescript(schema_sql)
    conn.commit()
finally:
    # Close the connection even if a statement fails, e.g. when the tables
    # already exist because this cell was run before.
    conn.close()


Populating tables.

In [3]:
import sqlite3
import pandas as pd

# Database path
# NOTE(review): this is "movies.red_o.db", not the "movies.red.db" created in
# Step 2 -- presumably because the CREATE TABLE statements below would fail on
# a database that already contains these tables; confirm this is intended.
db_path = "movies.red_o.db"

# Define schema: movies and people are the parent tables; ratings and stars
# reference them through foreign keys.
schema_sql = """
CREATE TABLE "movies" (
    "id" INTEGER,
    "title" TEXT NOT NULL,
    "year" NUMERIC,
    PRIMARY KEY("id")
);

CREATE TABLE "people" (
    "id" INTEGER,
    "name" TEXT NOT NULL,
    "birth" NUMERIC,
    PRIMARY KEY("id")
);

CREATE TABLE "ratings" (
    "id" INTEGER,
    "movie_id" INTEGER UNIQUE,
    "rating" REAL NOT NULL,
    "votes" INTEGER NOT NULL,
    PRIMARY KEY("id"),
    FOREIGN KEY("movie_id") REFERENCES "movies"("id")
);

CREATE TABLE "stars" (
    "movie_id" INTEGER,
    "person_id" INTEGER,
    PRIMARY KEY("movie_id", "person_id"),
    FOREIGN KEY("movie_id") REFERENCES "movies"("id"),
    FOREIGN KEY("person_id") REFERENCES "people"("id")
);
"""

# Sample data: (id, title, year)
movies_data = [
    (1, "Inception", 2010),
    (2, "The Dark Knight", 2008),
    (3, "Interstellar", 2014),
    (4, "Parasite", 2019),
    (5, "The Matrix", 1999),
    (6, "The Godfather", 1972),
    (7, "Forrest Gump", 1994),
    (8, "Gladiator", 2000),
    (9, "Titanic", 1997),
    (10, "The Shawshank Redemption", 1994),
]

# (id, name, birth)
people_data = [
    (1, "Leonardo DiCaprio", 1974),
    (2, "Christian Bale", 1974),
    (3, "Matthew McConaughey", 1969),
    (4, "Song Kang-ho", 1967),
    (5, "Keanu Reeves", 1964),
    (6, "Marlon Brando", 1924),
    (7, "Tom Hanks", 1956),
    (8, "Russell Crowe", 1964),
    (9, "Kate Winslet", 1975),
    (10, "Morgan Freeman", 1937),
]

# (id, movie_id, rating, votes)
ratings_data = [
    (1, 1, 8.8, 2000000),
    (2, 2, 9.0, 2500000),
    (3, 3, 8.6, 1500000),
    (4, 4, 8.5, 1300000),
    (5, 5, 8.7, 1800000),
    (6, 6, 9.2, 1700000),
    (7, 7, 8.8, 1600000),
    (8, 8, 8.5, 1400000),
    (9, 9, 7.9, 2200000),
    (10, 10, 9.3, 2300000),
]

# (movie_id, person_id)
stars_data = [
    (1, 1),  # Inception - Leonardo DiCaprio
    (2, 2),  # The Dark Knight - Christian Bale
    (3, 3),  # Interstellar - Matthew McConaughey
    (4, 4),  # Parasite - Song Kang-ho
    (5, 5),  # The Matrix - Keanu Reeves
    (6, 6),  # The Godfather - Marlon Brando
    (7, 7),  # Forrest Gump - Tom Hanks
    (8, 8),  # Gladiator - Russell Crowe
    (9, 9),  # Titanic - Kate Winslet
    (10, 10), # The Shawshank Redemption - Morgan Freeman
]

# Establish connection
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # SQLite only enforces FOREIGN KEY clauses when asked to, per connection.
    cursor.execute("PRAGMA foreign_keys = ON;")

    # Execute schema
    cursor.executescript(schema_sql)

    # Insert data using pandas. Parent tables (movies, people) are loaded
    # before the tables that reference them (ratings, stars).
    df_movies = pd.DataFrame(movies_data, columns=["id", "title", "year"])
    df_movies.to_sql("movies", conn, if_exists="append", index=False)

    df_people = pd.DataFrame(people_data, columns=["id", "name", "birth"])
    df_people.to_sql("people", conn, if_exists="append", index=False)

    df_ratings = pd.DataFrame(ratings_data, columns=["id", "movie_id", "rating", "votes"])
    df_ratings.to_sql("ratings", conn, if_exists="append", index=False)

    df_stars = pd.DataFrame(stars_data, columns=["movie_id", "person_id"])
    df_stars.to_sql("stars", conn, if_exists="append", index=False)
finally:
    # Close connection even if schema creation or an insert fails
    conn.close()
print("Database created and populated successfully!")


Database created and populated successfully!


Taking a database and simplifying it.

In [None]:
import sqlite3
import pandas as pd

# Path of the new database that the steps below create and populate
NEW_DB = "movies.red.db"

# --- Step 1: Extract schema from existing database ---
def extract_schema(db_path, schema_file):
    """Write the CREATE TABLE statements of a SQLite database to a file.

    Each statement is terminated with ``;`` and followed by a blank line so
    the file can be replayed with ``executescript``. SQLite's internal
    tables (``sqlite_sequence``, ``sqlite_stat1``, ...) are skipped: their
    names are reserved, so replaying their CREATE statements in another
    database would fail.

    Args:
        db_path: Path of the SQLite database to read.
        schema_file: Path of the text file to write; overwritten if present.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT sql FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
        ).fetchall()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    with open(schema_file, "w") as f:
        # Rows whose sql column is NULL carry no statement to replay.
        f.writelines(sql + ";\n\n" for (sql,) in rows if sql)

    print(f"Schema extracted and saved to {schema_file}")

# --- Step 2: Create new database with extracted schema ---
def create_new_db(new_db_path, schema_file):
    """Create a SQLite database and run the SQL script in ``schema_file`` on it.

    Args:
        new_db_path: Path of the database file to create (opened if it
            already exists).
        schema_file: Path of a text file holding the SQL statements to run.

    Raises:
        OSError: If ``schema_file`` cannot be read.
        sqlite3.Error: If a statement fails, e.g. a table already exists.
    """
    with open(schema_file, "r") as f:
        schema_sql = f.read()

    conn = sqlite3.connect(new_db_path)
    try:
        conn.executescript(schema_sql)
        conn.commit()
    finally:
        # Close the connection even when the script fails.
        conn.close()
    print(f"Database {new_db_path} created successfully.")

# --- Step 3: Populate tables ---
def populate_database(db_path):
    """Insert ten sample rows into each of movies, people, ratings and stars.

    The tables must already exist in the database at ``db_path``. All rows
    are inserted in a single transaction: if any insert fails, nothing is
    written and the exception propagates.

    Args:
        db_path: Path of the SQLite database to populate.

    Raises:
        sqlite3.Error: If an insert fails, e.g. ``sqlite3.IntegrityError``
            when the rows are already present.
    """
    # Sample data: (id, title, year)
    movies_data = [
        (1, "Inception", 2010), (2, "The Dark Knight", 2008), (3, "Interstellar", 2014),
        (4, "Parasite", 2019), (5, "The Matrix", 1999), (6, "The Godfather", 1972),
        (7, "Forrest Gump", 1994), (8, "Gladiator", 2000), (9, "Titanic", 1997),
        (10, "The Shawshank Redemption", 1994),
    ]

    # (id, name, birth)
    people_data = [
        (1, "Leonardo DiCaprio", 1974), (2, "Christian Bale", 1974), (3, "Matthew McConaughey", 1969),
        (4, "Song Kang-ho", 1967), (5, "Keanu Reeves", 1964), (6, "Marlon Brando", 1924),
        (7, "Tom Hanks", 1956), (8, "Russell Crowe", 1964), (9, "Kate Winslet", 1975),
        (10, "Morgan Freeman", 1937),
    ]

    # (id, movie_id, rating, votes)
    ratings_data = [
        (1, 1, 8.8, 2000000), (2, 2, 9.0, 2500000), (3, 3, 8.6, 1500000),
        (4, 4, 8.5, 1300000), (5, 5, 8.7, 1800000), (6, 6, 9.2, 1700000),
        (7, 7, 8.8, 1600000), (8, 8, 8.5, 1400000), (9, 9, 7.9, 2200000),
        (10, 10, 9.3, 2300000),
    ]

    # (movie_id, person_id)
    stars_data = [
        (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),
        (6, 6), (7, 7), (8, 8), (9, 9), (10, 10),
    ]

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys = ON;")  # Enforce foreign keys

        # The connection's context manager commits on success and rolls back
        # on an exception. Parent tables (movies, people) are filled before
        # the tables that reference them.
        with conn:
            conn.executemany("INSERT INTO movies (id, title, year) VALUES (?, ?, ?);", movies_data)
            conn.executemany("INSERT INTO people (id, name, birth) VALUES (?, ?, ?);", people_data)
            conn.executemany("INSERT INTO ratings (id, movie_id, rating, votes) VALUES (?, ?, ?, ?);", ratings_data)
            conn.executemany("INSERT INTO stars (movie_id, person_id) VALUES (?, ?);", stars_data)
    finally:
        # Close the connection even when an insert fails.
        conn.close()
    print("Database populated successfully.")

# Run the three steps in order: dump the schema of movies.db to a file,
# build the new database from that file, then load the sample rows.
schema_path = "schema.sql"
extract_schema("movies.db", schema_path)
create_new_db(NEW_DB, schema_path)
populate_database(NEW_DB)
