In [39]:
import os
import sqlite3
from contextlib import closing

import pandas as pd

In [40]:
# Location of the raw Titanic CSV. The original hard-coded path is kept as the
# default so existing runs behave the same, but it can be overridden on another
# machine by setting the TITANIC_CSV environment variable before starting the kernel.
path = os.environ.get('TITANIC_CSV', '/Users/saideepthigali/Downloads/titanic.csv')

# Load the raw file into memory; this frame is what gets landed in the Bronze layer.
df = pd.read_csv(path)

In [41]:
# Preview the first rows of the raw frame to sanity-check the load.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
# Print dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Bronze Layer

In [43]:
# Only one label actually changes for the Bronze schema: "SibSp" -> "Sibsp".
# Renaming by name (instead of overwriting every label by position) means a
# reordered or extended CSV cannot be silently mislabelled; errors="raise"
# makes a missing "SibSp" column fail loudly instead of being ignored.
df = df.rename(columns={"SibSp": "Sibsp"}, errors="raise")

In [44]:
# Open (or create) the SQLite file backing the Bronze layer. The handle is left
# open on purpose: the following cell commits and closes it.
conn = sqlite3.connect("databases.db")

# Land the raw frame unchanged, replacing any previous copy of the table and
# leaving the pandas index out of the stored columns.
df.to_sql(name="bronze_titanic", con=conn, index=False, if_exists="replace")

891

In [45]:
# Commit the Bronze write and release the connection opened in the previous cell.
conn.commit()
conn.close()

# Silver Layer

In [52]:
# Check the Bronze table structure. closing() guarantees the connection is
# released even if the PRAGMA or the fetch raises.
with closing(sqlite3.connect("databases.db")) as conn:
    cursor = conn.cursor()
    cursor.execute("PRAGMA table_info(bronze_titanic)")
    columns = cursor.fetchall()

# Each row describes one column of bronze_titanic, as returned by PRAGMA table_info.
for column in columns:
    print(column)

(0, 'PassengerId', 'INTEGER', 0, None, 0)
(1, 'Survived', 'INTEGER', 0, None, 0)
(2, 'Pclass', 'INTEGER', 0, None, 0)
(3, 'Name', 'TEXT', 0, None, 0)
(4, 'Sex', 'TEXT', 0, None, 0)
(5, 'Age', 'REAL', 0, None, 0)
(6, 'Sibsp', 'INTEGER', 0, None, 0)
(7, 'Parch', 'INTEGER', 0, None, 0)
(8, 'Ticket', 'TEXT', 0, None, 0)
(9, 'Fare', 'REAL', 0, None, 0)
(10, 'Cabin', 'TEXT', 0, None, 0)
(11, 'Embarked', 'TEXT', 0, None, 0)


In [53]:
# Read the full Bronze table into a DataFrame as the starting point for the
# Silver layer. closing() releases the connection even if the read fails.
with closing(sqlite3.connect('databases.db')) as conn:
    silver_data = pd.read_sql_query("SELECT * FROM bronze_titanic", conn)


In [54]:
# Display the frame just loaded from the Bronze table.
# NOTE(review): consider silver_data.head() to keep the rendered output short.
silver_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Sibsp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [55]:
# 1. Impute missing values: Age with the column mean, Embarked with 'S'.
#    NOTE(review): 'S' is presumably the most frequent port — confirm against the data.
silver_data['Age'] = silver_data['Age'].fillna(silver_data['Age'].mean())
silver_data['Embarked'] = silver_data['Embarked'].fillna('S')

# 2. Normalize the fare column (min-max scaling between 0 and 1).
fare_min = silver_data['Fare'].min()
fare_range = silver_data['Fare'].max() - fare_min
if fare_range:
    silver_data['Fare_normalized'] = (silver_data['Fare'] - fare_min) / fare_range
else:
    # Every fare is identical, so 0/0 would fill the column with NaN; use 0.0 instead.
    silver_data['Fare_normalized'] = 0.0

# 3. Save the cleaned data into the Silver table. closing() releases the
#    connection even if the write fails.
with closing(sqlite3.connect('silver_layer.db')) as conn_silver:
    silver_data.to_sql("silver_titanic", conn_silver, if_exists="replace", index=False)
    conn_silver.commit()  # Commit changes

# Gold Layer

In [68]:
import sqlite3
import pandas as pd

# Read the cleaned data from the Silver layer database ('silver_layer.db').
# closing() releases the connection even if the read fails.
with closing(sqlite3.connect('silver_layer.db')) as conn_silver:
    df_silver = pd.read_sql_query('SELECT * FROM silver_titanic', conn_silver)

# Keep only passengers who survived.
df_gold = df_silver[df_silver['Survived'] == 1]

# Aggregate per passenger class. Every remaining row has Survived == 1, so
# summing that column yields the number of survivors in the class.
df_gold_summary = df_gold.groupby('Pclass').agg(
    survivors_count=('Survived', 'sum'),  # Count of survivors per class
    avg_age=('Age', 'mean'),  # Average age of survivors per class
    total_fare=('Fare', 'sum'),  # Total fare paid by survivors per class
).reset_index()

# Create the Gold layer database and save the aggregated data. The connection
# is committed and closed here so it does not stay open past this cell;
# conn_gold remains defined for any later cell that closes it again.
with closing(sqlite3.connect('gold_layer.db')) as conn_gold:
    df_gold_summary.to_sql('gold_titanic', conn_gold, if_exists='replace', index=False)
    conn_gold.commit()

# Print the Gold Layer Data
print("\nGold Layer Data:")
print(df_gold_summary)



Gold Layer Data:
   Pclass  survivors_count    avg_age  total_fare
0       1              136  34.784615  13002.6919
1       2               87  26.076166   1918.8459
2       3              119  23.232689   1629.6916


In [70]:
# Close the Gold-layer database connection.
conn_gold.close()

# Insights

In [64]:
# Open a connection to the Bronze database. It is intentionally left open here
# because the next insight cell reuses it before it is closed.
conn_bronze = sqlite3.connect('databases.db')

# Count every row in the Bronze table; the result is a 1x1 frame.
query = "SELECT COUNT(*) FROM bronze_titanic"
total_passengers = pd.read_sql_query(sql=query, con=conn_bronze)

# Pull the single scalar out of the 1x1 result and report it.
passenger_count = total_passengers.iloc[0, 0]
print("Total Passengers:", passenger_count)



Total Passengers: 891


In [65]:
# Count Bronze-layer passengers per Survived value, reusing the connection
# opened by the previous insight cell.
query = "SELECT Survived, COUNT(*) FROM bronze_titanic GROUP BY Survived"
survival_distribution = pd.read_sql_query(sql=query, con=conn_bronze)

# Heading and table are emitted on separate lines.
print("Survival Distribution:", survival_distribution, sep="\n")


Survival Distribution:
   Survived  COUNT(*)
0         0       549
1         1       342


In [66]:
# Release the Bronze connection shared by the insight queries above.
conn_bronze.close()