<a href="https://colab.research.google.com/github/sindhusatish707/Exploring_Data_Engineering/blob/main/Exploring_Data_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Day 1

In [None]:
import requests
import pandas as pd
import gc

In [None]:
# Download the sample posts from the JSONPlaceholder demo API.
url = "https://jsonplaceholder.typicode.com/posts"
# Without a timeout, requests waits indefinitely on an unresponsive server
# and the cell never finishes.
response = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer instead of parsing an error body as post data.
response.raise_for_status()
data = response.json()

In [None]:
# Build the posts frame in one chain: keep the three columns of interest
# and rename the post's `id` column to `postId`.
df = (
    pd.DataFrame(data)
    .loc[:, ['userId', 'id', 'title']]
    .rename(columns={'id': 'postId'})
)

In [None]:
# Bare last expression: the notebook renders the frame as a table.
df

Unnamed: 0,userId,postId,title
0,1,1,sunt aut facere repellat provident occaecati e...
1,1,2,qui est esse
2,1,3,ea molestias quasi exercitationem repellat qui...
3,1,4,eum et est occaecati
4,1,5,nesciunt quas odio
...,...,...,...
95,10,96,quaerat velit veniam amet cupiditate aut numqu...
96,10,97,quas fugiat ut perspiciatis vero provident
97,10,98,laboriosam dolor voluptates
98,10,99,temporibus sit alias delectus eligendi possimu...


`index=False` tells pandas not to write the row index (the numbers 0, 1, 2, …) into the CSV file.

In [None]:
# Persist the trimmed posts table to disk, leaving the row index out of the file.
df.to_csv(path_or_buf='posts_clean.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to CSV.
df.head()

Unnamed: 0,userId,postId,title
0,1,1,sunt aut facere repellat provident occaecati e...
1,1,2,qui est esse
2,1,3,ea molestias quasi exercitationem repellat qui...
3,1,4,eum et est occaecati
4,1,5,nesciunt quas odio


In [None]:
import gc

def free_memory():
    """Run a full garbage-collection pass.

    The number of objects reported by the collector is discarded, so the
    function returns None and a notebook cell calling it shows no output.
    """
    # Generation 2 is the default for gc.collect(): a full collection.
    gc.collect(generation=2)

In [None]:
# Run a garbage-collection pass before starting the next section.
free_memory()

## Day 2

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Toy customer table in column form: one list per field, five customers.
data = dict(
    customer_id=[1, 2, 3, 4, 5],
    order_amount=[250, 300, 450, 270, 320],
    region=["East", "West", "North", "East", "South"],
    churned=[0, 0, 1, 0, 1],
)

In [None]:
# Turn the column dictionary into a DataFrame (one column per key).
df = pd.DataFrame(data=data)

In [None]:
# Data-engineering step: drop every row that contains a missing value,
# then print the cleaned table without its index.
df_clean = df.dropna(axis=0, how='any')
print("-------- Data Engineer - Cleaned Dataset --------")
# The custom `end` emits the table's newline plus the two blank lines
# that a separate print('\n') would produce.
print(df_clean.to_string(index=False), end='\n\n\n')

-------- Data Engineer - Cleaned Dataset --------
 customer_id  order_amount region  churned
           1           250   East        0
           2           300   West        0
           3           450  North        1
           4           270   East        0
           5           320  South        1




In [None]:
# Analyst metrics: the mean order value and the total order amount per region.
avg_order = df_clean.loc[:, 'order_amount'].mean()
# as_index=False keeps `region` as a regular column, like reset_index() would.
sales_by_region = df_clean.groupby('region', as_index=False)['order_amount'].sum()

In [None]:
# Print the analyst report in one call; sep='\n' puts each part on its own
# line, and the final '\n' item yields the trailing blank lines.
print(
    "-------- Data Analyst - Insights --------",
    f"Average order amount {avg_order:.2f}",
    '\nSales by region:',
    sales_by_region.to_string(index=False),
    '\n',
    sep='\n',
)

-------- Data Analyst - Insights --------
Average order amount 318.00

Sales by region:
region  order_amount
  East           520
 North           450
 South           320
  West           300




In [None]:
# Model inputs: a one-column feature frame (kept two-dimensional by the
# list selector) and the churn label as a Series.
X = df_clean.loc[:, ['order_amount']]
y = df_clean.loc[:, 'churned']

In [None]:
# Fit a one-feature churn classifier on the cleaned orders.
model = LogisticRegression()
model.fit(X, y)
# Score one hypothetical customer with an order amount of 500. The model was
# fitted on a DataFrame with named columns, so the query row is wrapped in a
# DataFrame with the same columns; passing a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
prediction = model.predict(pd.DataFrame([[500]], columns=X.columns))[0]



In [None]:
# Report the predicted class together with a legend for the label values.
print("-------- Data Scientist - Prediction --------")
print(
    f"Prediction for a customer with an order amount of 500: {prediction} "
    "(0: Not churn, 1: Churn)"
)

-------- Data Scientist - Prediction --------
Prediction for a customer with an order amount of 500: 1 (0: Not churn, 1: Churn)


In [None]:
# Run a garbage-collection pass now that the Day 2 section is finished.
free_memory()

## Day 3

In [None]:
import sqlite3

In [None]:
# Open a SQLite database that lives only in memory (nothing is written to disk).
conn = sqlite3.connect(database=':memory:')

In [None]:
# Same five-customer sample as a list of (column, values) pairs; dict()
# preserves the pair order as the key order.
data = dict([
    ("customer_id", [1, 2, 3, 4, 5]),
    ("order_amount", [250, 300, 450, 270, 320]),
    ("region", ["East", "West", "North", "East", "South"]),
    ("churned", [0, 0, 1, 0, 1]),
])

In [None]:
# Column-oriented dict -> DataFrame (from_dict defaults to orient='columns').
df = pd.DataFrame.from_dict(data)

In [None]:
# Write the frame into an `orders` table in SQLite (a stand-in for a small
# warehouse). Any existing table of that name is replaced and the DataFrame
# index is not stored.
df.to_sql(name='orders', con=conn, if_exists='replace', index=False)

5

In [None]:
# Average order amount per region, computed by the database and read back
# into a DataFrame.
query = (
    "SELECT region, AVG(order_amount) as avg_order "
    "FROM orders GROUP BY region"
)
result = pd.read_sql_query(sql=query, con=conn)

In [None]:
# Show the per-region averages returned by the query as plain text.
print(result)

  region  avg_order
0   East      260.0
1  North      450.0
2  South      320.0
3   West      300.0


## Day 4

In [1]:
import sqlite3
import pandas as pd

In [2]:
# Order-level sample data: customer 1 has two orders (101 and 107), so
# per-customer queries have something to filter and aggregate.
orders = pd.DataFrame(
    data={
        "customer_id": [1, 2, 3, 1, 5],
        "order_id": [101, 104, 110, 107, 108],
        "order_amount": [250, 300, 450, 270, 320],
        "region": ["East", "West", "North", "East", "South"],
        "churned": [0, 0, 1, 0, 1],
    }
)
# Fresh in-memory SQLite database holding the orders table.
conn = sqlite3.connect(database=":memory:")
orders.to_sql(name="orders", con=conn, if_exists="replace", index=False)

5

In [3]:
# Transactional-style lookup: fetch every order row for a single customer.
print("Database (OLTP) - Get customer 1 orders")
print(
    pd.read_sql_query(
        sql="SELECT * FROM orders WHERE customer_id = 1",
        con=conn,
    )
)

Database (OLTP) - Get customer 1 orders
   customer_id  order_id  order_amount region  churned
0            1       101           250   East        0
1            1       107           270   East        0


In [4]:
# Analytical-style query (simulating a warehouse): total sales per customer
# across all of that customer's orders.
print('\nWarehouse (OLAP) - Aggregate sales by customer')
print(
    pd.read_sql_query(
        sql=(
            "SELECT customer_id, SUM(order_amount) as total_sales "
            "FROM orders GROUP BY customer_id"
        ),
        con=conn,
    )
)


Warehouse (OLAP) - Aggregate sales by customer
   customer_id  total_sales
0            1          520
1            2          300
2            3          450
3            5          320


In [1]:
# Data-lake simulation: raw event records are kept exactly as they arrive
# and simply echoed back.
print("Data Lake - Store raw JSON")
raw_data = [
    dict(event="page_view", user=2, page="/home"),
    dict(event="click", user=3, page="buy"),
]
print(raw_data)

Data Lake - Store raw JSON
[{'event': 'page_view', 'user': 2, 'page': '/home'}, {'event': 'click', 'user': 3, 'page': 'buy'}]


In [6]:
# Run a garbage-collection pass at the end of the Day 4 section.
free_memory()

## Day 5

In [7]:
import pandas as pd
import time

In [8]:
# Six sample orders as records, built from (order_id, amount) pairs.
orders = [
    {"order_id": order_id, "amount": amount}
    for order_id, amount in (
        (1, 250),
        (2, 520),
        (3, 670),
        (4, 400),
        (5, 350),
        (6, 200),
    )
]

In [9]:
# Batch processing: load all orders at once and total the amounts in one step.
df = pd.DataFrame(data=orders)
print("Batch Processing - Total Orders")
print(df.loc[:, "amount"].sum())

Batch Processing - Total Orders
2390


In [10]:
# Stream processing: handle the orders one at a time, updating and reporting
# the total after each one instead of summing the whole batch at once.
print("Stream Processing - Processing as data arrives")
total = 0  # running sum of the amounts seen so far
for order in orders:
    total += order["amount"]
    print(f"Processed order {order['order_id']} | Running total = {total}")
    # Pause one second between orders to imitate events arriving over time.
    time.sleep(1)

Stream Processing - Processing as data arrives
Processed order 1 | Running total = 250
Processed order 2 | Running total = 770
Processed order 3 | Running total = 1440
Processed order 4 | Running total = 1840
Processed order 5 | Running total = 2190
Processed order 6 | Running total = 2390
