# Data Engineering Internship

## **Data Exploration**

### **Task 1**

**Explore datasets and document schemas, null values, and duplicates in README.md.**

In [6]:
# Import libraries

import pandas as pd

In [10]:
# Load data

# Read each source CSV fully into memory; pandas infers the column dtypes.
click_df = pd.read_csv(filepath_or_buffer="clickstream.csv")  # clickstream events
trans_df = pd.read_csv(filepath_or_buffer="transactions.csv")  # transactions

In [25]:
# Explore Clickstream

# Schema, null-count, and duplicate-row summary for the clickstream frame.
print("\nClickstream Data\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
click_df.info()
# sep="\n" starts the Series on its own line without the padding space that
# print() would otherwise insert in front of its first row.
print("\nMissing values:", click_df.isna().sum(), sep="\n")
# duplicated() flags rows that repeat an earlier row across all columns.
print("\nDuplicate rows:", click_df.duplicated().sum())


Clickstream Data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     200000 non-null  int64 
 1   session_id  200000 non-null  object
 2   page_url    200000 non-null  object
 3   click_time  200000 non-null  object
 4   device      200000 non-null  object
 5   location    200000 non-null  object
dtypes: int64(1), object(5)
memory usage: 9.2+ MB
None

Missing values:
 user_id       0
session_id    0
page_url      0
click_time    0
device        0
location      0
dtype: int64

Duplicate rows: 0


In [23]:
# Explore Transactions

# Schema, null-count, and duplicate-row summary for the transactions frame.
print("\nTransactions Data\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
trans_df.info()
# sep="\n" starts the Series on its own line without the padding space that
# print() would otherwise insert in front of its first row.
print("\nMissing values:", trans_df.isna().sum(), sep="\n")
# duplicated() flags rows that repeat an earlier row across all columns.
print("\nDuplicate rows:", trans_df.duplicated().sum())


Transactions Data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   txn_id    100000 non-null  object 
 1   user_id   100000 non-null  int64  
 2   amount    100000 non-null  float64
 3   currency  100000 non-null  object 
 4   txn_time  100000 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.8+ MB
None

Missing values:
 txn_id      0
user_id     0
amount      0
currency    0
txn_time    0
dtype: int64

Duplicate rows: 0


### **Task 2**

**Extract data — read CSVs in chunks (e.g., 50,000 rows) and fetch currency rates via API.**

In [35]:
# Data Extraction

# Stream the clickstream CSV in pieces of at most 50,000 rows instead of loading
# it in one go, reporting the row count of each piece as it arrives.
for chunk in pd.read_csv(
    filepath_or_buffer="clickstream.csv",
    chunksize=50_000,
):
    print("Chunk size:", chunk.shape[0])

Chunk size: 50000
Chunk size: 50000
Chunk size: 50000
Chunk size: 50000


In [37]:
# Stream the transactions CSV in pieces of at most 50,000 rows instead of loading
# it in one go, reporting the row count of each piece as it arrives.
for chunk in pd.read_csv(
    filepath_or_buffer="transactions.csv",
    chunksize=50_000,
):
    print("Chunk size:", chunk.shape[0])

Chunk size: 50000
Chunk size: 50000


In [44]:
# Fetch currency rates via ExchangeRate API

import requests
import datetime
import json
import os

# Fetch the latest USD-based conversion rates and archive the raw JSON response
# under a folder named after today's date.

# The key is read from the environment so the credential never lives in the notebook.
# NOTE(review): a literal key was previously committed in this cell; it should be
# treated as exposed and rotated.
API_KEY = os.environ.get("EXCHANGERATE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the EXCHANGERATE_API_KEY environment variable before running this cell."
    )
url = f"https://v6.exchangerate-api.com/v6/{API_KEY}/latest/USD" # API Endpoint

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

# An error response is not guaranteed to carry a JSON body; fall back to a small
# dict so the failure is reported by the else-branch below instead of a traceback.
try:
    data = response.json()
except ValueError:
    data = {"result": "error", "raw_body": response.text[:500]}

if response.status_code == 200 and data.get("result") == "success":
    rates = data["conversion_rates"]
    print("USD to INR:", rates.get("INR"))

    # One folder per calendar day (local date), e.g. data/raw/api_currency/YYYY-MM-DD.
    today = datetime.date.today().strftime("%Y-%m-%d")
    save_dir = f"data/raw/api_currency/{today}"
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, "exchange_rates.json") # Save raw JSON response

    # Persist the full payload, not just the rates.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"Raw JSON saved to: {save_path}")

else:
    print("API Error:", data)

USD to INR: 88.2341
Raw JSON saved to: data/raw/api_currency/2025-09-07/exchange_rates.json
