<a href="https://colab.research.google.com/github/saerarawas/AAI_634O_A11_202520/blob/main/Project/Final_Project_Rawas_Saera_Feb9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np

# Define the number of records
num_records = 1000

# Generate random dates within a specific range
dates = pd.date_range(start='2023-01-01', end='2025-12-31', periods=num_records)

# Generate product IDs (e.g., P001, P002, ..., P100)
product_ids = [f'P{str(i).zfill(3)}' for i in range(1, num_records + 1)]

# Generate random sales amounts
sales_amounts = np.random.randint(100, 1000, size=num_records)

# Define a list of store locations
store_locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']

# Randomly assign store locations to the records
locations = np.random.choice(store_locations, size=num_records)

# Create a DataFrame with the generated data
data = {
    'date': dates,
    'product_id': product_ids,
    'sales_amount': sales_amounts,
    'store_location': locations
}

df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('sales_data.csv', index=False)

# Display the first few rows of the DataFrame
print(df.head())
# Display the records with the latest dates
latest_records = df.sort_values(by='date', ascending=False).head()
print(latest_records)

                           date product_id  sales_amount store_location
0 2023-01-01 00:00:00.000000000       P001           802       New York
1 2023-01-02 02:18:22.702702702       P002           839        Phoenix
2 2023-01-03 04:36:45.405405405       P003           841       New York
3 2023-01-04 06:55:08.108108108       P004           906        Houston
4 2023-01-05 09:13:30.810810810       P005           709        Phoenix
                             date product_id  sales_amount store_location
999 2025-12-31 00:00:00.000000000      P1000           865        Chicago
998 2025-12-29 21:41:37.297297296       P999           728        Chicago
997 2025-12-28 19:23:14.594594592       P998           340    Los Angeles
996 2025-12-27 17:04:51.891891888       P997           902        Phoenix
995 2025-12-26 14:46:29.189189184       P996           518        Houston


In [9]:
!pip install dash
import pandas as pd
import dash
from dash import dcc, html
import plotly.express as px
from dash.dependencies import Input, Output

# Load sales data from CSV
df = pd.read_csv('sales_data.csv', parse_dates=['date'])

# Aggregate sales by store location
sales_by_location = df.groupby('store_location')['sales_amount'].sum().reset_index()

# Initialize Dash app
app = dash.Dash(__name__)

# Define layout
app.layout = html.Div([
    html.H1("Total Sales by Store Location"),
    dcc.Graph(id="sales_bar_chart"),
])

# Define callback to update graph
@app.callback(
    Output("sales_bar_chart", "figure"),
    Input("sales_bar_chart", "id")
)
def update_graph(_):
    fig = px.bar(sales_by_location, x='store_location', y='sales_amount', title="Total Sales by Store Location")
    return fig

# Run the Dash app
if __name__ == '__main__':
    app.run_server(debug=True)




<IPython.core.display.Javascript object>

In [10]:
!pip install pymongo
import pandas as pd
import pymongo
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

# Connect to MongoDB
client = pymongo.MongoClient("mongodb+srv://tsjannoun123:KufyyNNqnno0atX9@cluster0.sb8py.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client["sales_db"]
collection = db["sales"]

# Load data from CSV
df = pd.read_csv('sales_data.csv', parse_dates=['date'])

# Insert data into MongoDB (if not already inserted)
if collection.count_documents({}) == 0:
    collection.insert_many(df.to_dict(orient="records"))

# Function to retrieve sales data from MongoDB
def get_sales_data():
    data = list(collection.find({}, {"_id": 0}))  # Exclude MongoDB ID field
    return pd.DataFrame(data)

# Dash app setup
app = Dash(__name__)

app.layout = html.Div([
    html.H1("MongoDB Sales Dashboard"),
    dcc.Dropdown(
        id="location-dropdown",
        options=[{"label": loc, "value": loc} for loc in df["store_location"].unique()],
        value=df["store_location"].unique()[0],
        clearable=False
    ),
    dcc.Graph(id="sales_chart"),
])

@app.callback(
    Output("sales_chart", "figure"),
    Input("location-dropdown", "value")
)
def update_chart(selected_location):
    sales_df = get_sales_data()
    filtered_df = sales_df[sales_df["store_location"] == selected_location]
    fig = px.line(filtered_df, x="date", y="sales_amount", title=f"Sales Trend in {selected_location}")
    return fig

if __name__ == "__main__":
    app.run_server(debug=True)




<IPython.core.display.Javascript object>

In [None]:
!pip install apache-airflow
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import pandas as pd
import pymongo
import requests

# Define default args for Airflow
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2025, 2, 1),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

dag = DAG(
    "sales_etl",
    default_args=default_args,
    schedule_interval="@daily",
)

def extract_sales_data():
    df = pd.read_csv("/path/to/sales_data.csv", parse_dates=["date"])
    df.to_csv("/tmp/sales_data_extracted.csv", index=False)

def extract_weather_data():
    weather_api_url = "https://api.weatherapi.com/v1/history.json?key=YOUR_API_KEY&q=New York&dt=2024-02-09"
    response = requests.get(weather_api_url)
    weather_data = response.json()
    df = pd.DataFrame(weather_data["forecast"]["forecastday"][0]["hour"])
    df.to_csv("/tmp/weather_data.csv", index=False)

def transform_and_load():
    sales_df = pd.read_csv("/tmp/sales_data_extracted.csv")
    weather_df = pd.read_csv("/tmp/weather_data.csv")

    sales_df["date"] = pd.to_datetime(sales_df["date"]).dt.date
    weather_df["time"] = pd.to_datetime(weather_df["time"]).dt.date

    merged_df = pd.merge(sales_df, weather_df, left_on="date", right_on="time", how="left")

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["sales_db"]
    collection = db["sales_weather"]

    collection.delete_many({})  # Clear previous data
    collection.insert_many(merged_df.to_dict(orient="records"))

extract_task = PythonOperator(task_id="extract_sales_data", python_callable=extract_sales_data, dag=dag)
weather_task = PythonOperator(task_id="extract_weather_data", python_callable=extract_weather_data, dag=dag)
transform_task = PythonOperator(task_id="transform_and_load", python_callable=transform_and_load, dag=dag)

extract_task >> weather_task >> transform_task




<Task(PythonOperator): transform_and_load>

In [None]:
import pandas as pd
import pymongo
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["sales_db"]
collection = db["sales_weather"]

# Function to retrieve merged sales & weather data
def get_combined_data():
    data = list(collection.find({}, {"_id": 0}))
    return pd.DataFrame(data)

# Dash app setup
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Sales & Weather Analysis"),
    dcc.Dropdown(
        id="store-dropdown",
        options=[{"label": loc, "value": loc} for loc in ["New York", "Los Angeles", "Chicago"]],
        value="New York",
        clearable=False
    ),
    dcc.Graph(id="sales_weather_chart"),
])

@app.callback(
    Output("sales_weather_chart", "figure"),
    Input("store-dropdown", "value")
)
def update_chart(selected_location):
    df = get_combined_data()
    filtered_df = df[df["store_location"] == selected_location]
    fig = px.scatter(filtered_df, x="sales_amount", y="temp_c", color="store_location",
                     title=f"Sales vs Temperature in {selected_location}")
    return fig

if __name__ == "__main__":
    app.run_server(debug=True)


<IPython.core.display.Javascript object>