In [2]:
import streamlit as st
import os
import pandas as pd
import altair as alt
from config.path_config import lakefs_s3_path  # your existing path configuration

# Sidebar selections
# The option labels must be exactly the strings the page-routing code below
# compares `page` against ("Dashboard" and "Machine Learning"); a label that
# matches neither comparison leaves the main area without any page content.
page = st.sidebar.selectbox("Overview", ["Dashboard", "Machine Learning"])
# Checked -> bar chart, unchecked -> line chart on the dashboard page.
use_bar_chart = st.sidebar.checkbox("Bar", key="bar")

# Loader for the dataset kept in lakeFS
def data_from_lakefs(lakefs_endpoint: str = "http://localhost:8001/"):
    """Read the Parquet data at ``lakefs_s3_path`` and return it.

    Credentials come from the ``ACCESS_KEY`` and ``SECRET_KEY`` environment
    variables; an unset variable is passed through as ``None``.

    Args:
        lakefs_endpoint: URL handed to the storage client as ``endpoint_url``.

    Returns:
        Whatever ``pd.read_parquet`` returns for ``lakefs_s3_path`` when read
        with the pyarrow engine.
    """
    access_key = os.getenv("ACCESS_KEY")
    secret_key = os.getenv("SECRET_KEY")
    s3_options = dict(
        key=access_key,
        secret=secret_key,
        client_kwargs=dict(endpoint_url=lakefs_endpoint),
    )
    return pd.read_parquet(lakefs_s3_path, storage_options=s3_options, engine="pyarrow")

# Load data
df = data_from_lakefs()

# (Optional) Show the frame as loaded, before any derived columns are added
st.subheader("Raw Data")
st.dataframe(df)

# Make sure 'postTimeRaw' is a datetime column; parse it only when it is not one yet
post_time_is_datetime = pd.api.types.is_datetime64_any_dtype(df["postTimeRaw"])
if not post_time_is_datetime:
    df["postTimeRaw"] = pd.to_datetime(df["postTimeRaw"])

# Day-level text form (YYYY-MM-DD) of postTimeRaw
df["postTimeFormatted"] = df["postTimeRaw"].dt.strftime("%Y-%m-%d")

# (Optional) Counter column: the existing index shifted up by one
df["index"] = df.index + 1

# Count the tags attached to each row.
# Data read with the pyarrow engine delivers list-typed Parquet columns as
# numpy arrays, not Python lists, so an `isinstance(x, list)` test would give
# every row a count of 0. Accept any sized list-like container instead
# (list, tuple, ndarray, set, ...) and keep 0 for strings, scalars and nulls.
def _count_tags(value) -> int:
    """Return the number of tags held in one cell, or 0 if it is not a sized list-like."""
    if pd.api.types.is_list_like(value) and hasattr(value, "__len__"):
        return len(value)
    return 0


if "tags" in df.columns:
    df["tag_count"] = df["tags"].apply(_count_tags)
else:
    # No tags column: give every row a weight of 1 so sums are plain row counts
    df["tag_count"] = 1

# Page routing: render whichever page is currently selected in the sidebar
if page == "Dashboard":
    st.title("Tweet Timeline Dashboard")

    # Per-date totals: sum tag_count for each formatted post date
    daily_totals = df.groupby("postTimeFormatted")["tag_count"].sum()
    timeline = daily_totals.reset_index().rename(
        columns={"postTimeFormatted": "Date", "tag_count": "Tweet Count"}
    )

    st.subheader("Tweet Count Over Time")

    # Both Altair variants share encodings and size; only the mark and title differ
    base = alt.Chart(timeline)
    if use_bar_chart:
        marked, chart_title = base.mark_bar(), "Bar Chart: Tweets per Day"
    else:
        marked, chart_title = base.mark_line(point=True), "Line Chart: Tweets per Day"
    chart = marked.encode(
        x=alt.X("Date:T", title="Date"),
        y=alt.Y("Tweet Count:Q", title="Tweet Count"),
    ).properties(width=700, height=400, title=chart_title)

    st.altair_chart(chart, use_container_width=True)
    st.dataframe(timeline)

elif page == "Machine Learning":
    # (Optional) Placeholder page; further ML widgets/code go here
    st.title("Machine Learning")
    st.write("Add your ML related code here.")


2025-05-14 16:40:22.415 
  command:

    streamlit run /Users/use/Documents/dsi321_2025/.venv/lib/python3.13/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [None]:
# import streamlit as st 
# import os
# import pandas as pd 
# import altair as alt
# # Import path configuration
# from config.path_config import lakefs_s3_path

# st.sidebar.selectbox("Data", ["Dashboard", "Machine Learning"], key="Overview")
# st.sidebar.checkbox("Bar", key="bar")


# def data_from_lakefs(lakefs_endpoint: str = "http://localhost:8001/"):
#     storage_options = {
#         "key": os.getenv("ACCESS_KEY"),
#         "secret": os.getenv("SECRET_KEY"),
#         "client_kwargs": {
#             "endpoint_url": lakefs_endpoint
#         }
#     }
#     df = pd.read_parquet(
#         lakefs_s3_path,
#         storage_options=storage_options,
#         engine='pyarrow',
#     )
#     return df

# df = data_from_lakefs()
# st.dataframe(df)
