In [1]:
import os
from langchain_groq import ChatGroq
from dotenv import load_dotenv

from langchain_core.tools import tool
import pandas as pd
from langgraph.prebuilt import create_react_agent
from pydantic.v1 import BaseModel, Field

from IPython.display import Markdown

In [2]:
# Pull variables from a local .env file (if one exists) into the process
# environment before the key is looked up.
load_dotenv()

# Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType"; stop early with an actionable
# message when the key is missing instead.
if os.getenv("GROQ_API_KEY") is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")

In [3]:
# Workbook settings consumed by process_config().
# The workbook path can be overridden through the MMM_MASTER_DATA environment
# variable so the notebook is not tied to a single machine; the original
# location remains the fallback, so existing runs behave as before.
config = {
    "master_data" : os.getenv(
        "MMM_MASTER_DATA",
        r"C:\Users\nigam\OneDrive\Documents\university_classes\AutoMMM\data\manual_data.xlsx",
    ),
    "sheet_name" : "master_data"
}

# Utility Functions

In [4]:
def print_stream(stream):
    """Show the newest message of every update yielded by ``stream``.

    Each update is indexed with ``["messages"]`` and only its last entry is
    shown: tuples are written with ``print``; any other object is rendered
    through its own ``pretty_print()`` method.
    """
    for update in stream:
        latest = update["messages"][-1]
        if not isinstance(latest, tuple):
            latest.pretty_print()
            continue
        print(latest)
            

In [5]:
def process_config(config: dict):
    """Read the workbook described by ``config`` into a DataFrame.

    Args:
        config: mapping with ``'master_data'`` (handed to ``pd.read_excel``
            as the file to read) and ``'sheet_name'`` (the sheet to load).

    Returns:
        dict with the single key ``'master_data'`` holding the loaded frame.
    """
    workbook = config['master_data']
    sheet = config['sheet_name']
    return {'master_data': pd.read_excel(workbook, sheet_name=sheet)}

In [6]:
# Load the workbook once; later cells read the frame from `configuration`.
configuration = process_config(config)

# Preview the first three rows of the loaded frame.
configuration['master_data'].head(3)

Unnamed: 0,date,sku,sales,units,price,oos,events,product_level_branded_clicks,product_level_branded_spends,product_level_nonbranded_clicks,product_level_nonbranded_spends,brand_level_branded_clicks,brand_level_branded_spends,brand_level_nonbranded_clicks,brand_level_nonbranded_spends,insta_clicks,insta_spends,fb_clicks,fb_spends
0,2025-06-07,sku_a,50700.046737,626,80.99049,0,0,815,1625.274077,458,1371.344205,313,1876.185014,601,3001.514994,0,0.0,0,0.0
1,2025-06-14,sku_a,49697.711906,620,80.1576,7,0,819,1654.917942,462,1395.543454,305,1226.300332,618,3720.76592,0,0.0,0,0.0
2,2025-06-21,sku_a,53299.8549,659,80.879901,0,0,813,1664.302327,455,1841.436112,313,1266.746161,610,3688.738524,0,0.0,263,1327.390544


In [7]:
# Groq chat model reused by the chains and agents defined further down.
# The trailing comment on model_name lists other model ids that were considered.
llm = ChatGroq(
    model_name="gemma2-9b-it", # llama3-70b-8192, llama-3.3-70b-versatile
    temperature=0.3
)

In [8]:
# Send a trivial prompt to confirm the model responds.
llm.invoke("hi!")

AIMessage(content='Hi! 👋  How can I help you today?\n', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 11, 'total_tokens': 25, 'completion_time': 0.025454545, 'prompt_time': 0.001934945, 'queue_time': 0.16055047, 'total_time': 0.02738949}, 'model_name': 'gemma2-9b-it', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None}, id='run--ca3ac34c-be17-4145-934f-e6d9907d0b05-0', usage_metadata={'input_tokens': 11, 'output_tokens': 14, 'total_tokens': 25})

In [9]:
# Preview the top rows of the master data (default head size).
configuration['master_data'].head()

Unnamed: 0,date,sku,sales,units,price,oos,events,product_level_branded_clicks,product_level_branded_spends,product_level_nonbranded_clicks,product_level_nonbranded_spends,brand_level_branded_clicks,brand_level_branded_spends,brand_level_nonbranded_clicks,brand_level_nonbranded_spends,insta_clicks,insta_spends,fb_clicks,fb_spends
0,2025-06-07,sku_a,50700.046737,626,80.99049,0,0,815,1625.274077,458,1371.344205,313,1876.185014,601,3001.514994,0,0.0,0,0.0
1,2025-06-14,sku_a,49697.711906,620,80.1576,7,0,819,1654.917942,462,1395.543454,305,1226.300332,618,3720.76592,0,0.0,0,0.0
2,2025-06-21,sku_a,53299.8549,659,80.879901,0,0,813,1664.302327,455,1841.436112,313,1266.746161,610,3688.738524,0,0.0,263,1327.390544
3,2025-06-28,sku_a,52605.985586,654,80.437287,0,0,809,1677.516383,453,1845.326232,313,1901.026734,608,3692.729247,0,0.0,254,1034.686231
4,2025-07-05,sku_a,53098.330692,657,80.819377,0,0,811,1703.118934,453,1857.310576,306,1254.607144,614,3745.414334,0,0.0,267,1895.706233


In [10]:
import json
from pathlib import Path
import pandas as pd
from ydata_profiling import ProfileReport


# Read the same workbook/sheet named in `config` into a standalone frame used by the profiling cells.
# NOTE(review): the recorded run of this cell stopped at the ydata_profiling import above, so this line did not execute in that run.
df = pd.read_excel(config['master_data'],sheet_name=config['sheet_name'])

ModuleNotFoundError: No module named 'ydata_profiling'

In [23]:
# Build a profiling report restricted to the rows whose sku is 'sku_a'.
profile = ProfileReport(df[df['sku'] == 'sku_a'])

In [None]:
# Display the report object as the cell output.
profile

In [None]:
# Profile the full frame and attach a human-readable description to every
# column. Several descriptions had been copy-pasted from neighbouring rows and
# contradicted their column names (brand-level columns described as
# "product based", non-branded columns described as "Branded", sales described
# as units); they now match the columns they document. Stray tab characters
# inside the dict entries were removed as well.
profile = df.profile_report(
    variables={
        "descriptions": {
            "date" : "Date column",
            "sku" : "product identification number (stock keeping unit)",
            "sales" : "revenue generated by the units sold (in euros)",
            "units" : "Number of units sold",
            "price" : "average price of the product in the week",
            "oos" : "number of days the product was out of stock in a week",
            "events" : "number of events in a week",
            "product_level_branded_clicks" : "clicks generated by product based search in a week (Branded keywords) ",
            "product_level_branded_spends" : "amount of money spent on clicks generated by product based search in the week (Branded keywords)",
            "product_level_nonbranded_clicks" : "clicks generated by product based search in the week (Non - Branded keywords) ",
            "product_level_nonbranded_spends" : "amount of money spent on clicks generated by product based search in the week (Non - Branded keywords) ",
            "brand_level_branded_clicks" : "clicks generated by brand based search in the week (Branded keywords) ",
            "brand_level_branded_spends" : "amount of money spent on clicks generated by brand based search in the week (Branded keywords) ",
            "brand_level_nonbranded_clicks" : "clicks generated by brand based search in the week (Non - Branded keywords) ",
            "brand_level_nonbranded_spends" : "amount of money spent on clicks generated by brand based search in the week (Non - Branded keywords) ",
            "insta_clicks" : "clicks generated on instagram advertizement in the week.",
            "insta_spends" : "amount of money spent on clicks generated on instagram advertizement in the week.",
            "fb_clicks" : "clicks generated on facebook advertizement in the week.",
            "fb_spends" : "amount of money spent on clicks generated on facebook advertizement in the week."
                }
    }
    )

# Persist the report as HTML in the working directory.
profile.to_file(Path("stata_auto_report.html"))

In [None]:
# Write the current report to report.html in the working directory.
profile.to_file('report.html')

In [None]:
# Keep the report's JSON form in memory; no later visible cell reads `json_data`.
json_data = profile.to_json()

In [None]:
# Write the current report to your_report.json in the working directory.
profile.to_file("your_report.json")

In [52]:
@tool
def load_dataframe(df=None):
    """Load the dataframe in the context"""
    # The docstring above is left verbatim because it doubles as the tool
    # description shown to the model.
    # `df` is never read: the tool always hands back the frame loaded into
    # `configuration`. It now defaults to None so a tool call without
    # arguments is accepted instead of forcing the model to invent a value.
    return configuration['master_data']

In [53]:
from urllib.parse import quote
from typing import Annotated, List
from typing_extensions import TypedDict
import operator
from langgraph.constants import Send
from langgraph.graph import MessagesState
from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage

In [54]:
@tool
def QuickChartTool(chart_config):
    """Generates a chart image URL from the provided Chart.js configuration."""
    # The docstring above is left verbatim because it doubles as the tool
    # description shown to the model.
    import json  # local import keeps this cell runnable on its own

    # Strings are used as-is. Dicts/lists are serialised as JSON, because
    # str() would emit Python literals (True/False/None, single quotes) rather
    # than a JSON document; default=str keeps unusual values from raising.
    if isinstance(chart_config, str):
        chart_config_str = chart_config
    else:
        chart_config_str = json.dumps(chart_config, default=str)

    url = f"https://quickchart.io/chart?c={quote(chart_config_str)}"
    # Previously the URL was built and then dropped, so the tool returned None.
    return url

In [55]:
# Time vs clicks
# spends table

In [75]:
@tool
def compute_summary_statistics(data : pd.DataFrame, columns: list) -> str:
    """Compute summary statistics for specified columns in the dataset."""
    # Select the requested columns, describe them, and hand back the nested
    # dict of statistics in its str() form.
    selected = data[columns]
    return str(selected.describe().to_dict())

In [None]:
# Chat model with compute_summary_statistics bound as its only tool; used by the three writer nodes below.
llm_with_tool = llm.bind_tools([compute_summary_statistics])

In [58]:
# Structured-output schema for the executive-summary part of the report;
# write_summary() requests it through with_structured_output(Summary).
# Both fields are free-text strings whose Field descriptions tell the model
# what to put in them.
class Summary(BaseModel):
    purpose : str = Field(description="Summarize key findings from the raw data review to ensure data readiness for Market Mix Modeling, for the given data")
    observations : str = Field(description="""
                               Overview of data quality, completeness, and potential issues.
                               High-level insights on correlations, trends, outliers, and anomalies.
                               Recommendations for data preparation and treatment, for the given data.""")
    

    

In [77]:
# Structured-output schema for the data-overview part of the report;
# write_overview() requests it through with_structured_output(Overview).
# All three fields are free-text strings described by their Field descriptions.
class Overview(BaseModel):
    data_sources : str = Field(description="""
                               List of data sources (e.g., sales data, media spend, macroeconomic indicators).
                               Time period covered (e.g., January 2020 - December 2024).
                               Granularity (e.g., weekly, monthly, regional).""")
    observations : str = Field(description="""
                               **Target Variable**: Description of the target KPI (e.g., sales revenue, units sold).
                               **Independent Variables**: Media spends (TV, digital, print), promotions, pricing, macroeconomic factors, etc.""")
    
    data_volume : str = Field(description="Number of observations, variables, and records per product/region.")

In [78]:
# Structured-output schema for the descriptive-statistics part of the report;
# write_descriptive_stats() requests it through
# with_structured_output(DescriptiveStats).
# All three fields are free-text strings described by their Field descriptions.
class DescriptiveStats(BaseModel):
    summary_stats : str = Field(description="""
                                Mean, median, standard deviation, min, max for key variables (e.g., sales, media spends).
                                Distribution characteristics (e.g., skewness, kurtosis).""")
    market_share_analysis : str = Field(description="""
                                Market share by product/brand over time (percentage of total sales).
                                Visual: Pie chart or stacked bar chart showing market share distribution.""")
    
    spend_distribution : str = Field(description="""
                                Breakdown of media and promotional spends by channel (e.g., TV, digital, radio) in tabular form.
                                Visual: Bar chart or histogram of spend distribution across channels.
                                Insights: Identify dominant channels and variability in spend patterns.""")

In [79]:
# State carried through the parallel report graph.
# NOTE: a later cell defines another TypedDict that is also named `State`.
class State(TypedDict):
    data : pd.DataFrame  # input frame supplied when the workflow is invoked
    combined_summary : str  # Markdown produced by write_summary
    combined_overview : str  # Markdown produced by write_overview
    combined_stats : str  # Markdown produced by write_descriptive_stats
    combined : str  # concatenation produced by aggregator

In [80]:
def write_summary(state: State):
    """Ask the model for an executive summary of ``state['data']``.

    The structured ``Summary`` reply is flattened into a short Markdown
    snippet with one heading per field.

    Args:
        state: graph state; only ``state['data']`` is read.

    Returns:
        dict with the single key ``'combined_summary'`` (Markdown text).
    """
    # NOTE(review): the frame is embedded through its string form, which
    # pandas may abbreviate for large frames - confirm the model sees enough
    # rows for a meaningful summary.
    summary_report = llm_with_tool.with_structured_output(Summary).invoke(f"use appropriate tool to write a detailed summary for the data : data : {state['data']}")
    purpose = summary_report.purpose
    observations = summary_report.observations
    # The second heading used to be misspelled "obeservations" in the rendered
    # report. Spacing (trailing blanks, final indented line) is kept exactly
    # as before; it is spelled out explicitly so it cannot be lost in editing.
    combined_summary = (
        "\n"
        "## purpose \n"
        f"{purpose} \n"
        "## observations \n"
        f"{observations}\n"
        "                    "
    )
    return {'combined_summary' : combined_summary}

In [81]:
def write_overview(state: State):
    """Produce the data-overview Markdown for ``state['data']``.

    The frame's string form is sent to the model as the whole prompt, the
    reply is parsed into ``Overview``, and its three fields are laid out
    under one heading each.

    Returns:
        dict with the single key ``'combined_overview'``.
    """
    structured_llm = llm_with_tool.with_structured_output(Overview)
    report = structured_llm.invoke(f"{state['data']}")
    sources = report.data_sources
    notes = report.observations
    volume = report.data_volume
    # Same text as a triple-quoted block, written piecewise so the trailing
    # blanks and the final indented line stay visible.
    markdown = (
        "\n"
        "## data_sources \n"
        f"{sources} \n"
        "## observations \n"
        f"{notes}\n"
        "## data_volume \n"
        f"{volume}\n"
        "                    "
    )
    return {'combined_overview': markdown}

In [82]:
def write_descriptive_stats(state: State):
    """Produce the descriptive-statistics Markdown for ``state['data']``.

    The frame's string form is sent to the model as the whole prompt, the
    reply is parsed into ``DescriptiveStats``, and its three fields are laid
    out under one heading each.

    Returns:
        dict with the single key ``'combined_stats'``.
    """
    structured_llm = llm_with_tool.with_structured_output(DescriptiveStats)
    report = structured_llm.invoke(f"{state['data']}")
    stats = report.summary_stats
    share = report.market_share_analysis
    spend = report.spend_distribution
    # Same text as a triple-quoted block, written piecewise so the trailing
    # blanks and the final indented line stay visible.
    markdown = (
        "\n"
        "## summary_stats \n"
        f"{stats} \n"
        "## market_share_analysis \n"
        f"{share}\n"
        "## spend_distribution \n"
        f"{spend}\n"
        "                    "
    )
    return {'combined_stats': markdown}

In [83]:
def aggregator(state: State):
    """Concatenate the three report parts into one text.

    Reads ``combined_summary``, ``combined_overview`` and ``combined_stats``
    from ``state`` (in that order), prefixes each with its own key name, and
    returns the result under ``'combined'``.
    """
    pieces = ["Here is the combined output : \n"]
    for key in ("combined_summary", "combined_overview", "combined_stats"):
        pieces.append(f"{key}: \n {state[key]}")
    return {'combined': "".join(pieces)}

In [84]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

In [None]:
# Build the parallel report graph over `State`.
parallel_builder = StateGraph(State)

# One node per report part, plus the node that merges their outputs.
parallel_builder.add_node("write_summary",write_summary)
parallel_builder.add_node("write_overview",write_overview)
parallel_builder.add_node("write_descriptive_stats",write_descriptive_stats)
parallel_builder.add_node("aggregator",aggregator)


# Fan-out: every writer node is entered directly from START.
parallel_builder.add_edge(START, "write_summary")
parallel_builder.add_edge(START, "write_overview")
parallel_builder.add_edge(START, "write_descriptive_stats")

# Fan-in: every writer node feeds the aggregator.
parallel_builder.add_edge("write_summary", "aggregator")
parallel_builder.add_edge("write_overview", "aggregator")
parallel_builder.add_edge("write_descriptive_stats", "aggregator")

parallel_builder.add_edge("aggregator",END)

parallel_workflow = parallel_builder.compile()

# Render the compiled graph as a Mermaid PNG in the cell output.
display(Image(parallel_workflow.get_graph().draw_mermaid_png()))

In [86]:
def print_stream(stream):
    """Print one value per streamed update.

    Each update is treated as a mapping of mappings: the value under its
    first key is taken, and from that mapping the value under its first key
    is printed. When this cell runs, this definition replaces the earlier
    ``print_stream`` from the utility section.
    """
    for update in stream:
        outer_key = [*update.keys()][0]
        payload = update[outer_key]
        inner_key = [*payload.keys()][0]
        print(payload[inner_key])
            

In [87]:
# Run the parallel workflow once on the master data and keep the resulting state.
output = parallel_workflow.invoke({"data":configuration['master_data']})

In [None]:
# Render the summary part of the result as Markdown.
Markdown(output['combined_summary'])

In [None]:
# Stream the workflow and print one value per update via print_stream.
print_stream(parallel_workflow.stream({"data":configuration['master_data']}))

In [None]:
# Inspect the raw updates streamed by the parallel workflow.
# The original cell could not run: the `for` header had no iterable
# (SyntaxError). The iterable below is the same stream call used by the
# neighbouring cells. A leftover pdb.set_trace() and an expression that
# indexed the node payload with the node's own name were dropped.
for s in parallel_workflow.stream({"data":configuration['master_data']}):
    # Full update, as yielded by the stream.
    print(s)

    # Payload stored under the update's first key.
    d = s[list(s.keys())[0]]

    # First value inside that payload.
    print(d[list(d.keys())[0]])

In [None]:
# Stream the workflow once more, printing one value per update via print_stream.
print_stream(parallel_workflow.stream({"data":configuration['master_data']}))
    # s[list(s.keys())[0]]
# inputs = {"messages": [("user", "explain the data")]}
# print_stream(data_analysis_agent.stream(inputs, stream_mode="values"))

In [None]:
# Render the aggregated text produced by the aggregator node as Markdown.
Markdown(output['combined'])

In [172]:
# One planned report section: a title plus a free-text description.
class Section(BaseModel):
    name: str = Field(description="The title of the section of the report")
    description : str = Field(description="The detailed explaination of the observed or analysed number")

# Container the planner is asked to return: the full list of planned sections.
class Sections(BaseModel):
    sections : list[Section] = Field(description="list of all the sections")

# Plain chat model (no tools bound) constrained to reply with a `Sections` object.
planner = llm.with_structured_output(Sections)

In [173]:
# State of the orchestrator/worker graph.
# NOTE: this rebinds the name `State` that an earlier cell used for the
# parallel workflow's state.
class State(TypedDict):
    topic: str  # text handed to the planner by orchestrator()
    sections : List[Section]  # plan returned by orchestrator()
    # Annotated with operator.add; presumably the graph uses it to concatenate
    # the lists returned by the individual `agents` runs - TODO confirm.
    completed_sections : Annotated[
        List, operator.add
    ]
    final_report : str  # joined text produced by synthesizer()

# Per-worker state received by agents(): the single section to write.
class AgentState(TypedDict):
    section : Section  # section handed over by agent_handler()
    # Same operator.add annotation as State.completed_sections.
    completed_sections: Annotated[
        List, operator.add
        ]


def orchestrator(state: State):
    """Ask the planner to split ``state['topic']`` into report sections.

    Returns:
        dict with the single key ``'sections'`` taken from the planner reply.
    """
    system_msg = SystemMessage(content = "Plan the report into subsections based on the topic.")
    human_msg = HumanMessage(content = f"Here is the topic : {state['topic']}")
    plan = planner.invoke([system_msg, human_msg])
    return {'sections': plan.sections}

def agents(state: AgentState):
    """Write one report section with the chat model.

    The section's name and description are sent as the human message; the
    text of the reply is returned as a one-element list under
    ``'completed_sections'``.
    """
    section = state['section']
    messages = [
        SystemMessage(content = "Generate a report section without preamble base on the given topic"),
        HumanMessage(content = f"The name : {section.name}, and the description is : {section.description}"),
    ]
    reply = llm.invoke(messages)
    return {"completed_sections": [reply.content]}

def agent_handler(state: State):
    """Create one ``Send`` to the "agents" node for each planned section."""
    dispatches = []
    for planned in state['sections']:
        dispatches.append(Send("agents", {"section": planned}))
    return dispatches

def synthesizer(state: State):
    """Join all completed sections into the final report text.

    Sections are separated by a ``---`` line surrounded by blank lines; the
    result is returned under ``'final_report'``.
    """
    separator = "\n\n --- \n\n"
    return {'final_report': separator.join(state['completed_sections'])}

In [26]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

In [None]:
# Build the orchestrator/worker graph over the second `State` definition.
orchestrator_flow_builder = StateGraph(State)

orchestrator_flow_builder.add_node("orchestrator",orchestrator)
orchestrator_flow_builder.add_node("agents",agents)
orchestrator_flow_builder.add_node("synthesizer",synthesizer)

orchestrator_flow_builder.add_edge(START, 'orchestrator')
# After planning, agent_handler decides what is sent to the "agents" node
# (it returns one Send per planned section).
orchestrator_flow_builder.add_conditional_edges(
    'orchestrator',
    agent_handler,
    ["agents"]
)
orchestrator_flow_builder.add_edge("agents","synthesizer")
orchestrator_flow_builder.add_edge("synthesizer",END)


orchestrator_workflow = orchestrator_flow_builder.compile()

# Render the compiled graph as a Mermaid PNG in the cell output.
display(Image(orchestrator_workflow.get_graph().draw_mermaid_png()))

In [176]:
prompt = """
You are a senior data analyst specializing in Marketing Mix Modeling (MMM).
Your expertise includes:

- Decomposing sales into base, incremental, and external drivers
- ROI analysis of marketing levers (TV, digital, price cuts, etc.)
- Trend, seasonality, and anomaly detection
- Communicating insights clearly to business and marketing stakeholders
- Visualizing insights using QuickChart (https://quickchart.io/)
- Producing accurate reports in LaTeX format, suitable for PDF compilation


## Report Format 

### Market Mix Modeling: Pre-Modeling Raw Data Review Report

#### 1. Executive Summary
- **Purpose**: Summarize key findings from the raw data review to ensure data readiness for Market Mix Modeling.
- **Key Observations**:
Overview of data quality, completeness, and potential issues.
High-level insights on correlations, trends, outliers, and anomalies.
Recommendations for data preparation and treatment.

#### 2. Data Overview
- **Data Sources**:
List of data sources (e.g., sales data, media spend, macroeconomic indicators).
Time period covered (e.g., January 2020 - December 2024).
Granularity (e.g., weekly, monthly, regional).
- **Key Variables**:
**Target Variable**: Description of the target KPI (e.g., sales revenue, units sold).
**Independent Variables**: Media spends (TV, digital, print), promotions, pricing, macroeconomic factors, etc.
- **Data Volume**:
Number of observations, variables, and records per product/region.

#### 3. Descriptive Statistics
- **Summary Statistics**:
Mean, median, standard deviation, min, max for key variables (e.g., sales, media spends).
Distribution characteristics (e.g., skewness, kurtosis).
- **Market Share Analysis**:
Market share by product/brand over time (percentage of total sales).
Visual: Pie chart or stacked bar chart showing market share distribution.
- **Spend Distribution**:
Breakdown of media and promotional spends by channel (e.g., TV, digital, radio).
Visual: Bar chart or histogram of spend distribution across channels.
Insights: Identify dominant channels and variability in spend patterns.

#### 4. Correlation Analysis
- **Inter-KPI Correlation**:
Correlation matrix of independent variables (e.g., TV spend vs. digital spend).
Visual: Heatmap of correlations to identify multicollinearity risks.
Insights: Highlight high correlations (>0.7) that may require variable reduction or transformation.
- **Correlation with Target**:
Correlation coefficients between each independent variable and the target (e.g., sales).
Visual: Bar chart ranking variables by correlation strength.
Insights: Identify key drivers and weak predictors.

#### 5. Trend Analysis
- **Time Series Trends**:
Trends for target variable and key independent variables over time.
Visual: Line charts showing trends for sales, media spends, and promotions.
Insights: Identify seasonality, cyclical patterns, or structural breaks.
- **Product-Level Trends**:
Trends in sales and market share by product.
Visual: Stacked line charts or area charts for product-level trends.

#### 6. Outlier Identification
- **Methodology**:
Statistical methods used (e.g., Z-score, IQR, or domain-specific thresholds).
Example: Z-score > 3 or < -3 for sales or spend data.
- **Findings**:
List of outliers by variable (e.g., unusually high TV spend in Q3 2023).
Visual: Box plots or scatter plots highlighting outliers.
- **Impact Assessment**:
Potential impact of outliers on model performance.
Recommendations: Winsorization, capping, or removal.

#### 7. Missing Data Analysis
- **Missing Data Summary**:
Percentage of missing values by variable and time period.
Visual: Heatmap or bar chart of missing data patterns.
- **Patterns**:
Random vs. systematic missingness (e.g., missing digital spend data for specific regions).
- **Recommendations**:
Imputation methods (e.g., mean/median imputation, time-series interpolation).
Exclusion of variables with excessive missingness (>30%).

#### 8. Anomaly Detection
- **Methodology**:
Techniques used (e.g., statistical thresholds, clustering, or machine learning-based anomaly detection).
- **Findings**:
Specific anomalies (e.g., sudden sales spike in a region unrelated to marketing activity).
Visual: Time-series plots with flagged anomalies.
- **Potential Causes**:
Data entry errors, external events, or untracked campaigns.
- **Recommendations**:
Investigate anomalies with stakeholders.
Adjust data or include dummy variables for known events.

#### 9. Data Quality Issues and Treatment Suggestions
- **Summary of Issues**:
Multicollinearity, outliers, missing data, anomalies, or inconsistent granularity.
- **Proposed Treatments**:
**Multicollinearity**: Combine correlated variables or use PCA.
**Outliers**: Winsorize, cap, or remove based on domain knowledge.
**Missing Data**: Impute using time-series methods or exclude problematic variables.
**Anomalies**: Add dummy variables for known events or remove erroneous records.
**Normalization**: Scale variables (e.g., log transformation for skewed spends).
- **Next Steps**:
Validate treatments with stakeholders.
Prepare cleaned dataset for modeling.

#### 10. Appendix
- **Data Dictionary**:
Definitions and sources for all variables.
- **Additional Visualizations**:
Detailed charts or tables (e.g., raw data samples, additional trend plots).
- **Technical Notes**:
Software/tools used (e.g., Python, R, Excel for analysis).
Code snippets for key analyses (e.g., correlation matrix, outlier detection).

## Instructions

1. Data Exploration Phase
   - Load dataset using `load_dataframe`
   - Identify the dependent variable (e.g., sales)
   - Categorize independent variables as base, incremental, or external

2. Visualization Phase (Use QuickChart)
   - Generate chart URLs using the `QuickChartTool` by passing Chart.js JSON configs.
   - Embed these charts in LaTeX using the following syntax:
     \\
     \\begin{figure}[H]
     \\centering
     \\includegraphics[width=\\linewidth]{{<chart_url>}}
     \\caption{<caption_text>}
     \\end{figure}
   - Use these charts where relevant:
     - Line chart for sales trends over time
     - Bar chart for driver contributions
     - Pie chart for driver share breakdown
     - Heatmap-like correlation matrix (mock with bar/stacked bar if needed)
     - Scatter charts for bivariate analysis (optional with annotations)
     - Bar chart of missing values by column

3. Insight Generation Phase
   - Explain all chart outputs clearly in LaTeX text
   - Identify missing values, correlations, lag/adstock effects
   - Quantify driver impact
   - Highlight anomalies (peaks, dips, or seasonal trends)
   - Recommend treatment for missing data and campaign gaps

4. Report Writing Phase
   - Use LaTeX sectioning commands (\\section, \\subsection, etc.) to structure the report
   - Embed charts as figures with captions as shown above
   - Avoid hallucination or assumption — base all insights on real data
   - Provide concise bullet points using LaTeX itemize environment

## Expected Output

\\section*{Executive Summary}
- Key sales trends
- Major drivers (TV, price, etc.)
- Seasonal effects or spikes
- Anomalies or potential data issues

\\section*{Variable Classification}
- Table or list of variables: base, incremental, external with reasoning

\\section*{Data Summary}
- Describe each column and its role
- Mention missing data:
  - Count & % per column
  - Likely causes (e.g., campaign gaps, bad logging)
  - Recommended treatment (e.g., fill with 0, interpolate, drop)

\\section*{Univariate Analysis}
- Distribution patterns for each variable
- Include bar or pie charts with QuickChart embedded as figures

\\section*{Bivariate & Correlation Analysis}
- Explain relationships between drivers and sales
- Describe strength, direction, and potential multicollinearity

\\section*{Adstock & Lag Effects}
- Describe persistence/memory effects of media
- Explain where diminishing returns or delay patterns exist

\\section*{Outlier & Peak Analysis}
- Describe any anomalies
- Hypothesize missed events, overspend, or missing data

\\section*{Business Recommendations}
- Spend allocation advice (e.g., increase digital in Q2)
- Suggestions for data quality improvement
- Notes for modeling enhancements (e.g., separate halo/cannibal effects)

---
Analysis conducted by: MMM Agent \\\\
Report generated on: {current_date}

Use the following tools as needed: {tools}

Question: {input}
Thought: {agent_scratchpad}
Action: {action}
Action Input: {action_input}
Observation: {observation}
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: {final_answer}

"""


In [182]:
prompt = f"""

context : {configuration['master_data']}

## Market Mix Modeling: Pre-Modeling Raw Data Review Report

#### 1. Executive Summary
- **Purpose**: Summarize key findings from the raw data review to ensure data readiness for Market Mix Modeling.
- **Key Observations**:
Overview of data quality, completeness, and potential issues.
High-level insights on correlations, trends, outliers, and anomalies.
Recommendations for data preparation and treatment.

#### 2. Data Overview
- **Data Sources**:
List of data sources (e.g., sales data, media spend, macroeconomic indicators).
Time period covered (e.g., January 2020 - December 2024).
Granularity (e.g., weekly, monthly, regional).
- **Key Variables**:
**Target Variable**: Description of the target KPI (e.g., sales revenue, units sold).
**Independent Variables**: Media spends (TV, digital, print), promotions, pricing, macroeconomic factors, etc.
- **Data Volume**:
Number of observations, variables, and records per product/region.

#### 3. Descriptive Statistics
- **Summary Statistics**:
Mean, median, standard deviation, min, max for key variables (e.g., sales, media spends).
Distribution characteristics (e.g., skewness, kurtosis).
- **Market Share Analysis**:
Market share by product/brand over time (percentage of total sales).
Visual: Pie chart or stacked bar chart showing market share distribution.
- **Spend Distribution**:
Breakdown of media and promotional spends by channel (e.g., TV, digital, radio).
Visual: Bar chart or histogram of spend distribution across channels.
Insights: Identify dominant channels and variability in spend patterns.

#### 4. Correlation Analysis
- **Inter-KPI Correlation**:
Correlation matrix of independent variables (e.g., TV spend vs. digital spend).
Visual: Heatmap of correlations to identify multicollinearity risks.
Insights: Highlight high correlations (>0.7) that may require variable reduction or transformation.
- **Correlation with Target**:
Correlation coefficients between each independent variable and the target (e.g., sales).
Visual: Bar chart ranking variables by correlation strength.
Insights: Identify key drivers and weak predictors.

#### 5. Trend Analysis
- **Time Series Trends**:
Trends for target variable and key independent variables over time.
Visual: Line charts showing trends for sales, media spends, and promotions.
Insights: Identify seasonality, cyclical patterns, or structural breaks.
- **Product-Level Trends**:
Trends in sales and market share by product.
Visual: Stacked line charts or area charts for product-level trends.

#### 6. Outlier Identification
- **Methodology**:
Statistical methods used (e.g., Z-score, IQR, or domain-specific thresholds).
Example: Z-score > 3 or < -3 for sales or spend data.
- **Findings**:
List of outliers by variable (e.g., unusually high TV spend in Q3 2023).
Visual: Box plots or scatter plots highlighting outliers.
- **Impact Assessment**:
Potential impact of outliers on model performance.
Recommendations: Winsorization, capping, or removal.

#### 7. Missing Data Analysis
- **Missing Data Summary**:
Percentage of missing values by variable and time period.
Visual: Heatmap or bar chart of missing data patterns.
- **Patterns**:
Random vs. systematic missingness (e.g., missing digital spend data for specific regions).
- **Recommendations**:
Imputation methods (e.g., mean/median imputation, time-series interpolation).
Exclusion of variables with excessive missingness (>30%).

#### 8. Anomaly Detection
- **Methodology**:
Techniques used (e.g., statistical thresholds, clustering, or machine learning-based anomaly detection).
- **Findings**:
Specific anomalies (e.g., sudden sales spike in a region unrelated to marketing activity).
Visual: Time-series plots with flagged anomalies.
- **Potential Causes**:
Data entry errors, external events, or untracked campaigns.
- **Recommendations**:
Investigate anomalies with stakeholders.
Adjust data or include dummy variables for known events.

#### 9. Data Quality Issues and Treatment Suggestions
- **Summary of Issues**:
Multicollinearity, outliers, missing data, anomalies, or inconsistent granularity.
- **Proposed Treatments**:
**Multicollinearity**: Combine correlated variables or use PCA.
**Outliers**: Winsorize, cap, or remove based on domain knowledge.
**Missing Data**: Impute using time-series methods or exclude problematic variables.
**Anomalies**: Add dummy variables for known events or remove erroneous records.
**Normalization**: Scale variables (e.g., log transformation for skewed spends).
- **Next Steps**:
Validate treatments with stakeholders.
Prepare cleaned dataset for modeling.

#### 10. Appendix
- **Data Dictionary**:
Definitions and sources for all variables.
- **Additional Visualizations**:
Detailed charts or tables (e.g., raw data samples, additional trend plots).
- **Technical Notes**:
Software/tools used (e.g., Python, R, Excel for analysis).
Code snippets for key analyses (e.g., correlation matrix, outlier detection)."""

In [None]:
# Run the orchestrator workflow with the current `prompt` as the topic and show the resulting state.
state = orchestrator_workflow.invoke({"topic": prompt})
state

In [31]:
from IPython.display import Markdown

# Markdown(state['final_report'])

In [28]:
# Prebuilt ReAct-style agent over the chat model with the two notebook tools.
# NOTE(review): two different cells assign `prompt`; the agent receives
# whichever value is bound when this cell runs.
data_analysis_agent = create_react_agent(
    model = llm,
    tools = [load_dataframe, QuickChartTool],
    prompt = prompt,
    name = "data_analysis_agent",
)

In [29]:
#prompt
# task
# context
# example
# persona
# format
# tone

In [None]:
# Single user message asking the agent to explain the data; invoked directly (the streaming variant is commented out).
inputs = {"messages": [("user", "explain the data")]}
# print_stream(data_analysis_agent.stream(inputs, stream_mode="values"))
data_analysis_agent.invoke(inputs)

In [None]:
# presumably the number of weekly observations to cover (104 weeks) - TODO confirm; not referenced by any other visible cell.
no_of_weeks = 104

In [1]:
# #TODO: 
# 1. RAG load - https://www.latentview.com/wp-content/uploads/2019/08/Marketing-Mix-Model.pdf
# 2. Generate prompt
# 3. react agent to dataAnalysis
# 4. crewai


In [None]:
from crewai_tools import RagTool

# Create a RAG tool with default settings
rag_tool = RagTool()

# Add content from a file
# NOTE(review): the path below is a placeholder and must point at a real document before this cell is run.
rag_tool.add(data_type="file", path="path/to/your/document.pdf")

# Add content from a web page
# rag_tool.add(data_type="web_page", url="https://example.com")
# https://develop.nielsen.com/wp-content/uploads/sites/2/2019/04/marketing-mix-modeling-what-marketers-need-to-know.pdf

# Define an agent with the RagTool
# NOTE(review): neither `agent` nor `Agent` is imported in any visible cell, so
# this cell raises NameError as written. The `self` parameter and the
# `self.agents_config` lookup also suggest the function was meant to live
# inside a class - TODO confirm and move/import accordingly.
@agent
def knowledge_expert(self) -> Agent:
    '''
    This agent uses the RagTool to answer questions about the knowledge base.
    '''
    # Agent settings come from self.agents_config["knowledge_expert"];
    # delegation is switched off and rag_tool is the only tool attached.
    return Agent(
        config=self.agents_config["knowledge_expert"],
        allow_delegation=False,
        tools=[rag_tool]
    )