In [57]:
# Code in cell 1
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import openai

# Notebook-wide state shared between the step cells.
# `data` is filled in by main() in this cell; `analysis` and `visualizations`
# are rebound to a dict and a list by the later step cells.
data = None  # Initialize global variable
analysis = None  # lowercase to match the name every later cell reads (was `Analysis`)
visualizations = None

def load_data_colab():
    """Prompt for a file upload through Colab and report the uploaded file's name.

    Returns:
        The name of the first uploaded file, or ``None`` (after printing a
        notice) when the upload result is empty.
    """
    # Function-scope import: google.colab is only looked up when this runs.
    from google.colab import files

    print("Please upload your dataset:")
    received = files.upload()  # Opens the upload prompt
    if not len(received):
        print("No file uploaded.")
        return None

    # Only the first entry is used; any additional uploads are ignored.
    return next(iter(received.keys()))

def main():
    """Step 1: upload a CSV in Colab and load it into the global ``data``.

    Asks ``load_data_colab()`` for a file name, reads that file with
    ``pd.read_csv`` (latin-1 encoding) into the module-level ``data``
    variable, then prints the frame's ``info()`` report and first rows.
    Returns ``None`` in every case; on a missing upload or a read failure it
    returns early (printing the error for the latter).
    """
    global data  # Declare `data` as global to share it between steps

    # Load the dataset in Colab
    input_file = load_data_colab()
    if not input_file:
        return

    # Load the dataset
    try:
        data = pd.read_csv(input_file, encoding='latin-1')  # or 'cp1252'
        print(f"Successfully loaded '{input_file}' with {data.shape[0]} rows and {data.shape[1]} columns.")
    except Exception as e:
        print(f"Error loading the file: {e}")
        return

    # Basic dataset overview
    print("Dataset Overview:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() emitted a stray "None" line after the report.
    data.info()
    print(data.head())

# Entry point for Step 1: run main() when this code executes as the main module.
if __name__ == "__main__":
    main()

Please upload your dataset:


Saving happiness.csv to happiness.csv
Successfully loaded 'happiness.csv' with 2363 rows and 11 columns.
Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2363 entries, 0 to 2362
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2363 non-null   object 
 1   year                              2363 non-null   int64  
 2   Life Ladder                       2363 non-null   float64
 3   Log GDP per capita                2335 non-null   float64
 4   Social support                    2350 non-null   float64
 5   Healthy life expectancy at birth  2300 non-null   float64
 6   Freedom to make life choices      2327 non-null   float64
 7   Generosity                        2282 non-null   float64
 8   Perceptions of corruption         2238 non-null   float64
 9   Positive affect                   2339 non-null   float64
 10  Negative 

In [58]:
 analysis = {}  # module-level dict that analyze_data() fills in and returns
def analyze_data(data):
    """
    Profile ``data`` and record the results in the module-level ``analysis`` dict.

    The entries written are ``shape``, ``data_types``, ``summary_statistics``
    (``describe(include="all")``), ``missing_values`` (null count per column)
    and ``data_preview`` (first five rows as records). The same module-level
    dict is returned.
    """
    # (key, computation) pairs; each result is stored as soon as it is
    # computed, in this order.
    profile_steps = (
        ("shape", lambda frame: frame.shape),
        ("data_types", lambda frame: frame.dtypes.to_dict()),
        ("summary_statistics", lambda frame: frame.describe(include="all").to_dict()),
        ("missing_values", lambda frame: frame.isnull().sum().to_dict()),
        ("data_preview", lambda frame: frame.head(5).to_dict(orient="records")),
    )
    for key, compute in profile_steps:
        analysis[key] = compute(data)

    return analysis


def main(data):
    """Step 2: run ``analyze_data`` on ``data`` and print the resulting summary.

    Prints a notice and returns immediately when ``data`` is ``None``.
    """
    if data is None:
        print("No data provided for analysis.")
        return

    # Perform basic analysis
    report = analyze_data(data)

    # Print analysis summary
    print("\n--- Dataset Analysis ---")
    headline_fields = (
        ("Shape", "shape"),
        ("Data Types", "data_types"),
        ("Missing Values", "missing_values"),
    )
    for label, key in headline_fields:
        print(f"{label}: {report[key]}")

    print("\nSummary Statistics:")
    stats_table = pd.DataFrame(report["summary_statistics"])
    print(stats_table)

    print("\nData Preview:")
    for record in report["data_preview"]:
        print(record)

# Continuation of Step 1
if __name__ == "__main__":
    # Check for `data` explicitly rather than wrapping the whole call in
    # `except NameError`: a NameError raised inside main() would otherwise be
    # reported as a missing dataset and its traceback would be lost.
    if "data" in globals():
        main(data)  # Use the `data` variable from Step 1
    else:
        print("Error: No dataset loaded. Please ensure Step 1 has been executed, and the `data` variable exists.")



--- Dataset Analysis ---
Shape: (2363, 11)
Data Types: {'Country name': dtype('O'), 'year': dtype('int64'), 'Life Ladder': dtype('float64'), 'Log GDP per capita': dtype('float64'), 'Social support': dtype('float64'), 'Healthy life expectancy at birth': dtype('float64'), 'Freedom to make life choices': dtype('float64'), 'Generosity': dtype('float64'), 'Perceptions of corruption': dtype('float64'), 'Positive affect': dtype('float64'), 'Negative affect': dtype('float64')}
Missing Values: {'Country name': 0, 'year': 0, 'Life Ladder': 0, 'Log GDP per capita': 28, 'Social support': 13, 'Healthy life expectancy at birth': 63, 'Freedom to make life choices': 36, 'Generosity': 81, 'Perceptions of corruption': 125, 'Positive affect': 24, 'Negative affect': 16}

Summary Statistics:
       Country name         year  Life Ladder  Log GDP per capita  \
count          2363  2363.000000  2363.000000         2335.000000   
unique          165          NaN          NaN                 NaN   
top       

In [59]:
import matplotlib.pyplot as plt
import seaborn as sns
visualizations = []  # module-level list that create_visualizations() appends saved file names to
def create_visualizations(data, output_prefix="analysis"):
    """
    Save the standard diagnostic plots for ``data`` as PNG files.

    Always writes ``<output_prefix>_missing_values.png``. When ``data`` has
    more than one numeric column it also writes ``<output_prefix>_pairplot.png``
    (rows containing nulls dropped) and ``<output_prefix>_correlation.png``.

    Each file name is appended to the module-level ``visualizations`` list,
    which is also the return value, so repeated calls accumulate entries.
    """

    def _save_and_record(path):
        # Write the current pyplot figure to `path`, close it, then log the path.
        plt.savefig(path)
        plt.close()
        visualizations.append(path)

    # Missing values heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(data.isnull(), cbar=False, cmap="viridis")
    plt.title("Missing Values Heatmap")
    _save_and_record(f"{output_prefix}_missing_values.png")

    # The remaining plots need at least two numeric columns.
    numeric_cols = data.select_dtypes(include=["number"]).columns
    if len(numeric_cols) <= 1:
        return visualizations

    # Pairplot for numeric data
    sns.pairplot(data[numeric_cols].dropna())
    _save_and_record(f"{output_prefix}_pairplot.png")

    # Correlation heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(data[numeric_cols].corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    _save_and_record(f"{output_prefix}_correlation.png")

    return visualizations


def main(data):
    """
    Step 3 driver: build the visualizations for ``data`` and list the saved files.

    Prints a notice and returns immediately when ``data`` is ``None``.
    """
    if data is None:
        print("No data provided for visualization. Please ensure Step 1 has been executed.")
        return

    print("\n--- Generating Visualizations ---")
    # "analysis" is the default prefix for the saved image files.
    saved_files = create_visualizations(data, output_prefix="analysis")

    for saved_path in saved_files:
        print(f"Saved visualization: {saved_path}")


# Continuation of previous steps
if __name__ == "__main__":
    # Check for `data` explicitly rather than wrapping the whole call in
    # `except NameError`: a NameError raised inside main() (for example from
    # the plotting code) would otherwise be reported as a missing dataset.
    if "data" in globals():
        main(data)  # Use the `data` variable from Step 1
    else:
        print("Error: No dataset loaded. Please ensure Step 1 has been executed, and the `data` variable exists.")





--- Generating Visualizations ---
Saved visualization: analysis_missing_values.png
Saved visualization: analysis_pairplot.png
Saved visualization: analysis_correlation.png


In [60]:
import os
import pandas as pd
import httpx
import time

# Constants for API
API_URL = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
# Read the token from the AIPROXY_TOKEN environment variable so a real
# credential never has to be saved in the notebook; the original placeholder
# string is kept as the fallback when the variable is not set.
AIPROXY_TOKEN = os.environ.get("AIPROXY_TOKEN", "API Token Enter")

def query_llm_with_httpx(prompt, model="gpt-4o-mini", max_retries=5, retry_delay=30):
    """
    Query the LLM using the AI Proxy with httpx, with retry logic for rate limiting.

    Args:
        prompt: Text sent as the user message.
        model: Model name placed in the request payload.
        max_retries: Maximum number of POST attempts.
        retry_delay: Seconds to wait between attempts after an HTTP 429.

    Returns:
        The content of the first choice's message, or ``None`` when the
        request fails, the response cannot be read, or every attempt was
        rate limited.
    """
    headers = {
        "Authorization": f"Bearer {AIPROXY_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful data analysis assistant."},
            {"role": "user", "content": prompt},
        ],
    }

    for attempt in range(max_retries):
        try:
            response = httpx.post(API_URL, headers=headers, json=payload, timeout=30.0)
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            # Only rate limiting (429) is retried; any other status is final.
            if e.response.status_code != 429:
                print(f"HTTP error occurred: {e}")
                break
            if attempt + 1 >= max_retries:
                # No attempt is left, so sleeping here would only delay the
                # failure by `retry_delay` seconds.
                print(f"Rate limit exceeded. Giving up after {max_retries} attempts.")
                break
            print(f"Rate limit exceeded. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(retry_delay)
        except httpx.RequestError as e:
            print(f"Request error occurred: {e}")
            break
        except Exception as e:
            # Best effort: also covers a response body without the expected
            # "choices" structure.
            print(f"An unexpected error occurred: {e}")
            break
    return None


def generate_markdown_story(analysis, visualizations):
    """
    Ask the LLM for a Markdown narrative built from the analysis summary.

    ``analysis`` must be truthy and ``visualizations`` a non-empty list;
    otherwise an error is printed and ``None`` is returned. The first three
    list entries are labelled as the missing-values heatmap, the pairplot and
    the correlation heatmap. Returns whatever ``query_llm_with_httpx`` returns
    for the assembled prompt.
    """
    has_plots = isinstance(visualizations, list) and len(visualizations) > 0
    if not (analysis and has_plots):
        print("Error: Invalid analysis or visualizations provided.")
        return None

    # Pull out the pieces of the summary first, then lay them out as text.
    shape = analysis['shape']
    dtypes = analysis['data_types']
    missing = analysis['missing_values']
    stats_table = pd.DataFrame(analysis['summary_statistics']).to_string()
    heatmap_ref = visualizations[0] if len(visualizations) > 0 else 'Not Available'

    summary = f"""
    Dataset Shape: {shape}
    Data Types: {dtypes}
    Missing Values: {missing}
    Summary Statistics:
    {stats_table}

    Visualizations:
    - Missing Values Heatmap: {heatmap_ref}
    """
    # Entries 1 and 2, when present, are listed after the heatmap.
    optional_labels = ("Pairplot", "Correlation Heatmap")
    for label, path in zip(optional_labels, visualizations[1:3]):
        summary += f"\n- {label}: {path}"

    prompt = f"""
    Analyze the following dataset summary and visualizations:
    {summary}

    Write a Markdown narrative that:
    1. Describes the dataset briefly.
    2. Explains the analyses performed.
    3. Highlights key insights.
    4. Suggests implications or next steps based on the findings.
    Include references to the visualizations in your narrative.
    """
    return query_llm_with_httpx(prompt)


def step_4(data, analysis, visualizations):
    """
    Step 4: Generate Markdown narrative using LLM and save it to README.md.

    Args:
        data: The loaded dataset; only checked for being ``None``.
        analysis: Analysis summary passed on to ``generate_markdown_story``.
        visualizations: Saved visualization file names, passed on likewise.

    Returns ``None`` in every case. Prints an error and stops early when an
    input is missing or no story was generated; a failure while writing the
    file is printed rather than raised.
    """
    if data is None:
        print("Error: No data provided for Markdown generation. Please ensure Step 1 has been executed.")
        return
    if not analysis or not visualizations:
        print("Error: No analysis or visualizations available. Please ensure Steps 2 and 3 have been executed.")
        return

    print("\n--- Generating Markdown Story ---")
    markdown_story = generate_markdown_story(analysis, visualizations)
    if not markdown_story:
        print("Error generating Markdown story using the LLM.")
        return

    output_markdown_file = "README.md"
    try:
        # Write as UTF-8 explicitly: the generated text may contain non-ASCII
        # characters, and the platform default encoding is not guaranteed to
        # be able to represent them.
        with open(output_markdown_file, "w", encoding="utf-8") as f:
            f.write(markdown_story)
        print(f"Markdown story saved to {output_markdown_file}.")
    except Exception as e:
        print(f"Error saving Markdown story to file: {e}")


# Example Execution Check
# Step 4 runs only when all three result variables exist in the global namespace.
if not (globals().keys() >= {"data", "analysis", "visualizations"}):
    print("Error: Required variables `data`, `analysis`, or `visualizations` are not defined in the global scope.")
else:
    step_4(data, analysis, visualizations)



--- Generating Markdown Story ---
Markdown story saved to README.md.
