In [72]:
!pip3 install plotly matplotlib -U google-generativeai pandas ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd
import subprocess
import importlib
import traceback
import sys
import re
import os

import google.generativeai as genai
api_key = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

In [2]:
def extract_code(generated_code):
    try:
        code_match = re.search(r"```(?:python)?\n(.*?)```", generated_code, re.DOTALL)
        if code_match:
            return code_match.group(1).strip()
        return generated_code.strip()
    except Exception as e:
        print(f"Error extracting code: {e}")
        return ""

In [3]:
def install_required_packages(generated_code):
    import_statements = re.findall(r'^\s*(?:import|from)\s+(\w+)', generated_code, re.MULTILINE)
    unique_libraries = set(import_statements)

    for library in unique_libraries:

        try:
            # Attempt to import the library
            importlib.import_module(library)
        except ImportError as import_err:
            # Handle module not found by extracting the base name
            base_library = library.split('.')[0]
            print(f"Attempting to install '{base_library}' as '{library}' was not found...")

            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", base_library])
                print(f"'{base_library}' successfully installed.")
            except subprocess.CalledProcessError as e:
                print(f"Failed to install '{base_library}': {e}")
            except Exception as e:
                print(f"Unexpected error while installing '{base_library}': {traceback.format_exc()}")

        except ModuleNotFoundError as e:
            print(f"Module '{library}' not found: {e}")
            base_library = library.split('.')[0]
            print(f"Retrying to install '{base_library}' instead...")

            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", base_library])
                print(f"'{base_library}' successfully installed.")
            except Exception as e:
                print(f"Error installing '{base_library}': {traceback.format_exc()}")

In [4]:
def execute_code_from_response(generated_code):
    try:
        install_required_packages(generated_code)
    except Exception as e:
        print(f"Error installing required packages: {e}")
        return
    
    code = extract_code(generated_code)
    
    # Preparing environment variables
    globals_dict = {"pd": pd}

    try:
        exec(code, globals_dict)
    except SyntaxError as e:
        print(f"Syntax error in the generated code:\n{e}")
        print("Code execution aborted.")
    except Exception as e:
        print(f"Runtime error during code execution:\n{traceback.format_exc()}")
        print("Code execution aborted.")

In [None]:
def generate(csv_data):
    import pandas as pd

    try:
        # Load the CSV data
        try:
            df = pd.read_csv(csv_data)
        except FileNotFoundError:
            print("Error: The file was not found.")
            return
        except pd.errors.EmptyDataError:
            print("Error: The file is empty.")
            return
        except pd.errors.ParserError:
            print("Error: The file could not be parsed. Please check if it's a valid CSV file.")
            return

        # Ensure the DataFrame is not empty
        if df.empty:
            print("Error: The CSV file contains no data.")
            return

        # Get dataset info for better data overview
        data_info = df.info()
        data_desc = df.describe(include='all')
        top_rows = df.head(3).to_dict(orient='records')

        # Extract relevant information for columns
        column_info = []
        for col in df.columns:
            try:
                column_type = str(df[col].dtype)
                unique_count = df[col].nunique()
                column_info.append({
                    "name": col,
                    "type": column_type,
                    "count": unique_count
                })
            except Exception as e:
                print(f"Error processing column '{col}': {e}")
                continue  # Skip any problematic columns

        total_records = len(df)

        # Check if any columns were successfully processed
        if not column_info:
            print("Error: No valid columns found in the dataset.")
            return

        prompt = f"""
            You are provided with a dataset summary containing column names, data types, unique counts, and statistical description:
            Columns: {column_info}
            Total Records: {total_records}
            Data Summary: {data_info}
            Data Statistics: {data_desc}
            Sample Data (Top 3 Rows): {top_rows}

            Write accurate, syntactically correct Python code using Pandas and Plotly that performs the following steps. 
            **Do not hallucinate or add any functionality not explicitly requested**:

            1. **Load and Prepare the Data**:
               - Parse columns based on their types: Convert date columns with `pd.to_datetime`, and numeric columns with `pd.to_numeric`. 
               - Use `df.dtypes` to verify data types.

            2. **Generate Visualizations**:
               - For each data type (categorical, numeric, datetime), create specific Plotly visualizations:
                 - **Categorical Data**: Create a bar chart for top categories with gradient colors and hover tooltips.
                 - **Time-Series Data**: Create an interactive line chart for datetime columns, with point markers and hover details.
                 - **Numeric Correlations**: Generate a scatter plot if two or more numeric columns exist, with gradient colors and hover tooltips.
                 - **Pie Chart**: For categorical columns with fewer unique values, generate a pie chart with distinct slice effects.
               - Surround each visualization section with a `try-except` block to log errors, ensuring program continuation even if one visualization fails.

            3. **Apply Styling for Aesthetics and Usability**:
               - Use a cohesive color palette, gradient fills, and animations for user engagement.
               - Implement responsive axes and optional log scaling for outliers.
               - Add smooth transitions and hover effects for an interactive experience.

            4. **Error Handling**:
               - Wrap each visualization in a try-except block and log errors for debugging.
               - Ensure code adaptability for missing data and varying dataset structures.

            **Provide clear, minimal comments** in the generated code to explain key sections but avoid verbose explanations.
            """

        try:
            response = model.generate_content(prompt)
            generated_code = response.text

            if generated_code:
                print("Generated code received, executing...")
                print(generated_code)
                execute_code_from_response(generated_code)  # execute the generated code from LLMs response
            else:
                print("No code generated by the model. Check the input or prompt.")
        except Exception as e:
            print(f"Error generating code: {e}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")


import re
import subprocess
import sys
import importlib
import traceback
import ast

def install_required_packages(generated_code):
    """
    Parses the generated code to find all import statements, installs any missing packages,
    and returns a dictionary mapping aliases to the imported modules.
    """
    import_alias_map = {}
    unique_libraries = set()

    try:
        # Parse the generated code into an AST
        tree = ast.parse(generated_code)
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    module_name = alias.name
                    asname = alias.asname if alias.asname else module_name.split('.')[0]
                    unique_libraries.add(module_name.split('.')[0])  # Base library
                    import_alias_map[asname] = module_name
            elif isinstance(node, ast.ImportFrom):
                module = node.module
                if module is None:
                    continue  # Handle cases like 'from . import something'
                for alias in node.names:
                    full_module = f"{module}.{alias.name}"
                    asname = alias.asname if alias.asname else alias.name
                    unique_libraries.add(module.split('.')[0])  # Base library
                    import_alias_map[asname] = full_module
    except Exception as e:
        print(f"Error parsing import statements: {e}")
        return {}

    # Install missing libraries
    for library in unique_libraries:
        try:
            importlib.import_module(library)
        except ImportError:
            try:
                print(f"Installing '{library}'...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", library])
                print(f"Successfully installed '{library}'.")
            except subprocess.CalledProcessError as e:
                print(f"Failed to install '{library}': {e}")
            except Exception as e:
                print(f"Unexpected error while installing '{library}': {traceback.format_exc()}")

    # Import modules and map aliases
    imported_modules = {}
    for alias, module_path in import_alias_map.items():
        try:
            if '.' in module_path:
                # For 'from module import name as alias'
                parts = module_path.split('.')
                base_module = '.'.join(parts[:-1])
                attr = parts[-1]
                module = importlib.import_module(base_module)
                imported_module = getattr(module, attr)
                imported_modules[alias] = imported_module
            else:
                # For 'import module as alias'
                module = importlib.import_module(module_path)
                imported_modules[alias] = module
        except AttributeError:
            print(f"Attribute '{parts[-1]}' not found in module '{base_module}'.")
        except ImportError as e:
            print(f"Failed to import '{module_path}': {e}")
        except Exception as e:
            print(f"Error importing '{module_path}': {traceback.format_exc()}")

    return imported_modules


In [28]:
generate("data.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Month     12 non-null     object
 1   Revenue   12 non-null     int64 
 2   Expenses  12 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 420.0+ bytes
Generated code received, executing...
```python
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Define function for formatting axis labels
def format_axis_label(value, unit):
    if abs(value) >= 1000000:
        return f"{value/1000000:.1f}M"
    elif abs(value) >= 1000:
        return f"{value/1000:.1f}K"
    else:
        return f"{value:.0f}"

# Load and prepare the data
try:
    data = [
        {'Month': 'January', 'Revenue': 2000, 'Expenses': 1500},
        {'Month': 'February', 'Revenue': 2500, 'Expenses': 1800},
        {'Month': 'March', 'Revenue': 3000, 'Ex

In [29]:
generate("data2.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Month     12 non-null     object
 1   Revenue   12 non-null     int64 
 2   Expenses  12 non-null     int64 
 3   Profit    12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 516.0+ bytes
Generated code received, executing...
```python
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Define data types for parsing
data_types = {
    'Month': 'object',
    'Revenue': 'int64',
    'Expenses': 'int64',
    'Profit': 'int64'
}

# Load and prepare data
try:
    df = pd.DataFrame(
        [{'Month': 'January', 'Revenue': 10000, 'Expenses': 7000, 'Profit': 3000},
         {'Month': 'February', 'Revenue': 12000, 'Expenses': 8000, 'Profit': 4000},
         {'Month': 'March', 'Revenue': 11000, 'Expenses': 7500, 'Profit': 3500}

In [44]:
generate("data3.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Product       12 non-null     object
 1   Region        12 non-null     object
 2   Sales_Units   12 non-null     int64 
 3   Sales_Amount  12 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 516.0+ bytes
Generated code received, executing...
```python
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# 1. Load and Prepare the Data
data = [{'Product': 'Product_A', 'Region': 'North', 'Sales_Units': 500, 'Sales_Amount': 10000},
        {'Product': 'Product_B', 'Region': 'South', 'Sales_Units': 450, 'Sales_Amount': 9000},
        {'Product': 'Product_C', 'Region': 'East', 'Sales_Units': 600, 'Sales_Amount': 12000},
        {'Product': 'Product_A', 'Region': 'North', 'Sales_Units': 550, 'Sales_Amount': 11000},
        {'Product': 'Product_B', 'Region': 'Wes

In [40]:
generate("data4.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Employee_ID         10 non-null     object 
 1   Name                10 non-null     object 
 2   Department          10 non-null     object 
 3   Days_Present        10 non-null     int64  
 4   Performance_Rating  10 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 532.0+ bytes


2024-11-02 17:06:40,522 - ERROR - Error creating categorical chart for Employee_ID: histogram() got an unexpected keyword argument 'color_continuous_scale'


Generated code received, executing...
```python
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging

# Configure logging
logging.basicConfig(filename='visualization_errors.log', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Dataset summary provided
columns = [{'name': 'Employee_ID', 'type': 'object', 'count': 10},
           {'name': 'Name', 'type': 'object', 'count': 10},
           {'name': 'Department', 'type': 'object', 'count': 4},
           {'name': 'Days_Present', 'type': 'int64', 'count': 6},
           {'name': 'Performance_Rating', 'type': 'float64', 'count': 6}]

# Load and prepare the data
data = []
for col in columns:
    if col['type'] == 'object':
        data.append(pd.Series([str(x) for x in range(col['count'])]))
    elif col['type'] == 'int64':
        data.append(pd.Series([x for x in range(col['count'])]))
    elif col['type'] ==

2024-11-02 17:06:40,540 - ERROR - Error creating categorical chart for Name: histogram() got an unexpected keyword argument 'color_continuous_scale'


2024-11-02 17:06:40,553 - ERROR - Error creating categorical chart for Department: histogram() got an unexpected keyword argument 'color_continuous_scale'


In [110]:
generate("data5.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      10 non-null     object 
 1   Page_Views                10 non-null     int64  
 2   Unique_Visitors           10 non-null     int64  
 3   Bounce_Rate               10 non-null     float64
 4   Average_Session_Duration  10 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 532.0+ bytes
Generated code received, executing...
```python
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Sample data from the provided summary (replace with your actual data)
data = {
    'Date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09', '2024-01-10'],
    'Page_Views': [1800, 1900, 2000, 2100, 1600, 1700, 1850, 2050, 1950, 2150],
 

In [42]:
generate("data6.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Customer_ID      10 non-null     object 
 1   Product          10 non-null     object 
 2   Feedback_Rating  10 non-null     float64
 3   Review_Length    10 non-null     int64  
 4   Purchase_Count   10 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 532.0+ bytes
Generated code received, executing...
```python
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Define a custom color palette for a cohesive and modern look
color_palette = px.colors.qualitative.Plotly  # Or choose another palette from Plotly

# Function to format axis labels and values dynamically
def format_axis_values(value):
    if abs(value) >= 1e6:
        return f"{value/1e6:.1f}M"
    elif abs(value) >= 1e3:
        ret

In [None]:
generate("data7.csv")

Generated code received, executing...
       Temperature(°C)  Humidity(%)  Rainfall(mm)
count          10.0000    10.000000     10.000000
mean            9.0000    51.200000     33.700000
std            10.1653    15.690053     37.160463
min            -5.0000    28.000000      1.000000
25%            -0.7500    42.750000      4.250000
50%            13.0000    49.000000      9.500000
75%            15.7500    64.250000     73.250000
max            22.0000    72.000000     85.000000


In [125]:
generate("large_dataset_500.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      500 non-null    object 
 1   Category  500 non-null    object 
 2   Value1    500 non-null    int64  
 3   Value2    500 non-null    float64
 4   Value3    500 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 19.7+ KB
Generated code received, executing...
```python
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import logging

# Set up logging
logging.basicConfig(filename='visualization_errors.log', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Load and prepare data
try:
    # Example dataset (replace with your actual data loading)
    data = {
        'Date': ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05'],
        'Category': ['A', 'B', 'A', 'C', 'A'],
        'Value1': [10, 20, 

In [56]:
generate("large_dataset_1000.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Transaction_Date    1000 non-null   object 
 1   Category_Type       1000 non-null   object 
 2   Seasonal_Value      1000 non-null   float64
 3   Random_Fluctuation  1000 non-null   float64
 4   Linear_Trend_Value  1000 non-null   float64
 5   Status              1000 non-null   object 
dtypes: float64(3), object(3)
memory usage: 47.0+ KB
Generated code received, executing...
```python
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Sample data
data = [{'Transaction_Date': '2022-01-01', 'Category_Type': 'Category C', 'Seasonal_Value': 1002.43, 'Random_Fluctuation': 5132.61, 'Linear_Trend_Value': -35.11, 'Status': 'Pending'},
        {'Transaction_Date': '2022-01-02', 'Category_Type': 'Category A', 'Seasonal_Value': 963.46, 'Random_Fluctuation':

In [26]:
generate("large_dataset_10000.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Product_ID      10000 non-null  object 
 1   Product_Name    10000 non-null  object 
 2   Added_Date      10000 non-null  object 
 3   Category        10000 non-null  object 
 4   Price           10000 non-null  float64
 5   Stock_Quantity  10000 non-null  int64  
 6   Rating          10000 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 547.0+ KB
Generated code received, executing...
```python
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Define a function to handle error logging
def log_error(error_message):
    """Logs the error message to a file or console for debugging."""
    with open("error_log.txt", "a") as f:
        f.write(f"{error_message}\n")
    print(f"Error: {error_me

Error: Error creating time-series line chart: 
    Invalid value of type 'builtins.str' received for the 'shape' property of scattergl.line
        Received value: 'spline'

    The 'shape' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['linear', 'hv', 'vh', 'hvh', 'vhv']


In [None]:
generate("diverse_dataset.csv")

Generated code received, executing...
```python
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Dataset Summary (provided)
dataset_summary = [
    {'name': 'Date', 'type': 'object', 'count': 1000},
    {'name': 'Category', 'type': 'object', 'count': 3},
    {'name': 'Value1', 'type': 'float64', 'count': 1000},
    {'name': 'Value2', 'type': 'float64', 'count': 1000},
    {'name': 'Value3', 'type': 'float64', 'count': 1000}
]
total_records = 1000

# Extract column information
column_names = [item['name'] for item in dataset_summary]
column_types = [item['type'] for item in dataset_summary]

# Load data (replace with actual data loading)
df = pd.DataFrame({
    'Date': pd.to_datetime(['2023-01-01'] * 1000),  # Placeholder, replace with real dates
    'Category': ['A'] * 333 + ['B'] * 333 + ['C'] * 334,
    'Value1': [10.0] * 1000,
    'Value2': [20.0] * 1000,
    'Value3': [30.0] * 1000
})

# Data Analysis
for i, column in enumerate(column_names):
   