In [1]:
import os
# NOTE(review): machine-specific absolute path. The relative paths used later in this
# notebook (".env", "../byob_test_datasets/...") resolve against this directory, so the
# notebook will not run unchanged on another machine.
os.chdir('C:\\Users\\HP\\OneDrive\\Desktop\\Akaike\\Basics')
# Echo the working directory as the cell output to confirm the change.
os.getcwd()

'C:\\Users\\HP\\OneDrive\\Desktop\\Akaike\\Basics'

In [None]:
import pandas as pd
import plotly.graph_objects as go
from dotenv import load_dotenv, find_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
import openai
import json
import os
from sqlalchemy import create_engine,MetaData,text
import plotly.graph_objects as go
# Load variables from the ".env" file (path is relative to the current working directory).
load_dotenv(".env")

In [None]:
from langchain.schema import HumanMessage, SystemMessage
from langchain_community.chat_models import  ChatOpenAI
from loguru import logger
import os


In [None]:
import pandas as pd
import re

def identify_columns(df):
    """Classify the columns of ``df`` as categorical, numerical or date columns.

    NOTE: ``df`` is modified in place. Object columns recognised as dates or
    currency amounts, and int64 columns recognised as UNIX timestamps, are
    replaced by their converted (datetime / numeric) values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect (and convert in place).

    Returns
    -------
    dict
        ``{"categorial_columns": {col: [unique values]},
           "numerical_columns": {col: {"range": (min, max)}},
           "date_columns": {col: {"range": (min, max)}}}``
        The ``"categorial_columns"`` spelling is kept as-is because callers
        read that exact key. A column can appear in both the date and the
        categorical mapping (object column that parses as dates and has fewer
        than 10 distinct values).
    """
    categorical_cols = {}
    date_cols = {}
    numerical_cols = {}

    # Define a regular expression for typical date patterns (e.g. "12-Jan-2020")
    date_pattern = re.compile(r'\b\d{1,2}[-/](Jan.*|Feb.*|Mar.*|Apr.*|May|Jun.*|Jul.*|Aug.*|Sep.*|Oct.*|Nov.*|Dec.*)[a-z]*[-/]\d{2,4}\b', re.IGNORECASE)
    currency_pattern = re.compile(r'^[$€£₹]\s?(?:\d{1,3}(?:[,]\d{3})*(?:[,]\d{3})*(?:[.]\d{2})?|\d+(?:[.]\d{2})?)$', re.IGNORECASE)

    for col in df.columns:
        # For object types, check if the column values match the date pattern or currency pattern
        if df[col].dtype == 'object':
            # If any value matches the date pattern, attempt conversion
            if any(date_pattern.match(str(x)) for x in df[col]):
                temp_col = pd.to_datetime(df[col], errors='coerce')
                if not temp_col.isnull().all():  # At least one value parsed
                    df[col] = temp_col
                    date_cols[col] = {"range": (df[col].min(), df[col].max())}
                    continue

            # If any value matches the currency pattern, consider it numerical
            elif any(currency_pattern.match(str(x)) for x in df[col]):
                try:
                    # regex=True is required: since pandas 2.0 the pattern is
                    # otherwise taken literally and no symbol is ever removed.
                    stripped = df[col].str.replace(r'[$€£₹,]', '', regex=True)
                    temp_col = pd.to_numeric(stripped, errors='coerce')
                    if not temp_col.isnull().all():
                        df[col] = temp_col
                        numerical_cols[col] = {"range": (df[col].min(), df[col].max())}
                        continue
                except Exception:
                    # Best effort: a column that cannot be cleaned falls through
                    # to the generic classification below.
                    pass

        # For int types, attempt conversion only if values are in a typical timestamp range
        elif df[col].dtype == 'int64':
            if df[col].between(1e9, 1e12).any():  # Rough range for UNIX timestamps
                temp_col = pd.to_datetime(df[col], unit='s', errors='coerce')
                if not temp_col.isnull().all():
                    df[col] = temp_col
                    date_cols[col] = {"range": (df[col].min(), df[col].max())}
                    continue

        # Identify categorical columns
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            try:
                # Object columns that still parse as dates are reported as date
                # columns too (the column itself is left unconverted here).
                temp_dates = pd.to_datetime(df[col], errors='coerce')
                valid_dates = temp_dates.dropna()
                if not valid_dates.empty:
                    # Same {"range": ...} shape as every other date entry.
                    date_cols[col] = {"range": (valid_dates.min(), valid_dates.max())}
            except Exception:
                # Best effort: values pandas cannot even attempt to parse.
                pass
            if df[col].nunique() < 10:
                categorical_cols[col] = df[col].unique().tolist()

        # Identify date columns (for columns already in datetime format)
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            date_cols[col] = {"range": (df[col].min(), df[col].max())}

        # Identify numerical columns
        elif df[col].dtype in ['int64', 'float64']:
            # Very low-cardinality numbers are treated as categories.
            if df[col].nunique() < 5:
                categorical_cols[col] = df[col].unique().tolist()
            else:
                numerical_cols[col] = {"range": (df[col].min(), df[col].max())}

    return {"categorial_columns": categorical_cols, "numerical_columns": numerical_cols, "date_columns": date_cols}


In [None]:
def generate_plotly_graph(data_dict, additional_params=None, group_by=None, barmode=None):
    """Build, show and serialise a Plotly figure described by ``data_dict``.

    Parameters
    ----------
    data_dict : dict
        Must contain ``'x_axis'``, ``'y_axis'``, ``'chart_type'`` and
        ``'resultant_df'``. ``'resultant_df'`` is either a DataFrame or CSV text.
    additional_params : dict, optional
        Extra keyword arguments forwarded to ``fig.update_layout``.
    group_by : str, optional
        Column used to split the data into one trace per group.
    barmode : str, optional
        Plotly bar mode; defaults to ``'stack'`` for ``'stacked_bar'`` charts.

    Returns
    -------
    For ``'bar'``, ``'grouped_bar'`` and ``'pie'`` the return value of the
    delegated helper (``bar`` / ``grouped_bar`` / ``pie_charts``, defined outside
    this function) is returned as-is. Otherwise the figure is displayed with
    ``fig.show()`` and its JSON representation is returned as a dict.
    """
    # Local import: StringIO is not imported anywhere in the notebook cells visible here.
    from io import StringIO

    # Extract information from the dictionary
    x_axis = data_dict['x_axis']
    y_axis = data_dict['y_axis']
    chart_type = data_dict['chart_type']
    df_str = data_dict['resultant_df']

    # Converting the 'resultant_df' string to a DataFrame
    if isinstance(df_str, str):
        df = pd.read_csv(StringIO(df_str))
    else:
        # Work on a copy so the Interval -> str conversion below does not
        # modify the caller's frame.
        df = df_str.copy()

    # Interval values are converted to their string form before plotting.
    for col in df.columns:
        if isinstance(df[col].dtype, pd.IntervalDtype):
            df[col] = df[col].apply(lambda x: str(x))

    # Initialize a figure object
    fig = go.Figure()

    # Conditional logic based on chart type
    if chart_type == 'bar':
        return bar(chart_type=chart_type, df=df, x_axis=x_axis, y_axis=y_axis)
    elif chart_type == 'grouped_bar':
        return grouped_bar(result_df=df, x_axis=x_axis, y_axis=y_axis, group_by=group_by)
    elif chart_type == 'line':
        if group_by:
            # Create a multiline chart
            for name, group in df.groupby(group_by):
                fig.add_trace(go.Scatter(x=group[x_axis], y=group[y_axis], mode='lines', name=name))
        else:
            # Create a standard line chart
            fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], mode='lines'))
            fig.update_layout(margin={'pad': 20}, xaxis_title=x_axis, yaxis_title=y_axis)

    elif chart_type == 'scatter':
        # Scatter plot
        if group_by:
            # Create a grouped scatter plot
            for name, group in df.groupby(group_by):
                fig.add_trace(go.Scatter(x=group[x_axis], y=group[y_axis], mode='markers', name=name))
        else:
            # Create a standard scatter plot
            fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], mode='markers'))

    elif chart_type == 'heatmap':
        # Heatmap
        # NOTE(review): z is a single column here; confirm this renders as intended.
        fig.add_trace(go.Heatmap(z=df[y_axis], x=df[x_axis], y=df.index))

    elif chart_type in ['area line', 'surface_area']:
        # Area line chart logic
        if group_by:
            for name, group in df.groupby(group_by):
                fig.add_trace(go.Scatter(x=group[x_axis], y=group[y_axis], fill='tozeroy', name=name))
        else:
            fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], fill='tozeroy'))

    elif chart_type in ['multiline', 'multi_line']:
        # Multiline chart logic
        if group_by:
            for name, group in df.groupby(group_by):
                fig.add_trace(go.Scatter(x=group[x_axis], y=group[y_axis], mode='lines', name=name))
        else:
            fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], mode='lines'))
        fig.update_layout(margin={'pad': 20}, xaxis_title=x_axis, yaxis_title=y_axis)

    elif chart_type == 'scattercarpet':
        # Scattercarpet plot logic
        # NOTE(review): verify that go.Scattercarpet accepts x/y; its documented
        # coordinates appear to be a/b on a carpet axis — TODO confirm.
        if group_by:
            for name, group in df.groupby(group_by):
                fig.add_trace(go.Scattercarpet(x=group[x_axis], y=group[y_axis], name=name))
        else:
            fig.add_trace(go.Scattercarpet(x=df[x_axis], y=df[y_axis]))

    elif chart_type == 'stacked_bar':
        # One bar trace per non-x column, stacked unless the caller chose a barmode.
        for col in df.columns:
            if col != x_axis:
                fig.add_trace(go.Bar(x=df[x_axis], y=df[col], name=col, text=df[col], textposition='auto'))
        barmode = barmode or 'stack'

    else:
        # Other chart types like pie, area, etc.
        # Any chart_type not handled here falls through with an empty figure.
        if chart_type == 'pie':
            df = df.sort_values(by=y_axis, ascending=False)
            labels, values = list(df[x_axis]), list(df[y_axis])
            return pie_charts(labels=labels, values=values)
        elif chart_type == 'area':
            # Area chart logic
            if group_by:
                for name, group in df.groupby(group_by):
                    fig.add_trace(go.Scatter(x=group[x_axis], y=group[y_axis], fill='tonexty', name=name))
            else:
                fig.add_trace(go.Scatter(x=df[x_axis], y=df[y_axis], fill='tonexty'))

    fig.update_xaxes(title_text=data_dict['x_axis'])
    fig.update_yaxes(title_text=data_dict['y_axis'])

    # Update layout according to barmode
    if barmode:
        fig.update_layout(barmode=barmode)

    # If there are additional parameters, update the layout with them
    if additional_params:
        fig.update_layout(**additional_params)

    fig.show()
    json_config = json.loads(fig.to_json())

    return json_config


In [None]:
import pandas as pd

class PandasResultantDataframe:
    '''
    Build the aggregated ("resultant") DataFrame that a chart is drawn from.

    Parameters
    ----------
    chart_config : dict
        Chart description. Keys read by the methods: 'x_axis', 'y_axis',
        'group_by', 'operation' (default "mean"), 'binning', 'start_date',
        'end_date' and, optionally, 'date_column'. Some methods write back to
        it (see ``group_aggregates_resultant_df``).
    df : pandas.DataFrame
        Source data. It is copied before every computation and never modified.
    filter_columns : dict
        Optional filters, keyed by 'numerical_columns', 'date_columns' and
        'categorical_columns' (see ``_apply_filters``).
    '''

    # Name of the temporary grouping key that holds the time bucket.
    _BUCKET_KEY = '_bucket'

    def __init__(self, chart_config, df, filter_columns):
        self.chart_config = chart_config
        self.df2 = df
        # Binning name -> pandas period alias.
        # 'daily' is the name produced by the automatic binning choice, so it
        # needs an entry of its own; 'Y' replaces the deprecated 'A' alias.
        # NOTE(review): the sub-daily aliases ('H', 'T', 'S', 'L', 'U', 'N') are
        # also deprecated in recent pandas — confirm against the pinned version.
        self.freq_dict = {
            'business_day': 'B', 'calendar_day': 'D', 'daily': 'D', 'weekly': 'W', 'monthly': 'M',
            'quarterly': 'Q', 'yearly': 'Y', 'hourly': 'H', 'minutely': 'T', 'secondly': 'S',
            'milliseconds': 'L', 'microseconds': 'U', 'nanoseconds': 'N'
        }
        self.filter_columns = filter_columns

    def _apply_filters(self, df):
        """Return ``df`` restricted by ``self.filter_columns``.

        * 'numerical_columns': ``{col: {'start_value': .., 'end_value': ..}}`` (inclusive)
        * 'date_columns': ``{col: {'start_date': .., 'end_date': ..}}`` (inclusive);
          a filtered date column is converted to datetime in the returned frame
        * 'categorical_columns': ``{col: [allowed values]}``
        """
        filters = self.filter_columns or {}

        # Apply filters for numerical columns
        for col, conditions in filters.get('numerical_columns', {}).items():
            start_value = conditions.get('start_value')
            end_value = conditions.get('end_value')
            if start_value is not None:
                df = df[df[col] >= start_value]
            if end_value is not None:
                df = df[df[col] <= end_value]

        # Apply filters for date columns
        for col, conditions in filters.get('date_columns', {}).items():
            start_date = conditions.get('start_date')
            end_date = conditions.get('end_date')
            if not (start_date or end_date):
                continue
            # Convert for one-sided filters as well; comparing raw strings with
            # a Timestamp raises TypeError. Copy first so a previously filtered
            # slice is not written to.
            df = df.copy()
            df[col] = pd.to_datetime(df[col])
            if start_date:
                df = df[df[col] >= pd.to_datetime(start_date)]
            if end_date:
                df = df[df[col] <= pd.to_datetime(end_date)]

        # Apply filters for categorical columns
        for col, values in filters.get('categorical_columns', {}).items():
            if values:
                df = df[df[col].isin(values)]

        return df

    def _resolve_binning(self, df):
        """Return the binning name: the configured one, or one chosen from the row count."""
        if "binning" in self.chart_config.keys():
            # An empty (or missing) value means "monthly".
            return self.chart_config['binning'] or 'monthly'
        if df.shape[0] > 720:
            return 'yearly'
        if df.shape[0] > 60:
            return 'monthly'
        return 'daily'

    @staticmethod
    def _quarter_labels(dates):
        """Label each date with its quarter: '2018Q1' style across years, 'Q1' style within one year."""
        quarters = dates.dt.to_period('Q')
        if dates.dt.year.nunique() > 1:
            return quarters.astype(str)
        return quarters.dt.strftime('Q%q')

    def time_series_resultant_df(self):
        """Aggregate ``y_axis`` over time buckets of ``x_axis``.

        A non-int64 ``x_axis`` is parsed as datetime and bucketed by the
        resolved binning (quarter labels for 'quarterly', bucket start
        timestamps otherwise). An int64 ``x_axis`` is grouped by value.
        Returns a frame with the two columns ``[x_axis, y_axis]``.
        Raises ``KeyError`` for a binning name missing from ``freq_dict``.
        """
        x_axis = self.chart_config['x_axis']
        y_axis = self.chart_config['y_axis']
        operation = self.chart_config.get('operation', "mean") or "mean"
        df1 = self._apply_filters(self.df2.copy())
        binning = self._resolve_binning(df1)

        # Ensure x_axis is datetime if it's not an integer
        if df1[x_axis].dtype != "int64":
            df1[x_axis] = pd.to_datetime(df1[x_axis])
            if binning == 'quarterly':
                buckets = self._quarter_labels(df1[x_axis])
            else:
                buckets = df1[x_axis].dt.to_period(self.freq_dict[binning]).dt.to_timestamp()
            # Group by the bucket Series directly so no helper column can clash
            # with an existing column of the data.
            resultant_df = df1.groupby(buckets.rename(self._BUCKET_KEY))[y_axis].agg(operation)
        else:
            resultant_df = df1.groupby(x_axis)[y_axis].agg(operation)

        resultant_df = resultant_df.reset_index()
        resultant_df.columns = [x_axis, y_axis]

        return resultant_df

    def group_aggregates_resultant_df(self):
        """Aggregate ``y_axis`` per distinct ``x_axis`` value.

        When ``x_axis == y_axis`` the rows per value are counted instead, the
        result column is named 'count', and ``chart_config['y_axis']`` is
        rewritten to 'count' (side effect on the shared config dict).
        """
        x_axis = self.chart_config['x_axis']
        y_axis = self.chart_config['y_axis']
        operation = self.chart_config.get('operation', "mean") or "mean"
        df1 = self._apply_filters(self.df2.copy())

        if x_axis == y_axis:
            resultant_df = df1.groupby(x_axis).size().reset_index(name='count')
            self.chart_config['y_axis'] = 'count'
        else:
            resultant_df = df1.groupby(x_axis)[y_axis].agg(operation).reset_index()

        return resultant_df

    def combination_resultant_df(self):
        """Aggregate ``y_axis`` per (time bucket of ``x_axis``, ``group_by``) pair.

        Buckets are strings: quarter labels for 'quarterly', otherwise the
        period text for the resolved binning ('2018' yearly, '2018-01' monthly,
        '2018-01-01' daily, ...). Unknown binning names fall back to monthly.
        Returns a long frame with columns ``[x_axis, group_by, y_axis]`` holding
        every bucket/group combination (missing combinations are NaN).
        """
        chart_config = self.chart_config
        df1 = self._apply_filters(self.df2.copy())
        x_axis = chart_config['x_axis']
        y_axis = chart_config['y_axis']
        group_by = chart_config['group_by']
        binning = self._resolve_binning(df1)
        operation = self.chart_config.get('operation', "mean") or "mean"

        df1[x_axis] = pd.to_datetime(df1[x_axis])

        if binning == 'quarterly':
            buckets = self._quarter_labels(df1[x_axis])
        else:
            # Honour the requested binning instead of always using '%Y-%m';
            # monthly periods still render as 'YYYY-MM'.
            freq = self.freq_dict.get(binning, 'M')
            buckets = df1[x_axis].dt.to_period(freq).astype(str)
        buckets = buckets.rename(self._BUCKET_KEY)

        # unstack + melt yields one row per bucket/group combination.
        wide = df1.groupby([buckets, group_by])[y_axis].agg(operation).unstack().reset_index()
        resultant_df = wide.melt(id_vars=[self._BUCKET_KEY], var_name=group_by, value_name=y_axis)
        resultant_df.columns = [x_axis, group_by, y_axis]

        return resultant_df

    def multilevel_categorical(self):
        """Aggregate ``y_axis`` per (``x_axis``, ``group_by``) pair.

        If both 'start_date' and 'end_date' are set in the chart config, rows
        are first restricted to that inclusive range on the date column named by
        the optional 'date_column' key (default 'Order_Date'). Any aggregation
        name accepted by pandas ``agg`` is supported; a missing or empty
        operation means "mean".
        """
        df = self._apply_filters(self.df2.copy())
        chart_config = self.chart_config
        x_axis = chart_config.get("x_axis")
        y_axis = chart_config.get("y_axis")
        group_by = chart_config.get("group_by")
        operation = chart_config.get("operation", "mean") or "mean"
        start_date = chart_config.get("start_date", None)
        end_date = chart_config.get("end_date", None)
        date_column = chart_config.get("date_column", "Order_Date")

        # Filter DataFrame based on start_date and end_date if provided
        if start_date and end_date:
            df = df[(df[date_column] >= start_date) & (df[date_column] <= end_date)]

        # Every operation is grouped on both levels, so the result always has
        # the group_by column and the y_axis column the chart config refers to.
        result_df = df.groupby([x_axis, group_by])[y_axis].agg(operation).reset_index()

        return result_df


In [None]:
# Load the dataset; the path is relative to the current working directory.
global_stock = pd.read_csv("../byob_test_datasets/E-commerce Dataset.csv")
# CSV text of the first five rows (index omitted).
# NOTE(review): not referenced by any later cell visible here.
global_stock_head = global_stock.head().to_csv(index = False)

In [None]:
# Preview the first rows of the loaded dataset.
global_stock.head()

In [None]:
# Hand-written sample: one 'Profit' value per month from 2018-01 to 2020-12 (36 rows).
# NOTE(review): the `df` built here is not the frame passed to PandasResultantDataframe
# in the later cell (that cell uses `global_stock`) — confirm which one is intended.
data = {
    'Order_Date': ['2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01', 
                   '2018-07-01', '2018-08-01', '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
                   '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01', 
                   '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
                   '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01', 
                   '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01'],
    'Profit': [200000, 300000, 250000, 300000, 400000, 350000, 350000, 300000, 250000, 375000, 400000, 250000,
               220000, 320000, 270000, 310000, 420000, 370000, 360000, 310000, 260000, 385000, 410000, 260000,
               230000, 330000, 280000, 320000, 430000, 380000, 370000, 320000, 270000, 395000, 420000, 270000],
}

df=pd.DataFrame(data)

In [None]:
# NOTE(review): `query` is not referenced by any later cell visible here, and its text
# (loan approvals by state) does not match the profit chart configured in the next cell.
query = "Yearly Loan Approval Trend by State"

In [None]:
# Chart description consumed by PandasResultantDataframe and generate_plotly_graph:
# sum of 'Profit' per quarter of 'Order_Date', drawn as a line chart.
# 'group_by', 'start_date' and 'end_date' are left empty for this chart.
chart_config ={'x_axis': 'Order_Date',
 'y_axis': 'Profit',
 'binning': 'quarterly',
 'heading': 'Quarterly Profit Trend by Order_Date',
 'group_by': '',
 'operation': 'sum',
 'chart_type': 'line',
 'start_date': None,
 'end_date': None}

In [None]:
# Aggregate the loaded dataset according to chart_config (no filters applied).
pandas_resultant_df = PandasResultantDataframe(chart_config,global_stock,filter_columns={})
resultant_df = pandas_resultant_df.time_series_resultant_df()
# generate_plotly_graph reads the aggregated frame from the 'resultant_df' key.
chart_config['resultant_df'] = resultant_df


In [None]:
# Display the aggregated frame.
resultant_df

In [None]:
# Render the chart; the returned value (the figure as a JSON-style dict for a line
# chart) becomes the cell output.
generate_plotly_graph(chart_config)