In [None]:
!pip install -U langchain-gigachat langchain-core langchain-community


Collecting langchain-gigachat
  Downloading langchain_gigachat-0.3.12-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core
  Downloading langchain_core-1.0.4-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting gigachat<0.2.0,>=0.1.41.post1 (from langchain-gigachat)
  Downloading gigachat-0.1.43-py3-none-any.whl.metadata (15 kB)
Collecting types-requests<3.0,>=2.32 (from langchain-gigachat)
  Downloading types_requests-2.32.4.20250913-py3-none-any.whl.metadata (2.0 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading request

**!!Warning!!** When running the code in Google Colab, the session should be restarted after the installation to avoid library conflicts.

In [None]:
import os
from getpass import getpass
from typing import Optional, List, Literal, Dict, Any

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import Tool
from langchain_gigachat import GigaChat

## Titanic Dataset

Load the [Titanic](https://gist.github.com/michhar/2dfd2de0d4f8727f873422c5d959fff5) dataset.

    VARIABLE DESCRIPTIONS:
    survival        Survival
                (0 = No; 1 = Yes)
    pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
    name            Name
    sex             Sex
    age             Age
    sibsp           Number of Siblings/Spouses Aboard
    parch           Number of Parents/Children Aboard
    ticket          Ticket Number
    fare            Passenger Fare
    cabin           Cabin
    embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

    SPECIAL NOTES:
    Pclass is a proxy for socio-economic status (SES)
     1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

    Age is in Years; Fractional if Age less than One (1)
     If the Age is Estimated, it is in the form xx.5

In [20]:
# Columns removed before analysis.
cols_2_drop = ['PassengerId', 'Ticket', 'Cabin', 'Name']

# Load, drop the unused columns, replace missing ages with the numeric
# placeholder -999, then drop the rows that still contain a missing value.
# Because -999 stays numeric, it enters every numeric summary of Age.
df = (
    pd.read_csv('https://grantmlong.com/data/titanic.csv')
    .drop(columns=cols_2_drop)
    .assign(Age=lambda frame: frame['Age'].fillna(-999))
    .dropna()
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### EDA

In [21]:
# Summary statistics of the numeric columns. Age still contains the -999
# placeholder written for missing ages, so its mean/std/min are not real ages.
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,-175.160664,0.524184,0.382452,32.096681
std,0.48626,0.8347,411.196729,1.103705,0.806761,49.697504
min,0.0,1.0,-999.0,0.0,0.0,0.0
25%,0.0,2.0,6.0,0.0,0.0,7.8958
50%,0.0,3.0,24.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [22]:
# Column dtypes, non-null counts and memory usage of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


## Table tools

In [23]:
# --- 2)Tools
def tool_list_columns(_: str = "") -> str:
    """Describe every column of the global ``df``: name, dtype and null count.

    The single argument is ignored; it is accepted so the function can be
    called with one string, like the other tools.

    Returns:
        A "Columns:" header followed by one "- name (dtype, nulls=N)" line
        per column.
    """
    lines = []
    for name in df.columns:
        series = df[name]
        lines.append(f"- {name} ({series.dtype}, nulls={series.isna().sum()})")
    return "Columns:\n" + "\n".join(lines)

def tool_head(n_str: str = "5") -> str:
    """Return the first N rows of the global ``df`` as a markdown table.

    Args:
        n_str: Row count as text. Surrounding whitespace and quotes are
            ignored. Anything that is not a plain non-negative integer
            (including an empty string) falls back to 5.

    Returns:
        Markdown table of the first N rows, without the index column.
    """
    text = str(n_str).strip().strip("'\"").strip()
    try:
        # int() is the real validator: str.isdigit() also accepts characters
        # such as "²" that int() rejects with ValueError.
        n = int(text) if text.isdigit() else 5
    except ValueError:
        n = 5
    return df.head(n).to_markdown(index=False)

def tool_describe(_: str = "") -> str:
    """Return ``df.describe(include='all')`` transposed, as a markdown table.

    ``include='all'`` asks pandas to summarise every column, not only the
    numeric ones. The single argument is ignored.
    """
    summary = df.describe(include='all')
    per_column = summary.transpose()
    return per_column.to_markdown()

def tool_value_counts(column: str) -> str:
    """Return the value counts (missing values included) of one column of ``df``.

    Args:
        column: Column name. An exact match is tried first; otherwise
            surrounding whitespace and quote/backtick characters are removed,
            because tool input produced by a language model often carries them.

    Returns:
        Markdown table of the counts, or "Column '<name>' not found." when the
        column does not exist. Column names are matched case-sensitively.
    """
    name = column
    if name not in df.columns:
        name = str(column).strip().strip("'\"`").strip()
    if name not in df.columns:
        return f"Column '{name}' not found."
    return df[name].value_counts(dropna=False).to_markdown()

def tool_filter_query(expr: str) -> str:
    """Filter the global ``df`` with a ``pandas.DataFrame.query`` expression.

    Example expression (column names are case-sensitive):
    "Sex == 'female' and Pclass == 1"

    Returns:
        The number of matching rows followed by a markdown preview of the
        first 10 of them, or "Query error: <message>" if anything fails.
    """
    try:
        matched = df.query(expr)
        header = f"Rows: {len(matched)}\nPreview:\n"
        preview = matched.head(10).to_markdown(index=False)
        return header + preview
    except Exception as err:
        return f"Query error: {err}"

def tool_groupby_agg(args: str) -> str:
    """Group the global ``df`` and aggregate one column; return a markdown table.

    Args:
        args: ``key=value`` pairs separated by ``;``, for example
            "by=Sex,Pclass; metric=Survived; agg=mean".
            ``by`` is a comma-separated list of grouping columns, ``metric``
            the column to aggregate, ``agg`` the aggregation name (defaults to
            "mean"). Whitespace around keys and values is ignored and keys are
            case-insensitive; column names are used as given.

    Returns:
        Markdown table of the aggregated result, the usage hint when ``by`` or
        ``metric`` is missing, or "groupby error: <message>" on failure.
    """
    try:
        # Strip each key and split on the first "=" only. Without the strip,
        # "; metric=..." is stored under " metric" and is never found.
        parts = {}
        for item in args.split(";"):
            key, sep, value = item.partition("=")
            if sep:
                parts[key.strip().lower()] = value.strip()
        by_cols = [c.strip() for c in parts.get("by", "").split(",") if c.strip()]
        metric = parts.get("metric", "")
        agg = parts.get("agg", "") or "mean"
        if not by_cols or not metric:
            return "Use format: by=col1,col2; metric=Survived; agg=mean"
        sub = df.groupby(by_cols)[metric].agg(agg).reset_index()
        return sub.to_markdown(index=False)
    except Exception as e:
        return f"groupby error: {e}"

def tool_plot_bar(args: str) -> str:
    """Draw a bar chart of an aggregate of ``df`` and save it to an image file.

    Args:
        args: ``key=value`` pairs separated by ``;``, for example
            "by=Sex; metric=Survived; agg=mean; fname=bar.png".
            ``by`` is the grouping column, ``metric`` the aggregated column,
            ``agg`` the aggregation name (default "mean") and ``fname`` the
            output file (default "plot.png"). Whitespace around keys and
            values is ignored and keys are case-insensitive.

    Returns:
        "Saved figure to <fname>" on success, the usage hint when ``by`` or
        ``metric`` is missing, or "plot error: <message>" on failure.
    """
    try:
        # Strip each key and split on the first "=" only. Without the strip,
        # "; metric=..." is stored under " metric" and is never found.
        parts = {}
        for item in args.split(";"):
            key, sep, value = item.partition("=")
            if sep:
                parts[key.strip().lower()] = value.strip()
        by = parts.get("by", "")
        metric = parts.get("metric", "")
        agg = parts.get("agg", "") or "mean"
        # NOTE(review): fname is used exactly as supplied, so the caller
        # (here: the language model) chooses where the file is written.
        fname = parts.get("fname", "") or "plot.png"
        if not by or not metric:
            return "Use format: by=Sex; metric=Survived; agg=mean; fname=plot.png"

        grp = df.groupby(by)[metric].agg(agg).reset_index()
        fig, ax = plt.subplots()
        try:
            ax.bar(grp[by].astype(str), grp[metric])
            ax.tick_params(axis="x", labelrotation=30)
            ax.set_title(f"{agg}({metric}) by {by}")
            fig.tight_layout()
            fig.savefig(fname)
        finally:
            # Close the figure even when drawing or saving fails, so a failed
            # call does not leave it open.
            plt.close(fig)
        return f"Saved figure to {fname}"
    except Exception as e:
        return f"plot error: {e}"


In [24]:
# Tools offered to the agent. The descriptions are the only usage guide the
# model gets, so their examples use the dataframe's real, case-sensitive
# column names (Sex, Pclass, Survived, ...).
tools = [
    Tool(name="list_columns", func=tool_list_columns,
         description="List dataframe columns with dtypes and null counts."),
    Tool(name="head", func=tool_head,
         description="Show the first N rows. Input is integer N (as string)."),
    Tool(name="describe", func=tool_describe,
         description="Describe numeric and categorical columns."),
    Tool(name="value_counts", func=tool_value_counts,
         description="Value counts for a column. Input is the exact, case-sensitive column name, e.g. Sex."),
    Tool(name="filter_query", func=tool_filter_query,
         description="Filter rows using pandas query syntax with case-sensitive column names, e.g., \"Sex == 'female' and Pclass == 1\"."),
    Tool(name="groupby_agg", func=tool_groupby_agg,
         description="Groupby & aggregate. Column names are case-sensitive. Input format: 'by=Sex,Pclass;metric=Survived;agg=mean'"),
    Tool(name="plot_bar", func=tool_plot_bar,
         description="Bar plot of an aggregate, saved to a file. Column names are case-sensitive. Input format: 'by=Sex;metric=Survived;agg=mean;fname=plot.png'"),
]

## GigaChat agent

In [25]:
# Never store the GigaChat authorization key in the notebook. Read it from the
# GIGACHAT_CREDENTIALS environment variable, or prompt for it without echo.
# NOTE(security): the key that used to be hardcoded in this cell has been
# exposed to everyone with access to the notebook and should be revoked and
# reissued.
token = os.environ.get("GIGACHAT_CREDENTIALS") or getpass("GigaChat authorization key: ")

In [26]:
# GigaChat chat model that drives the agent. All settings, including the
# temperature, are passed to the constructor instead of being patched onto the
# object afterwards.
llm = GigaChat(
    credentials=token,
    scope="GIGACHAT_API_CORP",
    # NOTE(security): certificate verification is switched off, so the TLS
    # connection to the service is not authenticated. Prefer installing the
    # CA certificate the service requires and enabling verification; confirm
    # the available options in the gigachat documentation.
    verify_ssl_certs=False,
    model="GigaChat-2-Max",                 # GigaChat-2 / -2-Pro / -2-Max
    temperature=0.0,                        # keep sampling randomness to a minimum
)

In [27]:
from langchain.agents import create_react_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate

# --- ReAct prompt
# The template contains four placeholders: {tools}, {tool_names}, {input} and
# {agent_scratchpad}. The text itself is what the model sees, so any change to
# it can change the agent's behaviour.
react_template = """You are a helpful data analysis assistant.

You can use the following tools:
{tools}

Use this format exactly:
Question: {input}
Thought: what you think to do
Action: the tool name (one of [{tool_names}])
Action Input: the input for the action
Observation: the result of the action
... (repeat Thought/Action/Action Input/Observation as needed)
Thought: I now know the answer
Final Answer: a concise and factual answer.

{agent_scratchpad}
"""

# Two-message chat prompt: a short system instruction, then the ReAct template
# as the human message.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Follow the format carefully and avoid fabricating data."),
    ("human", react_template),
])

## Agent

In [28]:
# --- ReAct agent: built from the LLM, the tool list and the prompt.
agent = create_react_agent(llm=llm, tools=tools, prompt=prompt)

# --- Executor
# verbose=True prints the intermediate Thought/Action/Observation trace.
# handle_parsing_errors=True presumably lets the run continue when a model
# reply does not match the expected format; confirm in the LangChain docs.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True
)


# Smoke test: the question needs two tool calls (list_columns, then head).
agent_executor.invoke({"input": "What are the main columns and their types? Show first 5 rows."})




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: What are the main columns and their types? Show first 5 rows.
Thought: First, I'll list all the columns along with their data types and null counts.
Action: list_columns
Action Input:[0m[36;1m[1;3mColumns:
- Survived (int64, nulls=0)
- Pclass (int64, nulls=0)
- Sex (object, nulls=0)
- Age (float64, nulls=0)
- SibSp (int64, nulls=0)
- Parch (int64, nulls=0)
- Fare (float64, nulls=0)
- Embarked (object, nulls=0)[0m[32;1m[1;3mNext, let's show the first five rows of the dataset.
Action: head
Action Input:[0m[33;1m[1;3m|   Survived |   Pclass | Sex    |   Age |   SibSp |   Parch |    Fare | Embarked   |
|-----------:|---------:|:-------|------:|--------:|--------:|--------:|:-----------|
|          0 |        3 | male   |    22 |       1 |       0 |  7.25   | S          |
|          1 |        1 | female |    38 |       1 |       0 | 71.2833 | C          |
|          1 |        3 | female |    26 |       0 |    

{'input': 'What are the main columns and their types? Show first 5 rows.',
 'output': 'The main columns in the dataset include `Survived`, `Pclass`, `Sex`, `Age`, `SibSp`, `Parch`, `Fare`, and `Embarked`. Their respective data types are int64, int64, object, float64, int64, int64, float64, and object. There are no missing values across these columns. Here are the first five rows of the dataset:\n\n```\n|   Survived |   Pclass | Sex    |   Age |   SibSp |   Parch |    Fare | Embarked   |\n|-----------:|---------:|:-------|------:|--------:|--------:|--------:|:-----------|\n|          0 |        3 | male   |    22 |       1 |       0 |  7.25   | S          |\n|          1 |        1 | female |    38 |       1 |       0 | 71.2833 | C          |\n|          1 |        3 | female |    26 |       0 |       0 |  7.925  | S          |\n|          1 |        1 | female |    35 |       1 |       0 | 53.1    | S          |\n|          0 |        3 | male   |    35 |       0 |       0 |  8.05   |