### [LangGraph를 활용한 데이터 분석] 

vs community 2022 버전 확인

| STEP                  | Action                                                       | Next Step                                                                 |
|-----------------------|--------------------------------------------------------------|---------------------------------------------------------------------------|
| Start                 | Initialize the workflow                                      | retrieve_data                                                             |
| retrieve_data         | 데이터를 불러오고 전처리                                        | train_model                                                               |
| train_model           | Train the Decision Tree model (파라미터가 변경된 경우 변경된 값으로 재학습) | evaluate_model                                                            |
| evaluate_model        | Evaluate the model's F1 score (평가 score에 따라서 분기가 3개로 나감) | visualize_performance (if F1 score > goal_threshold)                      |
|                       |                                                              | adjust_parameters (if F1 score ≤ goal_threshold and iterations < 20)      |
|                       |                                                              | handle_stop_condition (if F1 score ≤ goal_threshold and iterations ≥ 20)  |
| adjust_parameters     | max_depth를 하나씩 증가시키기                                   | train_model                                                               |
| handle_stop_condition | Stop the workflow due to reaching the maximum iterations (정해놓은 반복 횟수 내에서 목표 결과를 찾지 못하면 종료) | END                                                                       |
| visualize_performance | 모델 결과가 나오면 시각화 및 저장                                   | END                                                                       |
| END                   | End of the workflow                                          | -                                                                         |


In [None]:
# API key setup: load_dotenv() (python-dotenv) reads variables from a .env file
# into the process environment. Which keys are required is not shown in this
# notebook — presumably credentials for LangChain/LangGraph tooling; verify.
from dotenv import load_dotenv

load_dotenv()

In [None]:
pip install langgraph-checkpoint-sqlite langgraph

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from typing import TypedDict, Optional
import matplotlib.pyplot as plt
from langgraph.graph import END, StateGraph
from langgraph.checkpoint.sqlite import SqliteSaver

# Define the GraphState class
class GraphState(TypedDict):
    """State dictionary shared by every node of the training workflow.

    Fields are declared without right-hand-side values: assigning a value in
    a ``TypedDict`` body is rejected by static type checkers and does not give
    the resulting dict a default. The notebook's entry call supplies only
    ``goal_threshold``; the remaining keys are written by the nodes.
    """

    # Preprocessed feature columns used for training and testing.
    data: Optional[pd.DataFrame]
    # Target (label) column of the dataset.
    target: Optional[pd.Series]
    # Hyperparameters passed to the model as keyword arguments.
    params: Optional[dict]
    # The most recently trained decision tree.
    model: Optional[DecisionTreeClassifier]
    # F1 score of the most recent evaluation.
    f1_score: Optional[float]
    # Human-readable status message for the current step (logging/debugging).
    response: Optional[str]
    # F1 score the model has to exceed for the workflow to finish successfully.
    goal_threshold: Optional[float]
    # Held-out feature rows used for evaluation.
    X_test: Optional[pd.DataFrame]
    # Held-out labels used for evaluation.
    y_test: Optional[pd.Series]
    # One entry per evaluation, recording the parameters and the F1 score.
    history: Optional[list]
    # Number of parameter adjustments performed so far.
    iterations: Optional[int]
    # File name of the saved performance plot; declared so the key returned by
    # the visualization node is part of the state schema.
    plot_filename: Optional[str]

# Load and preprocess the data
def retrieve_data(state: GraphState) -> GraphState:
    """Download the Titanic CSV, preprocess it and seed the workflow state.

    Returns a copy of ``state`` extended with the feature frame (``data``),
    the ``Survived`` labels (``target``), the starting hyperparameters
    (``max_depth`` of 2), an empty ``history`` and an ``iterations`` counter
    set to 0.
    """
    url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    frame = pd.read_csv(url).drop(columns=['Name', 'Ticket', 'Cabin'])

    # Fill gaps: median for the numeric Age, most frequent value for Embarked
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    most_common_port = frame['Embarked'].mode()[0]
    frame['Embarked'] = frame['Embarked'].fillna(most_common_port)

    # Turn the two categorical columns into integer codes
    for column in ('Sex', 'Embarked'):
        frame[column] = LabelEncoder().fit_transform(frame[column])

    # Split the label column off the feature frame
    target = frame.pop('Survived')

    seeded = dict(state)
    seeded.update(
        data=frame,
        target=target,
        params={"max_depth": 2},
        history=[],
        iterations=0,
    )
    return seeded

# Train the model
def train_model(state: GraphState) -> GraphState:
    """Fit a decision tree on a 70/30 train/test split of the state's data.

    The split uses a fixed ``random_state`` of 42. The fitted model and the
    held-out test portion are stored in the returned state under ``model``,
    ``X_test`` and ``y_test``.
    """
    features, labels = state["data"], state["target"]
    hyperparams = state["params"]

    split = train_test_split(features, labels, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = DecisionTreeClassifier(**hyperparams)
    model.fit(X_train, y_train)

    updated = dict(state)
    updated.update(model=model, X_test=X_test, y_test=y_test)
    return updated

# # Evaluate the model
# def evaluate_model(state: GraphState) -> GraphState:
#     model = state["model"]
#     X_test = state["X_test"]
#     y_test = state["y_test"]
#     y_pred = model.predict(X_test)
#     f1_score_value = f1_score(y_test, y_pred, average='weighted')
#     history = state["history"]
#     history.append({"params": state["params"].copy(), "f1_score": f1_score_value})
#     return {**state, "f1_score": f1_score_value, "response": f"Model F1 score: {f1_score_value:.2f}", "history": history}

# Evaluate the model
def evaluate_model(state: GraphState) -> GraphState:
    """Score the current model on the held-out split and record the result.

    Computes the weighted F1 score of ``state["model"]`` on ``X_test`` /
    ``y_test``, adds a ``{"params", "f1_score"}`` entry to the history and
    prints every history entry collected so far.

    Returns a copy of ``state`` with ``f1_score``, ``response`` and
    ``history`` updated.
    """
    model = state["model"]
    X_test = state["X_test"]
    y_test = state["y_test"]
    y_pred = model.predict(X_test)
    f1_score_value = f1_score(y_test, y_pred, average='weighted')

    # Build a new list instead of appending to the incoming one, so the state
    # object handed to this node is not mutated; a missing or None history is
    # treated as empty.
    entry = {"params": state["params"].copy(), "f1_score": f1_score_value}
    history = [*(state.get("history") or []), entry]

    # Print each entry in the history one by one
    for past in history:
        print(f"Max Depth: {past['params']['max_depth']}, F1 Score: {past['f1_score']}")

    return {**state, "f1_score": f1_score_value, "response": f"Model F1 score: {f1_score_value:.2f}", "history": history}


# Adjust parameters
def adjust_parameters(state: GraphState) -> GraphState:
    """Increase ``max_depth`` by one and count the adjustment.

    Returns a copy of ``state`` with a new ``params`` dict (``max_depth``
    incremented) and ``iterations`` incremented by one. The incoming
    ``state["params"]`` dict is left untouched.
    """
    current = state["params"]
    # Copy rather than `+=` on the shared dict so the caller's state is not
    # modified behind its back.
    params = {**current, "max_depth": current["max_depth"] + 1}
    iterations = state["iterations"] + 1
    print(f"Adjusting parameters: Iteration {iterations}, Max Depth {params['max_depth']}")
    return {**state, "params": params, "iterations": iterations}

# Check if the accuracy is sufficient
def is_sufficient_accuracy(state: GraphState) -> str:
    """Pick the branch to follow after an evaluation.

    Returns ``"sufficient"`` when the F1 score exceeds the goal threshold,
    otherwise ``"stop"`` once the iteration counter has reached 20, and
    ``"insufficient"`` in every other case.
    """
    threshold = state["goal_threshold"]

    score = state["f1_score"]
    if score and score > threshold:
        return "sufficient"

    # The goal check comes first, so the cap only applies to unmet goals
    rounds = state["iterations"]
    return "stop" if rounds and rounds >= 20 else "insufficient"


# Handle stop condition
def handle_stop_condition(state: GraphState) -> GraphState:
    """Report the last F1 score and mark the run as stopped.

    Prints the score with four decimals and returns a copy of ``state`` whose
    ``response`` says the maximum number of iterations was reached.
    """
    last_score = state["f1_score"]
    print(f"최종 Final F1 Score: {last_score:.4f}")

    stopped = dict(state)
    stopped["response"] = "Stopping the workflow due to reaching the maximum iterations."
    return stopped

# Visualize the performance
def visualize_performance(state: GraphState) -> GraphState:
    """Plot F1 score against ``max_depth`` for every recorded evaluation.

    The figure is written to ``model_performance.png`` and then closed. The
    returned state carries a completion message in ``response`` and the file
    name under ``plot_filename``.
    """
    runs = state["history"]
    x_depths = [run["params"]["max_depth"] for run in runs]
    y_scores = [run["f1_score"] for run in runs]
    plot_filename = 'model_performance.png'

    plt.figure(figsize=(10, 6))
    plt.plot(x_depths, y_scores, color='b', linestyle='-', marker='o')
    plt.xlabel('Max Depth')
    plt.ylabel('F1 Score')
    plt.title('Model Performance vs. Max Depth')
    plt.grid(True)
    plt.savefig(plot_filename)
    plt.close()

    message = f"Model training complete. Performance plot saved as '{plot_filename}'"
    return {**state, "response": message, "plot_filename": plot_filename}

# Build the workflow graph over the shared GraphState schema
workflow = StateGraph(GraphState)

# Register each node function under the name the edges refer to
for node_name, node_fn in (
    ("retrieve_data", retrieve_data),
    ("train_model", train_model),
    ("evaluate_model", evaluate_model),
    ("adjust_parameters", adjust_parameters),
    ("handle_stop_condition", handle_stop_condition),
    ("visualize_performance", visualize_performance),
):
    workflow.add_node(node_name, node_fn)

# Fixed hand-offs: load the data, train, then evaluate
workflow.add_edge("retrieve_data", "train_model")
workflow.add_edge("train_model", "evaluate_model")

# After each evaluation, route on the label returned by is_sufficient_accuracy
routes = {
    "sufficient": "visualize_performance",
    "insufficient": "adjust_parameters",
    "stop": "handle_stop_condition",
}
workflow.add_conditional_edges("evaluate_model", is_sufficient_accuracy, routes)

# Adjusted parameters loop back into training; both terminal nodes end the run
workflow.add_edge("adjust_parameters", "train_model")
workflow.add_edge("handle_stop_condition", END)
workflow.add_edge("visualize_performance", END)

# The run starts by loading the data
workflow.set_entry_point("retrieve_data")

from langchain_core.runnables import RunnableConfig

# Raise the recursion limit to 2 + 3 * 20 + 3 = 65 steps. The 3 * 20 term
# presumably budgets train/evaluate/adjust for each of the 20 allowed
# iterations, with the remainder as slack — TODO confirm.
config = RunnableConfig(
    configurable={"thread_id": "THREAD_ID"},
    recursion_limit=2 + 3 * 20 + 3,
)

# SqliteSaver checkpointing is left disabled (the original note says it could
# not be used):
# memory = SqliteSaver.from_conn_string(":memory:")

# Compile the workflow; no checkpointer is passed
app = workflow.compile()

# Only the target F1 score is supplied up front
inputs = {
    "goal_threshold": 0.81,  # original note: "80" (presumably an earlier setting — TODO confirm)
}
result = app.invoke(inputs, config=config)

# Show the final state returned by the run
print(result)

In [None]:
# Pull the evaluation history out of the final state (empty list if absent)
history = result.get('history', [])

# List every recorded evaluation under a heading
print("Extracted History:")
for entry in history:
    depth = entry['params']['max_depth']
    score = entry['f1_score']
    print(f"Max Depth: {depth}, F1 Score: {score}")

In [None]:

from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline
graph_png = app.get_graph().draw_mermaid_png()
display(Image(graph_png))