In [1]:
from typing import TypedDict, Annotated
from langgraph.prebuilt import ToolNode
from langgraph.graph import START, StateGraph, END
from langgraph.prebuilt import tools_condition
from langchain_google_genai import ChatGoogleGenerativeAI
from IPython.display import Image, display
import pandas as pd
import numpy as np
import json

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")

In [3]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",  
    temperature=0.6,
    google_api_key=api_key 
)

In [4]:
from typing import TypedDict, Optional
import pandas as pd
import operator

class PreprocessingState(TypedDict):
    raw_data: Optional[pd.DataFrame]
    cleaned_data: Optional[pd.DataFrame]

In [5]:
def correcting_inconsistent_data(state: PreprocessingState):
    df = state["raw_data"]
    initial_rows, initial_cols = df.shape
    df_json = df.to_json()

    prompt = f"""
You are a data preprocessing assistant.

Below is a JSON string representing a pandas DataFrame:
{df_json}

Original shape: {initial_rows} rows × {initial_cols} columns

Instructions:
1. Identify and standardize inconsistent text data (e.g., typos, case variations, abbreviations) in categorical columns. For example: 
    - Convert all variations like 'İstanbul', 'İSTANBUL', 'ist.', 'istanbul' → to a standardized form: 'istanbul' 
    - Apply case normalization by converting all text to lowercase (e.g., 'New York' → 'new york', 'USA' → 'usa')
    - Preserve original language characters (e.g., Turkish 'İ', 'ı', 'ş')
2. Ensure all rows and columns are aligned
3. Return STRICTLY ONLY JSON (NO COMMENTS, NO MARKDOWN, ONLY JSON!)(The JSON format should only contain the dataset. Don't include anything.).
    """
    
    response = llm.invoke(prompt).content

    json_string = response.strip().replace("```json\n", "").replace("\n```", "").strip("'")
    
    cleaned_df = pd.read_json(json_string)

    return {"cleaned_data": cleaned_df}


In [6]:
def missing_value(state: PreprocessingState):
    df = state["cleaned_data"]
    initial_rows, initial_cols = df.shape
    df_json = df.to_json()

    prompt = f"""
You are a data preprocessing assistant.

Below is a JSON string representing a pandas DataFrame:
{df_json}

Original shape: {initial_rows} rows × {initial_cols} columns

Instructions:
1. Process the DataFrame in the exact order below:

   Step 1: Column removal
   - Drop any column where more than 50% of entries are missing.
   - Drop any identifier column (case-insensitive match on "id", "ID", "url", or "URL").
   - Drop any column with extremely high cardinality (more than 1,000 unique values) that is unlikely to aid modeling.

   Step 2: Duplicate removal
   - Remove duplicate rows, keeping only the first occurrence.

   Step 3: Missing-value imputation
   - Treat as missing: `null`, `NaN`, `None`, or the string "nan".
   - For numeric columns (dtype int or float):
     1. Compute the median of all non-missing values.
     2. Replace all missing entries in that column with the computed median.
   - For categorical columns (dtype object or string):
     1. Compute the mode (most frequent non-missing value) excluding missing.
     2. Replace all missing entries in that column with the computed mode.
     3. Ensure this step applies to every categorical column.

2. Preserve the remaining rows and their original order (aside from removed duplicates).
3. Output the final processed DataFrame strictly as a JSON array of record objects.
4. Do not include any comments, markdown, explanations, or extra text—only the raw JSON array.
5. Return strictly only JSON (no additional text).
"""

    response = llm.invoke(prompt).content

    json_string = response.strip().replace("```json\n", "").replace("\n```", "").strip("'")
    
    cleaned_df = pd.read_json(json_string)

    return {"cleaned_data": cleaned_df}


In [7]:
def encoding_handler(state: PreprocessingState):
    df = state["cleaned_data"]
    rows, cols = df.shape
    df_json = df.to_json()

    prompt = f"""
You are a data preprocessing assistant specializing in categorical data encoding.

Below is a JSON string representing a pandas DataFrame:
{df_json}

Original shape: {rows} rows × {cols} columns

Instructions:
1. Identify all categorical (string) columns.
2. Encoding Strategy:
   If a column meets any of the following 3 criteria:
        -Has more than 10 unique values,
        -Consists of only 2 distinct categories, or
        -Represents an ordinal relationship (e.g., education level, difficulty level like *easy-medium-hard*),
    then apply label encoding to that column. Otherwise, use one-hot encoding.
3. Ensure all rows and columns are aligned.
4. Return STRICTLY ONLY JSON (NO COMMENTS, NO MARKDOWN, ONLY PURE JSON!). The output should be the transformed dataset with no additional text.
"""

    response = llm.invoke(prompt).content

    json_string = response.strip().replace("```json\n", "").replace("\n```", "").strip("'")
    
    cleaned_df = pd.read_json(json_string)

    return {"cleaned_data": cleaned_df}


In [8]:
def scaler(state:PreprocessingState):
    df = state["cleaned_data"]
    rows, cols = df.shape
    df_json = df.to_json()

    prompt = f"""
You are a data preprocessing assistant specializing in numerical data scaling.
Below is a JSON string representing a pandas DataFrame:
{df_json}

Original shape: {rows} rows × {cols} columns

Instructions:
1. Automatically detect all numeric columns (dtype int or float).
2. Apply StandardScaler to numeric columns, excluding the columns where label or one-hot encoding have been applied.
3. Ensure all rows and columns are aligned.
4. Return STRICTLY ONLY JSON (NO COMMENTS, NO MARKDOWN, ONLY PURE JSON!). The output should be the transformed dataset with no additional text.
"""
    response = llm.invoke(prompt).content

    json_string = response.strip().replace("```json\n", "").replace("\n```", "").strip("'")
    
    cleaned_df = pd.read_json(json_string)

    return {"cleaned_data": cleaned_df} 

In [9]:
builder = StateGraph(PreprocessingState)

In [10]:
builder.add_node("correcting_inconsistent_data", correcting_inconsistent_data)
builder.add_node("missing_value", missing_value)
builder.add_node("encoding", encoding_handler)
builder.add_node("scaler", scaler)

builder.add_edge(START,"correcting_inconsistent_data")
builder.add_edge("correcting_inconsistent_data", "missing_value")
builder.add_edge("missing_value", "encoding")
builder.add_edge("encoding", "scaler")
builder.add_edge("scaler", END)

graph= builder.compile()
#display(Image(graph.get_graph().draw_mermaid_png()))

ascii_art = graph.get_graph().draw_ascii()
print(ascii_art)

          +-----------+          
          | __start__ |          
          +-----------+          
                *                
                *                
                *                
+------------------------------+ 
| correcting_inconsistent_data | 
+------------------------------+ 
                *                
                *                
                *                
        +---------------+        
        | missing_value |        
        +---------------+        
                *                
                *                
                *                
          +----------+           
          | encoding |           
          +----------+           
                *                
                *                
                *                
           +--------+            
           | scaler |            
           +--------+            
                *                
                *                
              

In [11]:
np.random.seed(42)
# Generate synthetic data
data = {
    'age': np.random.randint(18, 70, 50).astype(float),
    'income': np.random.normal(50000, 15000, 50),
    'city': np.random.choice(['New York', 'LONDON', 'paris', 'TOKYO', 'new york', 'London', np.nan], 
                           50, p=[0.2, 0.15, 0.15, 0.15, 0.15, 0.15, 0.05]),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD', np.nan], 
                50, p=[0.3, 0.4, 0.2, 0.05, 0.05]),
    'gender': np.random.choice(['Male', 'Female', np.nan], 50, p=[0.45, 0.45, 0.1])
}

# Introduce null values in numeric columns
mask = np.random.random((50, 2)) < 0.1  # 10% nulls for numeric cols
data['age'] = np.where(mask[:, 0], np.nan, data['age'])
data['income'] = np.where(mask[:, 1], np.nan, data['income'])

# Create DataFrame
df = pd.DataFrame(data)

In [12]:
df

Unnamed: 0,age,income,city,education,gender
0,56.0,40990.419651,LONDON,High School,Male
1,69.0,45624.593753,TOKYO,Bachelor,Female
2,46.0,40974.400817,London,,Female
3,32.0,,LONDON,High School,Male
4,60.0,49797.541629,paris,Bachelor,Male
5,25.0,34134.336066,new york,Master,Female
6,38.0,62338.173682,LONDON,High School,Male
7,56.0,31687.34525,New York,Master,Male
8,36.0,53132.953925,LONDON,Bachelor,Female
9,40.0,20604.948142,New York,Bachelor,Female


In [13]:
final_state = graph.invoke({"raw_data":df})

  cleaned_df = pd.read_json(json_string)
  cleaned_df = pd.read_json(json_string)
  cleaned_df = pd.read_json(json_string)
  cleaned_df = pd.read_json(json_string)


In [14]:
final_state["cleaned_data"]

Unnamed: 0,age,income,city_london,city_new york,city_paris,city_tokyo,education,gender
0,0.717494,-0.541923,1,0,0,0,-0.883883,1
1,1.627986,-0.339067,0,0,0,1,-1.897927,0
2,0.014753,-0.542525,1,0,0,0,-1.897927,0
3,-0.895739,-0.072364,1,0,0,0,-0.883883,1
4,0.95087,-0.042973,0,0,1,0,-1.897927,1
5,-1.36148,-0.834519,0,1,0,0,0.129505,0
6,-0.53,0.585622,1,0,0,0,-0.883883,1
7,0.717494,-0.93495,0,1,0,0,0.129505,1
8,-0.641183,0.103715,1,0,0,0,-1.897927,0
9,-0.306624,-1.497794,0,1,0,0,-1.897927,0
