<a href="https://colab.research.google.com/github/rpa-KMG/Gen-Data-AI/blob/main/GenData_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***INSTALLATION OF FLASK, OPENPYXL AND PYNGROK***

In [None]:
# @title
!pip install flask openpyxl pyngrok

***DRIVE PATH***

In [None]:
# @title
import os

# Sanity-check cell: report whether the schema directory on Google Drive is
# reachable. The Drive is only mounted by a later cell, so the directory may
# legitimately be missing here; listing it unconditionally would raise
# FileNotFoundError and abort the cell.
schema_dir = "/content/drive/MyDrive/Colab Notebooks"
schema_dir_exists = os.path.exists(schema_dir)

print("Schema dir exists:", schema_dir_exists)
if schema_dir_exists:
    print("Files in schema dir:", os.listdir(schema_dir))
else:
    print("Files in schema dir: unavailable (mount Google Drive first)")


***GENDATA AI- SOURCE CODE***

In [None]:
# @title

# ======================= IMPORTS =======================

from flask import Flask, request, send_file, render_template_string
from datetime import datetime
from openpyxl import load_workbook
from pyngrok import ngrok
import random
import json
import tempfile
import os
import re
import random
from google.colab import drive
drive.mount('/content/drive')
from openpyxl.utils.cell import coordinate_to_tuple

def is_merged_follower_cell(ws, cell):
    """Return True if *cell* is a non-anchor member of a merged range.

    Args:
        ws: Worksheet whose ``merged_cells.ranges`` are inspected.
        cell: Cell to test; its ``coordinate``, ``row`` and ``column``
            attributes are read.

    Returns:
        True when the cell lies inside a merged range but is not that range's
        top-left (anchor) cell; False otherwise, including for cells that are
        not merged at all.
    """
    for merged_range in ws.merged_cells.ranges:
        if cell.coordinate in merged_range:
            # Read the anchor from the named attributes. ``bounds`` is ordered
            # (min_col, min_row, max_col, max_row), so unpacking it as
            # (min_row, min_col, ...) swaps the axes and misclassifies the
            # anchor of any range that is not on the sheet diagonal.
            anchor = (merged_range.min_row, merged_range.min_col)
            if (cell.row, cell.column) != anchor:
                return True
    return False


# ======================= HTML =======================

HTML_FORM = """
<!doctype html>
<html>
<head>
  <title>GenData AI</title>
  <style>
    body {
      font-family: Arial;
      background:#f4f6f9;
      display:flex;
      justify-content:center;
      align-items:center;
      height:100vh;
      margin:0;
    }
    .box {
      background:white;
      padding:30px;
      border-radius:10px;
      width:420px;
      box-shadow:0 4px 10px rgba(0,0,0,.1);
    }
    h2 { text-align:center; margin-bottom:5px; }
    h4 { text-align:center; margin:0 0 20px 0; font-weight:normal; color:#555; }

    form {
      display:flex;
      flex-direction:column;
      gap:14px;   /* single source of spacing */
    }

    label {
      font-weight:bold;
      margin:0;
    }

    input, select {
      width:100%;
      padding:10px;
      border-radius:5px;
      border:1px solid #ccc;
      font-size:14px;
      box-sizing:border-box;
    }

    button {
      width:100%;
      padding:12px;
      background:#4CAF50;
      color:white;
      border:none;
      border-radius:5px;
      font-size:15px;
      cursor:pointer;
      margin-top:10px;
    }

    button:hover {
      background:#43a047;
    }
  </style>
</head>

<body>
<div class="box">
  <h2>GenData AI</h2>
  <h4>Your smart test data engineer — one Excel away</h4>
  {% if error %}
    <div style="
      background:#ffe6e6;
      color:#b30000;
      padding:10px;
      border-radius:5px;
      text-align:center;
      font-size:14px;
    ">
      {{ error }}
    </div>
    {% endif %}
  <form method="POST" action="/generate" enctype="multipart/form-data">

    <label>Application Name</label>
    <select name="appname" required>
      <option value="" disabled selected>Select Application</option>
      <option value="Default">Default</option>
      <option value="AQ">AQ</option>
      <option value="Quincy">Quincy</option>
      <option value="IMS">IMS</option>
      <option value="SOV">SOV</option>
    </select>

    <label>Upload Excel Template</label>
    <input type="file" name="excel" required>

    <label>Number of Records</label>
    <input type="number" name="records" min="1" value="5">

    <button type="submit">Generate Test Data</button>

  </form>
</div>
</body>
</html>
"""

# ======================= CONFIG =======================

# SECURITY: an ngrok auth token is a credential and should not live in source.
# Supply it through the NGROK_AUTHTOKEN environment variable instead. The
# literal below is kept only as a fallback so existing runs keep working; it
# should be revoked/rotated and then removed.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN") or "38LPPedNPGIr5Tewd5XPHEYKnCv_47xp8iRD2nRqhwZMpSq2G"
# Directory on the mounted Google Drive that is scanned for "<app>.json" schemas.
SCHEMA_DIR = "/content/drive/MyDrive/Colab Notebooks"



# ======================= LOAD SCHEMAS =======================

def load_schema(appname):
    """Return the parsed JSON schema for *appname*, or None if there is none.

    Scans ``SCHEMA_DIR`` for a file named ``<appname>.json``; the file name
    comparison is case-insensitive.

    Args:
        appname: Application name used as the schema file's base name.

    Returns:
        The decoded JSON document of the first matching file, or None when no
        file in the directory matches.

    Raises:
        OSError: If ``SCHEMA_DIR`` cannot be listed or the file cannot be read.
        json.JSONDecodeError: If the matching file is not valid JSON.
    """
    wanted = f"{appname.lower()}.json"
    for entry in os.listdir(SCHEMA_DIR):
        if entry.lower() == wanted:
            # Decode explicitly instead of relying on the locale default;
            # "utf-8-sig" also accepts files saved with a leading BOM.
            with open(os.path.join(SCHEMA_DIR, entry), "r", encoding="utf-8-sig") as f:
                return json.load(f)
    return None

def load_all_schemas():
    """Load every JSON schema found in ``SCHEMA_DIR``.

    Returns:
        A list with the decoded content of each ``*.json`` file (extension
        matched case-insensitively), ordered by file name so that the
        precedence between schemas is the same on every run.

    Raises:
        OSError: If ``SCHEMA_DIR`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: If any schema file is not valid JSON.
    """
    schemas = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for entry in sorted(os.listdir(SCHEMA_DIR)):
        if entry.lower().endswith(".json"):
            # Decode explicitly; "utf-8-sig" also tolerates a leading BOM.
            with open(os.path.join(SCHEMA_DIR, entry), "r", encoding="utf-8-sig") as f:
                schemas.append(json.load(f))
    return schemas

# ======================= DATA GENERATOR =======================

def generate_value(rule, row_index, row_context):
    """Produce one cell value according to a schema column rule.

    Args:
        rule: Column rule dict. ``"generation"`` selects the strategy
            (default ``"random"``); ``"datatype"`` refines the random
            strategies. Strategy-specific keys read here are ``"start"`` /
            ``"step"``, ``"prefix"``, ``"value"``, ``"min_value"`` /
            ``"max_value"`` and ``"formula"``.
        row_index: Zero-based index of the record being generated.
        row_context: Mapping of column header to the value already generated
            for this row; used to resolve column names inside formulas.

    Returns:
        The generated value. None when the strategy/datatype combination is
        not handled or a calculated rule has no formula; ``0.0`` when a
        formula cannot be evaluated.
    """
    gen = rule.get("generation", "random")
    dt = rule.get("datatype")

    # SEQUENCE: arithmetic progression over the record index.
    if gen == "sequence":
        return rule.get("start", 1) + (row_index * rule.get("step", 1))

    # SEQUENCE TEXT: prefix followed by a 1-based counter.
    if gen == "sequence_text":
        prefix = rule.get("prefix", "")
        return f"{prefix}{row_index + 1}"

    # FIXED
    if gen == "fixed":
        return rule.get("value")

    # RANDOM & OPTIONAL RANDOM
    if gen in ("random", "optional_random"):
        if dt in ("currency", "integer"):
            min_v = rule.get("min_value", 0)
            max_v = rule.get("max_value", 100)
            val = random.uniform(min_v, max_v)
            # NOTE: integers are obtained by truncation, so max_value itself
            # is effectively never produced unless min_value == max_value.
            return round(val, 2) if dt == "currency" else int(val)
        if dt == "percentage":
            return round(random.uniform(0, 100), 2)
        # Other datatypes fall through to the final ``return None``.

    # CALCULATED / DERIVED
    if gen in ("calculated", "derived"):
        formula = rule.get("formula")
        if not formula:
            return None

        # Expand every "random(low, high)" into a concrete number. Optional
        # whitespace and a leading minus sign are accepted.
        number = r"-?(?:\d+\.?\d*|\.\d+)"

        def replacer(match):
            return str(random.uniform(float(match.group(1)), float(match.group(2))))

        expr = re.sub(
            r"random\s*\(\s*(" + number + r")\s*,\s*(" + number + r")\s*\)",
            replacer,
            formula,
        )

        # Substitute column names with this row's values in ONE pass, trying
        # longer names first. Sequential str.replace would corrupt the formula
        # when one header is a substring of another (e.g. "Item Value" inside
        # "Total Item Value") or when a substituted value contains a header.
        substitutions = {
            str(name): str(value).replace("$", "").replace(",", "")
            for name, value in row_context.items()
            if value is not None and str(name)
        }
        if substitutions:
            names = sorted(substitutions, key=len, reverse=True)
            name_pattern = re.compile("|".join(re.escape(n) for n in names))
            expr = name_pattern.sub(lambda m: substitutions[m.group(0)], expr)

        try:
            # SECURITY: eval() executes arbitrary Python and is NOT sandboxed.
            # Formulas must come only from trusted schema files; replace this
            # with a real expression parser before accepting untrusted schemas.
            return round(eval(expr), 2)
        except Exception:
            # Best effort: an unresolved name or malformed formula yields 0.
            return 0.00

    return None
# ======================= EXCEL PROCESSOR =======================

def header_match(excel_header, schema_header):
    """Tell whether two header labels are equal after normalization."""
    excel_key = normalize(excel_header)
    schema_key = normalize(schema_header)
    return excel_key == schema_key


def normalize(text):
    """Reduce *text* to its lowercase ASCII letters and digits.

    None maps to the empty string; any other value is converted with
    ``str()`` first, then lowercased and stripped of every character outside
    ``a-z`` and ``0-9``.
    """
    lowered = "" if text is None else str(text).lower()
    return re.sub(r"[^a-z0-9]", "", lowered)

def _find_rule(header, schemas):
    """Return the first non-empty column rule matching *header*, else None."""
    for schema in schemas:
        if not schema:
            continue
        for col_name, col_rule in schema.get("columns", {}).items():
            if header_match(header, col_name):
                if col_rule:
                    return col_rule
                # Only the first matching column of a schema is considered;
                # an empty rule there defers to the next schema.
                break
    return None


def generate_excel(input_path, output_path, records, appname):
    """Fill every sheet of a workbook with generated rows and save a copy.

    For each sheet, the header row is the first of rows 1-29 that holds at
    least five text cells longer than one character. Generated rows are
    written directly below it, overwriting whatever is already there. Sheets
    without such a header row are left untouched.

    Args:
        input_path: Path of the workbook to read.
        output_path: Path the modified workbook is saved to.
        records: Number of rows to generate per sheet.
        appname: ``"Default"`` to use every schema in the schema directory,
            otherwise the name of the single schema to load.
    """
    # Load schemas
    if appname != "Default":
        schema = load_schema(appname)
        schemas = [schema] if schema else []
    else:
        schemas = load_all_schemas()

    formula_kinds = ("calculated", "derived")
    wb = load_workbook(input_path)

    for sheet in wb.sheetnames:
        ws = wb[sheet]

        # ---- Detect header row ----
        header_row = None
        for r in range(1, 30):
            values = [ws.cell(r, c).value for c in range(1, ws.max_column + 1)]

            # Count text-like cells only.
            text_cells = [
                v for v in values
                if isinstance(v, str) and len(v.strip()) > 1
            ]

            if len(text_cells) >= 5:
                header_row = r
                break

        if not header_row:
            continue

        # ---- Read headers ----
        # An empty header cell inherits the label to its left, so columns
        # under a horizontally merged header share that header's name.
        headers = []
        last_header = None
        for c in range(1, ws.max_column + 1):
            v = ws.cell(header_row, c).value
            if v not in (None, ""):
                last_header = v
                headers.append(v)
            else:
                headers.append(last_header)
        print("Detected headers:", headers)

        # ---- Resolve the rule for each column once per sheet ----
        # The header-to-rule match does not depend on the row, so it is done
        # here rather than for every generated row.
        direct_columns = []   # (column index, header, rule) written in pass 1
        formula_columns = []  # (column index, header, rule) written in pass 2
        for c, header in enumerate(headers, start=1):
            rule = _find_rule(header, schemas)
            if not rule:
                continue
            if rule.get("generation") in formula_kinds:
                formula_columns.append((c, header, rule))
            elif header:
                direct_columns.append((c, header, rule))

        # Data starts on the row right after the header.
        start_row = header_row + 1

        # ---- Write data ----
        for row_index in range(records):
            r = start_row + row_index
            row_context = {}

            # PASS 1: non-calculated values (random, sequence, fixed).
            for c, header, rule in direct_columns:
                cell = ws.cell(r, c)
                if is_merged_follower_cell(ws, cell):
                    continue

                value = generate_value(rule, row_index, row_context)

                # Stored under the exact header text so formulas can refer to it.
                row_context[header] = value
                cell.value = value

            # PASS 2: calculated values, run after pass 1 so the inputs exist.
            for c, header, rule in formula_columns:
                if header in row_context:
                    continue  # a column with this header was already filled

                cell = ws.cell(r, c)
                if is_merged_follower_cell(ws, cell):
                    continue

                value = generate_value(rule, row_index, row_context)

                # Later formulas in the same row may depend on this result.
                row_context[header] = value
                cell.value = value

    wb.save(output_path)


# ======================= FLASK =======================

# Flask application object; the route handlers below register themselves on it.
app = Flask(__name__)

@app.route("/", methods=["GET"])
def home():
    """Serve the upload form with no error message."""
    page = render_template_string(HTML_FORM)
    return page

@app.route("/generate", methods=["POST"])
def generate():
    """Handle the upload form: validate it, fill the workbook, return it.

    Reads the ``appname`` and ``records`` form fields and the ``excel`` file
    upload. On a validation failure the form is re-rendered with an error
    message; otherwise the generated workbook is sent back as a download.
    """
    appname = request.form["appname"]
    file = request.files["excel"]

    # A non-numeric or non-positive count is a user error, not a server error.
    try:
        records = int(request.form["records"])
    except ValueError:
        records = 0
    if records < 1:
        return render_template_string(
            HTML_FORM,
            error="Number of records must be a whole number of at least 1."
        )

    uploaded_name = file.filename or ""
    filename = uploaded_name.lower()

    # Appname vs file validation
    if appname != "Default":
        APP_FILE_KEYWORDS = {
            "IMS": ["ims"],
            "SOV": ["sov", "schedule of value", "schedule-of-value"],
            "Quincy": ["quincy"],
            "AQ": ["aq"]
        }
        keywords = APP_FILE_KEYWORDS.get(appname, [])
        if not any(k in filename for k in keywords):
            return render_template_string(
                HTML_FORM,
                error=f"Selected application '{appname}' does not match uploaded file '{file.filename}'."
            )

    # The client controls the file name. Keep only its last path component so
    # the output path below cannot point outside the temp directory.
    safe_name = os.path.basename(uploaded_name.replace("\\", "/"))
    original_name = os.path.splitext(safe_name)[0] or "upload"
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    output_filename = f"{original_name}_Output_{timestamp}.xlsx"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as inp:
        file.save(inp.name)
        input_path = inp.name

    try:
        generate_excel(input_path, output_path, records, appname)
    finally:
        # The uploaded copy is no longer needed once processing has finished.
        try:
            os.remove(input_path)
        except OSError:
            pass  # best-effort cleanup

    # NOTE: the output file is left in the temp directory because it is still
    # being read while the response is sent.
    return send_file(
        output_path,
        as_attachment=True,
        download_name=output_filename
    )


# ======================= NGROK =======================

# Authenticate with ngrok and open a tunnel to local port 5000.
ngrok.set_auth_token(NGROK_TOKEN)
public_url = ngrok.connect(5000)
print("Public URL:", public_url)
# Diagnostics: the schema directory listing is printed twice, first without
# a label and then with one.
print(os.listdir(SCHEMA_DIR))

print("Schema directory files:", os.listdir(SCHEMA_DIR))

# Startup smoke test: check that a schema named "SOV" can be found and parsed.
test_schema = load_schema("SOV")
if test_schema:
    print("SOV schema loaded")
    print("SOV columns count:", len(test_schema.get("columns", {})))
else:
    print("SOV schema NOT loaded")

# Start the Flask server on the same port the tunnel points at.
app.run(port=5000)
