# In [None]:
import ast
import json
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ======================================================
# 0. DOWNLOAD NLP RESOURCES (only first time)
# ======================================================
# Fetch the NLTK corpora used later in this script: the English stop-word
# list and the WordNet data backing the lemmatizer.
for resource_name in ("stopwords", "wordnet"):
    nltk.download(resource_name)

# ======================================================
# 1. LOAD DATASET
# ======================================================
# Read the raw workbook into a DataFrame.
df = pd.read_excel("/content/drive/MyDrive/tnew.xlsx")

# ======================================================
# 2. DATE & TIMESTAMP CONVERSION
# ======================================================
# "Date" and "Timestamp" are treated as day counts and shifted from the
# 1899-12-30 origin to produce real datetimes.
# NOTE(review): presumably these are Excel serial dates - confirm against
# the workbook.
for day_count_column in ("Date", "Timestamp"):
    day_offsets = pd.to_timedelta(df[day_count_column], unit="D")
    df[day_count_column] = pd.to_datetime("1899-12-30") + day_offsets

# Calendar buckets derived from the converted "Date" column, stored as text.
date_accessor = df["Date"].dt
df["YearMonth"] = date_accessor.to_period("M").astype(str)
df["Week"] = date_accessor.to_period("W").astype(str)

# ======================================================
# 3. COMPLAINT COLUMN EXPANSION
# ======================================================
def parse_complaint(val):
    """Parse one raw "Complaint" cell into a dict.

    Strict JSON is tried first so that the words "true", "false" and "null"
    appearing inside string values are left untouched. Text that is not valid
    JSON (for example a single-quoted Python-style dict) falls back to
    ``ast.literal_eval`` after the JSON literals are rewritten to their
    Python spellings.

    Args:
        val: The raw cell value. A dict is returned unchanged; a string is
            parsed; anything else (None, NaN, numbers, ...) is treated as
            "no complaint data".

    Returns:
        The parsed dict, or ``{}`` when the value is missing, empty,
        unparseable, or parses to something other than a dict.
    """
    if isinstance(val, dict):
        return val
    if not isinstance(val, str):
        return {}

    text = val.strip()
    if not text:
        return {}

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # Not strict JSON: rewrite JSON literals and try Python literal syntax.
        python_text = (
            text.replace("true", "True")
            .replace("false", "False")
            .replace("null", "None")
        )
        try:
            parsed = ast.literal_eval(python_text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return {}

    # The result is fed to pd.json_normalize, which needs dict records.
    return parsed if isinstance(parsed, dict) else {}

# Parse every raw "Complaint" cell into a dict, then flatten those dicts into
# one column per key, each column name prefixed with "Complaint_".
df["Complaint_dict"] = df["Complaint"].apply(parse_complaint)
complaint_expanded = pd.json_normalize(df["Complaint_dict"])
complaint_expanded = complaint_expanded.rename(columns=lambda name: f"Complaint_{name}")

# ======================================================
# 4. FLAGS COLUMN EXPANSION
# ======================================================
def parse_flags(val):
    """Parse one raw "Call Flags" cell into a list of flag records.

    Strict JSON is tried first so that the words "true", "false" and "null"
    appearing inside string values are left untouched. Text that is not valid
    JSON (for example single-quoted Python-style literals) falls back to
    ``ast.literal_eval`` after the JSON literals are rewritten to their
    Python spellings.

    Args:
        val: The raw cell value. A list is returned unchanged and a dict is
            wrapped in a one-element list; a string is parsed; anything else
            (None, NaN, numbers, ...) is treated as "no flags".

    Returns:
        A list: the parsed list, a single parsed dict wrapped in a list, or
        ``[]`` when the value is missing, empty, unparseable, or parses to
        any other type.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, dict):
        return [val]
    if not isinstance(val, str):
        return []

    text = val.strip()
    if text in ("", "[]"):
        return []

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # Not strict JSON: rewrite JSON literals and try Python literal syntax.
        python_text = (
            text.replace("true", "True")
            .replace("false", "False")
            .replace("null", "None")
        )
        try:
            parsed = ast.literal_eval(python_text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]
    return []

# Parse every raw "Call Flags" cell into a list of flag records.
df["Flags_list"] = df["Call Flags"].apply(parse_flags)

# Largest number of flags on any row. An empty dataset gives 0 here; calling
# .max() on an empty Series would yield NaN and make range() below raise.
flag_counts = df["Flags_list"].apply(len)
max_flags = int(flag_counts.max()) if len(flag_counts) else 0

# Build one frame per flag position. Rows with fewer flags contribute an
# empty record so every frame keeps one row per input row.
flag_frames = []
for i in range(max_flags):
    temp = df["Flags_list"].apply(
        lambda flags, pos=i: flags[pos] if pos < len(flags) else {}
    )
    temp_df = pd.json_normalize(temp)
    temp_df = temp_df.rename(columns={k: f"Flag{i+1}_{k}" for k in temp_df.columns})
    flag_frames.append(temp_df)

# Concatenate once instead of growing a frame inside the loop, which re-copies
# all earlier columns on every iteration and starts from an empty frame.
if flag_frames:
    expanded_flags = pd.concat(flag_frames, axis=1)
else:
    # No flags anywhere: keep a column-less frame aligned to df's rows.
    expanded_flags = pd.DataFrame(index=df.index)

# ======================================================
# 5. AGENT KPI METRICS
# ======================================================
# Treat missing handled-call counts as zero so the total is always defined.
df["Inbound Handled"] = df["Inbound Handled"].fillna(0)
df["Outbound Handled"] = df["Outbound Handled"].fillna(0)
df["Total_Calls"] = df["Inbound Handled"] + df["Outbound Handled"]

# Zero denominators are replaced by NaN so the ratios below come out as NaN
# instead of dividing by zero.
calls_denominator = df["Total_Calls"].replace(0, np.nan)
login_denominator = df["Login Time"].replace(0, np.nan)
sla_denominator = (df["In SLA"] + df["Out SLA"]).replace(0, np.nan)

# Per-call averages.
df["AHT"] = (df["Inbound Handle Time"] + df["Outbound Handle Time"]) / calls_denominator
df["Avg_Talk_Time"] = df["Talk Time"] / calls_denominator
df["Avg_ACW_Time"] = df["ACW Time"] / calls_denominator

# Shares of login time, expressed as percentages.
df["Occupancy_Pct"] = (df["Handle Time"] / login_denominator) * 100
df["Availability_Pct"] = (df["Available Time"] / login_denominator) * 100

# Scaled by 100 like the other *_Pct columns (it was previously left as a
# 0-1 fraction despite the name).
df["SLA_Pct"] = (df["In SLA"] / sla_denominator) * 100

# The "Complaint_"-prefixed columns produced by the complaint expansion live
# in complaint_expanded and are not part of df at this point, so look there
# as well as in df itself.
if "Complaint_detected" in df.columns:
    complaint_detected = df["Complaint_detected"]
elif "Complaint_detected" in complaint_expanded.columns:
    complaint_detected = complaint_expanded["Complaint_detected"]
else:
    complaint_detected = None

if complaint_detected is None:
    df["Complaint_Flag"] = 0
else:
    # Missing values count as "no complaint"; a plain .astype(int) would
    # raise on them.
    df["Complaint_Flag"] = complaint_detected.map(
        lambda flag: 0 if pd.isna(flag) else int(flag)
    )

# ======================================================
# 6. MERGE ALL EXPANSIONS
# ======================================================
# Drop the intermediate parsed-object columns, then place the expanded
# complaint and flag columns beside the remaining ones.
merge_parts = [
    df.drop(["Complaint_dict", "Flags_list"], axis=1),
    complaint_expanded,
    expanded_flags,
]
df_final = pd.concat(merge_parts, axis=1)

# Write the merged (pre-NLP) dataset to disk without the index column.
df_final.to_csv("final_preprocessed_dataset.csv", index=False)

# ======================================================
# 7. NLP PREPROCESSING ON SUMMARY COLUMN
# ======================================================
# Lemmatizer used on each token of the cleaned summary text.
lemmatizer = WordNetLemmatizer()

# Extra words filtered out in addition to NLTK's English stop words.
extra_stops = {"please", "kindly", "hello", "hi", "thank", "thanks", "sir", "madam", ""}
stop_words = set(stopwords.words('english')) | extra_stops

def preprocess_summary(text):
    """Normalize one summary into a space-joined string of lemmatized tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is replaced by a space, stop words are removed and each
    remaining token is lemmatized.

    Args:
        text: Raw summary value. Missing values (None/NaN) are accepted, and
            non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    kept_tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    return " ".join(kept_tokens)


# Pass the raw values through. Casting with .astype(str) first would turn
# missing summaries into the literal text "nan", which bypasses the
# missing-value check above and ends up as a token in the output.
df_final["Summary_Preprocessed"] = df_final["Summary"].apply(preprocess_summary)

# ======================================================
# 8. SAVE FINAL DATASET
# ======================================================
# Write the fully processed dataset to disk without the index column.
df_final.to_csv("final_preprocessed_dataset_updated.csv", index=False)

# The message text was mojibake: the UTF-8 bytes of the emoji and the arrow
# had been decoded with a single-byte codec. Restored to the real characters.
print("🎉 FINAL FILE CREATED SUCCESSFULLY → final_preprocessed_dataset_updated.csv")
