In [8]:
import tkinter as tk
from tkinter import messagebox, END
import pandas as pd
import string
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # New Import
import numpy as np

# --- 1. FEATURE CALCULATION FUNCTIONS ---

def count_punct(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    # A generator feeds sum() directly; no intermediate list is needed.
    return sum(1 for ch in text if ch in string.punctuation)

def count_digits(text):
    """Return how many characters of ``text`` satisfy ``str.isdigit()``."""
    total = 0
    for ch in text:
        if ch.isdigit():
            total += 1
    return total

def calculate_input_features(text):
    """Build the one-row feature matrix for a single message.

    The row holds, in order: the character length of ``text``, its
    punctuation count, and its digit count. The result is a 2-D NumPy
    array with one row and three columns.
    """
    feature_row = [len(text), count_punct(text), count_digits(text)]
    return np.array([feature_row])

# --- 2. MODEL INITIALIZATION AND TRAINING (WITH 70/30 SPLIT) ---

# Global variables for model and scaler (necessary since functions need to access them).
# Both stay None when loading or training fails, which spam_predict() checks for.
knc_spam = None
scaler_spam = None

# Raw string: "\D" and "\s" are not valid escape sequences, and newer Python
# versions warn about them inside an ordinary string literal. The resulting
# path value is unchanged.
SPAM_CSV_PATH = r"E:\Data Sets\spam.csv"

try:
    df_spam = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
    # Keep only the first two columns and give them fixed names.
    df_spam = df_spam.iloc[:, :2]
    df_spam.columns = ['label', 'message']
    # Encode the target numerically: ham -> 0, spam -> 1.
    df_spam['label'] = df_spam['label'].map({'ham': 0, 'spam': 1})

    # Feature Engineering (same three features calculate_input_features() builds)
    df_spam['length'] = df_spam['message'].apply(len)
    df_spam['punct_count'] = df_spam['message'].apply(count_punct)
    df_spam['digit_count'] = df_spam['message'].apply(count_digits)

    # Prepare features and target
    X_spam = df_spam[['length', 'punct_count', 'digit_count']].values
    y_spam = df_spam['label'].values

    # --- STEP: 70% Train / 30% Test Split ---
    X_train, X_test, y_train, y_test = train_test_split(
        X_spam, y_spam, test_size=0.3, random_state=42
    )

    # Scale Features (Essential for KNN) - FIT only on the training data (X_train)
    scaler_spam = StandardScaler()
    X_scaled_train = scaler_spam.fit_transform(X_train)

    # Train the KNN model on the training data
    knc_spam = KNeighborsClassifier(n_neighbors=5)
    knc_spam.fit(X_scaled_train, y_train)

    print("Model trained on 70% of data.")

    # Score the held-out 30% so the split is actually used. The test data is
    # only transformed with the scaler fitted on the training data.
    X_scaled_test = scaler_spam.transform(X_test)
    test_accuracy = knc_spam.score(X_scaled_test, y_test)
    print(f"Accuracy on held-out 30% of data: {test_accuracy:.3f}")

except FileNotFoundError:
    # Report the path that was actually tried; the file is not looked up
    # relative to the script's directory.
    messagebox.showerror("Error", f"The 'spam.csv' file was not found. Expected it at: {SPAM_CSV_PATH}")
    knc_spam = None
    scaler_spam = None
except Exception as e:
    # Top-level boundary: any other failure disables prediction instead of
    # crashing before the window opens.
    messagebox.showerror("Error", f"Failed to train model due to data error: {e}")
    knc_spam = None
    scaler_spam = None

# --- 3. TKINTER PREDICTION LOGIC ---

def spam_predict():
    """Classify the text in the message box and show the verdict.

    Shows a warning and returns early when no trained model is available
    or when the message box is blank. Otherwise the message is turned into
    features, scaled with the already-fitted scaler, classified by the KNN
    model, and the result field is overwritten with "SPAM" or "HAM". Any
    exception raised along the way is reported in an error dialog.
    """
    if knc_spam is None:
        messagebox.showwarning("Model Error", "Model is not trained. Check for errors at startup.")
        return

    # "end-1c" stops one character before the end of the widget content;
    # surrounding whitespace is stripped as well.
    user_text = message_entry.get("1.0", "end-1c").strip()
    if not user_text:
        messagebox.showwarning("Input Error", "Please enter a message.")
        return

    try:
        # Transform only: the scaler was fitted during training.
        scaled_row = scaler_spam.transform(calculate_input_features(user_text))
        predicted_label = knc_spam.predict(scaled_row)[0]

        if predicted_label == 1:
            verdict = "SPAM"
        else:
            verdict = "HAM"

        # Replace whatever the result field currently shows.
        result_entry.delete(0, END)
        result_entry.insert(0, verdict)
    except Exception as err:
        messagebox.showerror("Prediction Error", f"Invalid input format or calculation error: {err}")

def clear_spam_entries():
    """Reset the form by emptying the prediction field and the message box."""
    result_entry.delete(0, END)
    message_entry.delete("1.0", END)

# --- 4. TKINTER GUI SETUP ---

def on_closing():
    """Ask the user to confirm, and destroy the main window only on OK."""
    confirmed = messagebox.askokcancel("Exit", "Do you really want to exit?")
    if not confirmed:
        return
    mywin_spam.destroy()

# The main window setup
mywin_spam = tk.Tk()
try:
    mywin_spam.state("zoomed")
except tk.TclError:
    # Not every Tk platform accepts the "zoomed" window state (X11 builds
    # reject it); there the maximized window is requested through the
    # -zoomed window attribute instead.
    mywin_spam.attributes("-zoomed", True)
mywin_spam.resizable(width=False, height=False)
mywin_spam.configure(bg="orange")
mywin_spam.title("KNN Spam Detector (70/30 Split)")
# Route the window-manager close button through the confirmation dialog.
mywin_spam.protocol("WM_DELETE_WINDOW", on_closing)

lbl_title = tk.Label(
    mywin_spam,
    text="Spam Message Detection",
    font=('', 40, 'bold', 'underline'),
    bg='lightgreen',
)
lbl_title.place(relx=.1, rely=0)

# Content frame placed below the title; all form widgets live inside it.
frm = tk.Frame(mywin_spam, bg='sky blue')
frm.place(relx=0, rely=.12, relwidth=1, relheight=1)

# Input Labels and Entry
message_lbl = tk.Label(frm, text="Enter Message:", font=('', 20, 'bold'), bg='Gold')
message_lbl.place(relx=.2, rely=.2)

# Note: Tkinter Text widget is used for multi-line input
message_entry = tk.Text(frm, font=('', 14), bd=5, height=10, width=60)
message_entry.place(relx=.4, rely=.2)

# Result Label and Entry
result_lbl = tk.Label(frm, text="Prediction:", font=('', 20, 'bold'), bg='Gold')
result_lbl.place(relx=.2, rely=.6)

# spam_predict() writes "SPAM" / "HAM" into this field.
result_entry = tk.Entry(frm, font=('', 20, 'bold'), bd=5, fg='blue')
result_entry.place(relx=.4, rely=.6)

# Buttons
btn_predict = tk.Button(
    frm,
    command=spam_predict,
    text="Predict",
    font=('', 20, 'bold'),
    bd=10,
    width=10,
    bg='lightgreen',
)
btn_predict.place(relx=.4, rely=.75)

btn_clear = tk.Button(
    frm,
    command=clear_spam_entries,
    text="Clear",
    font=('', 20, 'bold'),
    bd=10,
    width=10,
    bg='yellow',
)
btn_clear.place(relx=.6, rely=.75)

# Blocks here until the main window is destroyed.
mywin_spam.mainloop()

Model trained on 70% of data.
