In [None]:
import tkinter as tk
from tkinter import messagebox, ttk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import tree
from scipy.cluster.hierarchy import dendrogram, linkage
import pandas as pd
import networkx as nx
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import Counter
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from math import log2
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeRegressor
import flask

In [16]:
def visualize_apriori_fp_growth():
    """Show Apriori and FP-Growth association rules mined from ``EPL.csv``.

    Every line of the file is split on commas and treated as one transaction.
    The transactions are one-hot encoded with ``TransactionEncoder`` and mined
    twice with Apriori (support/confidence thresholds 0.1 and 0.12) and once
    with FP-Growth (0.1), always with a minimum lift of 2.

    A ``tk.Toplevel`` window with three tabs is then opened:

    * "Apriori Scatter"   - support vs. confidence, coloured by lift.
    * "Association Graph" - directed graph built from the 10 highest-lift
      Apriori rules (0.12 thresholds); each edge is labelled with the lift.
    * "FP-Growth Scatter" - the same scatter plot for the FP-Growth rules.

    Returns:
        None. The function only builds Tk widgets.

    Raises:
        OSError: if ``EPL.csv`` cannot be opened.
    """
    # --- Load Dataset ---
    # Loading and mining happen before any widget is created, so a missing
    # file or a mining error does not leave an empty window behind.
    # NOTE(review): the first line of the file is encoded as a transaction
    # too; if EPL.csv starts with a header row it probably should be skipped
    # - TODO confirm.
    dataset_path = "EPL.csv"  # Adjust the path if needed
    with open(dataset_path, "r") as f:
        transactions = [line.strip().split(',') for line in f]
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # --- Association Rule Mining ---
    def mine_rules(miner, min_support, min_confidence, min_lift, min_length):
        """Mine rules with ``miner`` (``apriori`` or ``fpgrowth``), filter and rank them."""
        frequent_itemsets = miner(df, min_support=min_support, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)
        # A rule always has at least one consequent, so a rule of
        # `min_length` items needs at least `min_length - 1` antecedents.
        rules = rules[(rules['confidence'] >= min_confidence) &
                      (rules['lift'] >= min_lift) &
                      (rules['antecedents'].apply(lambda items: len(items) >= min_length - 1))]
        # Strongest rules first, so that `.head(n)` really is the top n.
        return rules.sort_values(by=['lift', 'confidence'], ascending=False)

    apriori_rules = mine_rules(apriori, min_support=0.1, min_confidence=0.1, min_lift=2, min_length=2)
    graph_rules = mine_rules(apriori, min_support=0.12, min_confidence=0.12, min_lift=2, min_length=2)
    fp_rules = mine_rules(fpgrowth, min_support=0.1, min_confidence=0.1, min_lift=2, min_length=2)

    # --- Window with a Notebook (tabbed interface) ---
    window = tk.Toplevel()
    window.title("Apriori & FP-Growth Visualizations")
    window.geometry("800x600")

    notebook = ttk.Notebook(window)
    notebook.pack(fill='both', expand=True)

    def embed(fig, tab):
        """Render a matplotlib figure inside the given tab."""
        canvas = FigureCanvasTkAgg(fig, master=tab)
        canvas.draw()
        canvas.get_tk_widget().pack(fill='both', expand=True)

    def add_scatter_tab(tab_text, rules, plot_title):
        """Add a tab with a support/confidence scatter plot coloured by lift."""
        tab = ttk.Frame(notebook)
        notebook.add(tab, text=tab_text)
        fig = plt.Figure(figsize=(6, 4), dpi=100)
        ax = fig.add_subplot(111)
        points = ax.scatter(rules['support'], rules['confidence'], c=rules['lift'], cmap='coolwarm')
        fig.colorbar(points, ax=ax, label="Lift")
        ax.set_xlabel("Support")
        ax.set_ylabel("Confidence")
        ax.set_title(plot_title)
        embed(fig, tab)

    # --- Tab 1: Scatter Plot (Apriori Association Rules) ---
    add_scatter_tab("Apriori Scatter", apriori_rules, "Scatter Plot of Apriori Association Rules")

    # --- Tab 2: Association Rules Graph ---
    tab2 = ttk.Frame(notebook)
    notebook.add(tab2, text="Association Graph")
    fig2 = plt.Figure(figsize=(6, 4), dpi=100)
    ax2 = fig2.add_subplot(111)
    G = nx.DiGraph()
    # One edge per (antecedent item, consequent item) pair of the top 10 rules.
    for _, rule in graph_rules.head(10).iterrows():
        for antecedent in rule['antecedents']:
            for consequent in rule['consequents']:
                G.add_edge(antecedent, consequent, weight=rule['lift'])
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos, ax=ax2, edge_color='gray', node_color='lightblue', with_labels=True, font_size=8)
    edge_labels = {(u, v): f"{d['weight']:.2f}" for u, v, d in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, ax=ax2, edge_labels=edge_labels)
    ax2.set_title("Graph of Association Rules")
    embed(fig2, tab2)

    # --- Tab 3: Scatter Plot (FP-Growth Association Rules) ---
    add_scatter_tab("FP-Growth Scatter", fp_rules, "Scatter Plot of FP-Growth Association Rules")


In [17]:
def visualize_bayes():
    """Train a Gaussian Naive Bayes classifier on ``EPL.csv`` and show its results.

    Steps:
        1. Drop the ``MatchID``, ``Date``, ``Time`` and ``Referee`` columns.
        2. Fill missing values (mode for object columns, median otherwise).
        3. Label-encode the categorical columns and predict ``FullTimeResult``
           from all remaining columns.
        4. Split 80/20 (``random_state=42``), standardise using statistics of
           the training part only, fit ``GaussianNB`` and score the test part.

    Two ``tk.Toplevel`` windows are opened: a confusion-matrix heatmap and a
    panel with accuracy and weighted precision / recall / F1.

    Returns:
        None. The function only builds Tk widgets.
    """
    # Load and preprocess data
    df = pd.read_csv('EPL.csv')
    df = df.drop(columns=["MatchID", "Date", "Time", "Referee"])

    for column in df.columns:
        if df[column].dtype == "object":
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].median())

    categorical_columns = ["Season", "HomeTeam", "AwayTeam", "FullTimeResult", "HalfTimeResult"]
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    X = df.drop(columns=["FullTimeResult"])
    y = df["FullTimeResult"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 yields the same scores as the default but without
    # warnings when a class is never predicted.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # LabelEncoder maps classes_[i] -> i. Passing every code explicitly keeps
    # the matrix aligned with the tick labels even if some class is absent
    # from both the test labels and the predictions.
    result_classes = label_encoders["FullTimeResult"].classes_
    cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(result_classes)))

    # --- Window for Confusion Matrix ---
    cm_window = tk.Toplevel()
    cm_window.title("Confusion Matrix - Naïve Bayes")
    cm_window.geometry("700x500")

    fig_cm = plt.Figure(figsize=(6, 4), dpi=100)
    ax_cm = fig_cm.add_subplot(111)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=result_classes,
                yticklabels=result_classes,
                ax=ax_cm)
    ax_cm.set_xlabel("Predicted Label")
    ax_cm.set_ylabel("True Label")
    ax_cm.set_title("Confusion Matrix - Naïve Bayes")

    canvas_cm = FigureCanvasTkAgg(fig_cm, master=cm_window)
    canvas_cm.draw()
    canvas_cm.get_tk_widget().pack(fill="both", expand=True)

    # --- Window for Metrics ---
    metrics_window = tk.Toplevel()
    metrics_window.title("Naïve Bayes Metrics")
    metrics_window.geometry("400x250")

    # Create a frame to hold the labels
    frame = ttk.Frame(metrics_window, padding=20)
    frame.pack(fill="both", expand=True)

    # One label per score, stacked top to bottom
    score_lines = [
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
    ]
    for row, text in enumerate(score_lines):
        ttk.Label(frame, text=text, font=("Arial", 12)).grid(row=row, column=0, sticky="w", pady=5)


In [18]:
def visualize_id3():
    """Train an entropy-criterion decision tree on ``EPL.csv`` and display it.

    ``HomeTeam``, ``AwayTeam`` and ``HalfTimeResult`` are integer-coded with
    ``pd.factorize`` and used to predict ``FullTimeResult``. The information
    gain of every feature with respect to the target is printed, a
    ``DecisionTreeClassifier(criterion="entropy")`` is fitted on 90% of the
    rows and evaluated on the remaining 10%.

    Two ``tk.Toplevel`` windows are opened: the plotted tree and a panel with
    accuracy and weighted precision / recall / F1.

    Returns:
        None. Results are printed and shown in Tk windows.
    """
    target = 'FullTimeResult'

    # Load and preprocess data
    raw = pd.read_csv('EPL.csv')
    raw = raw[['HomeTeam', 'AwayTeam', 'FullTimeResult', 'HalfTimeResult']]
    # pd.factorize numbers categories in order of first appearance (missing
    # values become -1), so the code -> label table of the target is kept to
    # label the tree's classes correctly.
    target_labels = pd.factorize(raw[target])[1]
    df = raw.apply(lambda col: pd.factorize(col)[0])

    # Helper functions for entropy and information gain
    def entropy(data):
        """Shannon entropy (in bits) of the values in ``data``."""
        total = len(data)
        counts = Counter(data)
        return -sum((count / total) * log2(count / total) for count in counts.values())

    def information_gain(frame, feature, target_col):
        """Entropy of ``target_col`` minus its entropy after splitting on ``feature``."""
        total_entropy = entropy(frame[target_col])
        weighted_entropy = sum(
            (len(subset) / len(frame)) * entropy(subset)
            for _, subset in frame.groupby(feature)[target_col]
        )
        return total_entropy - weighted_entropy

    # Calculate information gain for each feature. The target itself is
    # excluded: its gain against itself equals the full entropy and would
    # always be picked as the "best" feature.
    feature_columns = [column for column in df.columns if column != target]
    info_gains = {feature: information_gain(df, feature, target) for feature in feature_columns}
    best_feature = max(info_gains, key=info_gains.get)
    print("\nInformation Gain for Each Feature:")
    for feature, gain in info_gains.items():
        print(f"{feature}: {gain:.4f}")
    print(f"\nBest Feature for Splitting: {best_feature}")

    # Prepare training data
    X = df[feature_columns]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Train the Decision Tree (ID3)
    tree_model = DecisionTreeClassifier(criterion="entropy", max_depth=3,
                                        min_samples_split=10, min_samples_leaf=5)
    tree_model.fit(X_train, y_train)

    # Translate the fitted integer classes back to the original result labels.
    class_names = [str(target_labels[code]) if code >= 0 else "missing"
                   for code in tree_model.classes_]

    # Predict and compute metrics
    y_pred = tree_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # --- Create new window for Decision Tree plot ---
    tree_window = tk.Toplevel()
    tree_window.title("ID3 Decision Tree")
    tree_window.geometry("800x600")

    fig_tree = plt.Figure(figsize=(10, 6), dpi=100)
    ax_tree = fig_tree.add_subplot(111)
    plot_tree(tree_model, feature_names=list(X.columns), class_names=class_names,
              filled=True, ax=ax_tree)

    canvas_tree = FigureCanvasTkAgg(fig_tree, master=tree_window)
    canvas_tree.draw()
    canvas_tree.get_tk_widget().pack(fill="both", expand=True)

    # --- Create new window for Metrics display ---
    metrics_window = tk.Toplevel()
    metrics_window.title("ID3 Metrics")
    metrics_window.geometry("400x300")

    frame = ttk.Frame(metrics_window, padding=20)
    frame.pack(fill="both", expand=True)

    ttk.Label(frame, text=f"Accuracy: {accuracy:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"Precision: {precision:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"Recall: {recall:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"F1 Score: {f1:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)


In [19]:
def visualize_cart():
    """Train a Gini-criterion decision tree on ``EPL.csv`` and display it.

    ``HomeTeam``, ``AwayTeam`` and ``HalfTimeResult`` are integer-coded with
    ``pd.factorize`` and used to predict ``FullTimeResult``. A
    ``DecisionTreeClassifier(criterion="gini")`` is fitted on 90% of the rows
    and evaluated on the remaining 10%.

    Three ``tk.Toplevel`` windows are opened: the plotted tree, a panel with
    accuracy and weighted precision / recall / F1, and a heatmap of the
    correlation matrix of the four integer-coded columns.

    Returns:
        None. The function only builds Tk widgets.
    """
    target = 'FullTimeResult'

    # Load and preprocess data
    raw = pd.read_csv('EPL.csv')
    raw = raw[['HomeTeam', 'AwayTeam', 'FullTimeResult', 'HalfTimeResult']]
    # pd.factorize numbers categories in order of first appearance (missing
    # values become -1), so the code -> label table of the target is kept to
    # label the tree's classes correctly.
    target_labels = pd.factorize(raw[target])[1]
    df = raw.apply(lambda col: pd.factorize(col)[0])

    # Prepare training data
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Initialize and fit the CART model (using Gini impurity)
    cart_tree = DecisionTreeClassifier(criterion="gini", max_depth=3,
                                       min_samples_split=10, min_samples_leaf=5)
    cart_tree.fit(X_train, y_train)

    # Translate the fitted integer classes back to the original result labels.
    class_names = [str(target_labels[code]) if code >= 0 else "missing"
                   for code in cart_tree.classes_]

    # Evaluate on the held-out rows
    y_pred = cart_tree.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Correlation between the integer-coded columns (target included)
    corr_matrix = df.corr()

    # --- Window 1: CART Decision Tree Plot ---
    tree_window = tk.Toplevel()
    tree_window.title("CART Decision Tree")
    tree_window.geometry("800x600")

    fig_tree = plt.Figure(figsize=(10, 6), dpi=100)
    ax_tree = fig_tree.add_subplot(111)
    plot_tree(cart_tree, feature_names=list(X.columns), class_names=class_names,
              filled=True, ax=ax_tree)

    canvas_tree = FigureCanvasTkAgg(fig_tree, master=tree_window)
    canvas_tree.draw()
    canvas_tree.get_tk_widget().pack(fill="both", expand=True)

    # --- Window 2: Metrics Display ---
    metrics_window = tk.Toplevel()
    metrics_window.title("CART Metrics")
    metrics_window.geometry("400x300")

    frame_metrics = ttk.Frame(metrics_window, padding=20)
    frame_metrics.pack(fill="both", expand=True)

    ttk.Label(frame_metrics, text=f"Accuracy: {accuracy:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame_metrics, text=f"Precision: {precision:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame_metrics, text=f"Recall: {recall:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame_metrics, text=f"F1 Score: {f1:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)

    # --- Window 3: Correlation Heatmap ---
    heatmap_window = tk.Toplevel()
    heatmap_window.title("Feature Correlation Heatmap")
    heatmap_window.geometry("800x600")

    fig_heat = plt.Figure(figsize=(10, 8), dpi=100)
    ax_heat = fig_heat.add_subplot(111)
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax_heat)
    ax_heat.set_title("Feature Correlation Heatmap")

    canvas_heat = FigureCanvasTkAgg(fig_heat, master=heatmap_window)
    canvas_heat.draw()
    canvas_heat.get_tk_widget().pack(fill="both", expand=True)


In [20]:
def visualize_c45():
    """Train an entropy-criterion decision tree on ``EPL.csv`` and display it.

    ``HomeTeam``, ``AwayTeam`` and ``HalfTimeResult`` are integer-coded with
    ``pd.factorize`` and used to predict ``FullTimeResult``. The model is a
    ``DecisionTreeClassifier(criterion="entropy")`` fitted on 90% of the rows
    and evaluated on the remaining 10%; the windows label it "C4.5".

    Two ``tk.Toplevel`` windows are opened: the plotted tree and a panel with
    accuracy and weighted precision / recall / F1.

    Returns:
        None. The function only builds Tk widgets.
    """
    target = 'FullTimeResult'

    # Load and preprocess data
    raw = pd.read_csv('EPL.csv')
    raw = raw[['HomeTeam', 'AwayTeam', 'FullTimeResult', 'HalfTimeResult']]
    # pd.factorize numbers categories in order of first appearance (missing
    # values become -1), so the code -> label table of the target is kept to
    # label the tree's classes correctly.
    target_labels = pd.factorize(raw[target])[1]
    df = raw.apply(lambda col: pd.factorize(col)[0])

    # Prepare training data
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Initialize and fit the C4.5 model (using entropy)
    c45_tree = DecisionTreeClassifier(criterion="entropy", max_depth=3,
                                      min_samples_split=10, min_samples_leaf=5)
    c45_tree.fit(X_train, y_train)

    # Translate the fitted integer classes back to the original result labels.
    class_names = [str(target_labels[code]) if code >= 0 else "missing"
                   for code in c45_tree.classes_]

    # Evaluate on the held-out rows
    y_pred = c45_tree.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # --- Window 1: Decision Tree Plot ---
    tree_window = tk.Toplevel()
    tree_window.title("C4.5 Decision Tree")
    tree_window.geometry("800x600")

    fig_tree = plt.Figure(figsize=(10, 6), dpi=100)
    ax_tree = fig_tree.add_subplot(111)
    plot_tree(c45_tree, feature_names=list(X.columns), class_names=class_names,
              filled=True, ax=ax_tree)

    canvas_tree = FigureCanvasTkAgg(fig_tree, master=tree_window)
    canvas_tree.draw()
    canvas_tree.get_tk_widget().pack(fill="both", expand=True)

    # --- Window 2: Metrics Display ---
    metrics_window = tk.Toplevel()
    metrics_window.title("C4.5 Metrics")
    metrics_window.geometry("400x300")

    frame = ttk.Frame(metrics_window, padding=20)
    frame.pack(fill="both", expand=True)

    ttk.Label(frame, text=f"Accuracy: {accuracy:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"Precision: {precision:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"Recall: {recall:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)
    ttk.Label(frame, text=f"F1 Score: {f1:.4f}", font=("Arial", 12)).pack(anchor="w", pady=5)


In [None]:
def visualize_knn():
    """Train a k-nearest-neighbours classifier on ``EPL.csv`` and show its results.

    Thirteen numeric match columns (shots, shots on target, corners, fouls,
    yellow cards and the three ``B365*`` columns) are used to predict
    ``FullTimeResult``. Rows with a missing feature or target are dropped, the
    data is split 80/20 (``random_state=42``), standardised with statistics of
    the training part only, and classified with up to 100 neighbours.

    Two ``tk.Toplevel`` windows are opened: a read-only text view with the
    accuracy and the classification report, and a confusion-matrix heatmap.

    Returns:
        None. The function only builds Tk widgets.
    """
    # Load data and define features/target
    df = pd.read_csv("EPL.csv")
    features = [
        "HomeTeamShots", "AwayTeamShots", "HomeTeamShotsOnTarget", "AwayTeamShotsOnTarget",
        "HomeTeamCorners", "AwayTeamCorners", "HomeTeamFouls", "AwayTeamFouls",
        "HomeTeamYellowCards", "AwayTeamYellowCards", "B365HomeTeam", "B365Draw", "B365AwayTeam"
    ]
    target = "FullTimeResult"

    # Drop incomplete rows BEFORE encoding: once the target is label-encoded
    # a missing value is no longer NaN and could not be dropped any more.
    df = df.dropna(subset=features + [target])

    # Encode target labels
    le = LabelEncoder()
    y = le.fit_transform(df[target])
    X = df[features]

    # Train/test split, then scale with statistics of the training part only
    # so that no information about the test rows leaks into the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train KNN with k=100 (capped: k cannot exceed the number of training rows)
    k = min(100, len(X_train))
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Compute metrics and report. LabelEncoder maps classes_[i] -> i; passing
    # every code explicitly keeps report rows and matrix axes aligned with the
    # class names even if a class is absent from the test split.
    class_codes = np.arange(len(le.classes_))
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, labels=class_codes,
                                   target_names=le.classes_, zero_division=0)
    cm = confusion_matrix(y_test, y_pred, labels=class_codes)

    # --- Window 1: Classification Report ---
    report_window = tk.Toplevel()
    report_window.title("KNN Classification Report")
    report_window.geometry("500x400")

    # Create a frame and a Text widget to display the report
    frame_report = ttk.Frame(report_window, padding=10)
    frame_report.pack(fill="both", expand=True)

    report_text = tk.Text(frame_report, wrap="word", font=("Consolas", 10))
    report_text.insert("1.0", f"Accuracy: {accuracy:.4f}\n\n{report}")
    report_text.config(state="disabled")  # Make the text widget read-only
    report_text.pack(fill="both", expand=True)

    # --- Window 2: Confusion Matrix Plot ---
    cm_window = tk.Toplevel()
    cm_window.title("KNN Confusion Matrix")
    cm_window.geometry("600x500")

    fig_cm = plt.Figure(figsize=(6, 4), dpi=100)
    ax_cm = fig_cm.add_subplot(111)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=le.classes_, yticklabels=le.classes_, ax=ax_cm)
    ax_cm.set_xlabel("Predicted Label")
    ax_cm.set_ylabel("True Label")
    ax_cm.set_title("Confusion Matrix")

    canvas_cm = FigureCanvasTkAgg(fig_cm, master=cm_window)
    canvas_cm.draw()
    canvas_cm.get_tk_widget().pack(fill="both", expand=True)


In [22]:
def visualize_kmeans(optimal_k=10):
    """Cluster the rows of ``EPL.csv`` with K-Means and plot the result.

    Three derived features are clustered after standardisation:
    ``TotalGoals`` and ``GoalDifference`` (sum / difference of the full-time
    goals) and ``WinRate`` (``HomeTeamPoints`` divided by the sum of both
    teams' points).

    Two ``tk.Toplevel`` windows are opened: the inertia curve for K = 1..10
    (elbow method) and a ``TotalGoals`` vs. ``WinRate`` scatter plot coloured
    by cluster. When more than one cluster is found the silhouette score is
    appended to the scatter window's title.

    Args:
        optimal_k: number of clusters for the final clustering (default 10,
            the value previously hard-coded).

    Returns:
        None. The function only builds Tk widgets.
    """
    # Load data and fill missing numeric values
    df = pd.read_csv("EPL.csv")
    df = df.fillna(df.mean(numeric_only=True))

    # Create new features
    df["TotalGoals"] = df["FullTimeHomeTeamGoals"] + df["FullTimeAwayTeamGoals"]
    df["GoalDifference"] = df["FullTimeHomeTeamGoals"] - df["FullTimeAwayTeamGoals"]
    df["WinRate"] = df["HomeTeamPoints"] / (df["HomeTeamPoints"] + df["AwayTeamPoints"])

    # The WinRate division yields NaN/inf when both point columns are zero,
    # which the scaler / KMeans reject; such cells are replaced by the column
    # mean (also in `df`, so the scatter plot shows every clustered row).
    feature_cols = ["TotalGoals", "GoalDifference", "WinRate"]
    features = df[feature_cols].replace([np.inf, -np.inf], np.nan)
    features = features.fillna(features.mean())
    df[feature_cols] = features

    # Scale features for clustering
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Compute inertia for a range of k values (Elbow Method)
    K_range = range(1, 11)
    inertia = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_features).inertia_
        for k in K_range
    ]

    # --- K-Means Clustering ---
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    df["Cluster"] = kmeans.fit_predict(scaled_features)

    # Compute silhouette score if there is more than one cluster
    silhouette_avg = None
    if df["Cluster"].nunique() > 1:
        silhouette_avg = silhouette_score(scaled_features, df["Cluster"])

    # --- Window 1: Elbow Method Plot ---
    elbow_window = tk.Toplevel()
    elbow_window.title("Elbow Method for Optimal K")
    elbow_window.geometry("800x600")

    fig_elbow = plt.Figure(figsize=(8, 6), dpi=100)
    ax_elbow = fig_elbow.add_subplot(111)
    ax_elbow.plot(K_range, inertia, marker='o')
    ax_elbow.set_xlabel('Number of Clusters (K)')
    ax_elbow.set_ylabel('Inertia')
    ax_elbow.set_title('Elbow Method for Optimal K')

    canvas_elbow = FigureCanvasTkAgg(fig_elbow, master=elbow_window)
    canvas_elbow.draw()
    canvas_elbow.get_tk_widget().pack(fill="both", expand=True)

    # --- Window 2: Clustering Scatter Plot ---
    scatter_window = tk.Toplevel()
    title = "K-Means Clustering of EPL Teams"
    if silhouette_avg is not None:
        title += f" (Silhouette Score: {silhouette_avg:.4f})"
    scatter_window.title(title)
    scatter_window.geometry("900x600")

    fig_scatter = plt.Figure(figsize=(9, 6), dpi=100)
    ax_scatter = fig_scatter.add_subplot(111)
    sc = ax_scatter.scatter(df["TotalGoals"], df["WinRate"], c=df["Cluster"], cmap="viridis", edgecolors='k')
    ax_scatter.set_xlabel("Total Goals")
    ax_scatter.set_ylabel("Win Rate")
    ax_scatter.set_title("K-Means Clustering of EPL Teams")
    fig_scatter.colorbar(sc, ax=ax_scatter, label="Cluster")

    canvas_scatter = FigureCanvasTkAgg(fig_scatter, master=scatter_window)
    canvas_scatter.draw()
    canvas_scatter.get_tk_widget().pack(fill="both", expand=True)


In [23]:
def visualize_agglomerative(optimal_k=6, linkage_method="complete"):
    """Cluster the rows of ``EPL.csv`` agglomeratively and plot the result.

    Three derived features are clustered after standardisation:
    ``TotalGoals`` and ``GoalDifference`` (sum / difference of the full-time
    goals) and ``WinRate`` (``HomeTeamPoints`` divided by the sum of both
    teams' points).

    Two ``tk.Toplevel`` windows are opened: a dendrogram and a ``TotalGoals``
    vs. ``WinRate`` scatter plot coloured by cluster. When more than one
    cluster is found the silhouette score is appended to the scatter
    window's title.

    Args:
        optimal_k: number of clusters to cut the hierarchy into (default 6).
        linkage_method: linkage criterion used for BOTH the dendrogram and
            the clustering, so the dendrogram describes the clustering that
            is actually shown. Must be a name accepted by both SciPy and
            scikit-learn ("ward", "complete", "average" or "single").
            Defaults to "complete", the linkage the clustering used before.

    Returns:
        None. The function only builds Tk widgets.
    """
    # Step 1: Load and process data
    df = pd.read_csv("EPL.csv")
    df["TotalGoals"] = df["FullTimeHomeTeamGoals"] + df["FullTimeAwayTeamGoals"]
    df["GoalDifference"] = df["FullTimeHomeTeamGoals"] - df["FullTimeAwayTeamGoals"]
    df["WinRate"] = df["HomeTeamPoints"] / (df["HomeTeamPoints"] + df["AwayTeamPoints"])

    # Missing inputs and the WinRate division (zero points on both sides) can
    # produce NaN/inf, which the linkage and clustering routines reject; such
    # cells are replaced by the column mean (also in `df`, so the scatter
    # plot shows every clustered row).
    feature_cols = ["TotalGoals", "GoalDifference", "WinRate"]
    features = df[feature_cols].replace([np.inf, -np.inf], np.nan)
    features = features.fillna(features.mean())
    df[feature_cols] = features

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Step 2: Tkinter window for Dendrogram
    dendro_window = tk.Toplevel()
    dendro_window.title("Dendrogram - Agglomerative Clustering")
    dendro_window.geometry("1000x700")

    fig_dendro = plt.Figure(figsize=(10, 6), dpi=100)
    ax_dendro = fig_dendro.add_subplot(111)
    sch.dendrogram(sch.linkage(scaled_features, method=linkage_method), ax=ax_dendro)
    ax_dendro.set_title('Dendrogram for Agglomerative Clustering')
    ax_dendro.set_xlabel('Teams')
    ax_dendro.set_ylabel('Euclidean Distance')

    canvas_dendro = FigureCanvasTkAgg(fig_dendro, master=dendro_window)
    canvas_dendro.draw()
    canvas_dendro.get_tk_widget().pack(fill="both", expand=True)

    # Step 3: Apply Agglomerative Clustering
    agglo_cluster = AgglomerativeClustering(n_clusters=optimal_k, linkage=linkage_method, metric='euclidean')
    df["Cluster"] = agglo_cluster.fit_predict(scaled_features)

    silhouette_avg = None
    if df["Cluster"].nunique() > 1:
        silhouette_avg = silhouette_score(scaled_features, df["Cluster"])

    # Step 4: Tkinter window for clustering scatter plot
    cluster_window = tk.Toplevel()
    title = "Agglomerative Clustering of EPL Teams"
    # `is not None` rather than truthiness: a score of exactly 0.0 is valid.
    if silhouette_avg is not None:
        title += f" (Silhouette Score: {silhouette_avg:.4f})"
    cluster_window.title(title)
    cluster_window.geometry("1000x700")

    fig_scatter = plt.Figure(figsize=(10, 6), dpi=100)
    ax_scatter = fig_scatter.add_subplot(111)
    sc = ax_scatter.scatter(df["TotalGoals"], df["WinRate"], c=df["Cluster"], cmap="rainbow", edgecolors='k')
    ax_scatter.set_xlabel("Total Goals")
    ax_scatter.set_ylabel("Win Rate")
    ax_scatter.set_title("Agglomerative Clustering of EPL Teams")
    fig_scatter.colorbar(sc, ax=ax_scatter, label="Cluster")

    canvas_scatter = FigureCanvasTkAgg(fig_scatter, master=cluster_window)
    canvas_scatter.draw()
    canvas_scatter.get_tk_widget().pack(fill="both", expand=True)


In [24]:
def visualize_dbscan(eps=0.5, min_samples=5):
    """Cluster the rows of ``EPL.csv`` with DBSCAN and plot the result.

    Three derived features are clustered after standardisation:
    ``TotalGoals`` and ``GoalDifference`` (sum / difference of the full-time
    goals) and ``WinRate`` (``HomeTeamPoints`` divided by the sum of both
    teams' points).

    Two ``tk.Toplevel`` windows are opened: the sorted distance of every row
    to its 10th nearest neighbour (a visual aid for choosing ``eps``) and a
    ``TotalGoals`` vs. ``WinRate`` scatter plot coloured by cluster label.
    The silhouette score is appended to the scatter window's title only when
    more than one cluster exists and no row was labelled as noise (-1).

    Args:
        eps: DBSCAN neighbourhood radius in standardised units (default 0.5).
        min_samples: DBSCAN ``min_samples`` (default 5).

    Returns:
        None. The function only builds Tk widgets.
    """
    # Step 1: Load and engineer features
    df = pd.read_csv("EPL.csv")
    df["TotalGoals"] = df["FullTimeHomeTeamGoals"] + df["FullTimeAwayTeamGoals"]
    df["GoalDifference"] = df["FullTimeHomeTeamGoals"] - df["FullTimeAwayTeamGoals"]
    df["WinRate"] = df["HomeTeamPoints"] / (df["HomeTeamPoints"] + df["AwayTeamPoints"])

    # Missing inputs and the WinRate division (zero points on both sides) can
    # produce NaN/inf, which the neighbour search and DBSCAN reject; such
    # cells are replaced by the column mean (also in `df`, so the scatter
    # plot shows every clustered row).
    feature_cols = ["TotalGoals", "GoalDifference", "WinRate"]
    features = df[feature_cols].replace([np.inf, -np.inf], np.nan)
    features = features.fillna(features.mean())
    df[feature_cols] = features

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Step 2: K-distance graph to estimate epsilon
    k = 10
    nbrs = NearestNeighbors(n_neighbors=k, metric='euclidean').fit(scaled_features)
    # Calling kneighbors() without arguments queries the fitted points and
    # does not count a point as its own neighbour, so the last column really
    # is the k-th nearest *other* point, matching the axis label below.
    distances, _ = nbrs.kneighbors()
    distances = np.sort(distances[:, -1])  # 10th neighbor distance

    # Tkinter window for K-distance graph
    kd_window = tk.Toplevel()
    kd_window.title("DBSCAN - K-Distance Graph")
    kd_window.geometry("1000x700")

    fig_kd = plt.Figure(figsize=(10, 6), dpi=100)
    ax_kd = fig_kd.add_subplot(111)
    ax_kd.plot(distances)
    ax_kd.set_xlabel('Data Points Sorted by Distance')
    ax_kd.set_ylabel(f'{k}th Nearest Neighbor Distance')
    ax_kd.set_title('K-Distance Graph to Determine Epsilon')

    canvas_kd = FigureCanvasTkAgg(fig_kd, master=kd_window)
    canvas_kd.draw()
    canvas_kd.get_tk_widget().pack(fill="both", expand=True)

    # Step 3: Apply DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    df["Cluster"] = dbscan.fit_predict(scaled_features)

    cluster_labels = set(df["Cluster"])
    silhouette_avg = None
    if len(cluster_labels) > 1 and -1 not in cluster_labels:
        silhouette_avg = silhouette_score(scaled_features, df["Cluster"])

    # Step 4: Tkinter window for clustering scatter plot
    dbscan_window = tk.Toplevel()
    title = "DBSCAN Clustering of EPL Teams"
    # `is not None` rather than truthiness: a score of exactly 0.0 is valid.
    if silhouette_avg is not None:
        title += f" (Silhouette Score: {silhouette_avg:.4f})"
    dbscan_window.title(title)
    dbscan_window.geometry("1000x700")

    fig_scatter = plt.Figure(figsize=(10, 6), dpi=100)
    ax_scatter = fig_scatter.add_subplot(111)
    sc = ax_scatter.scatter(df["TotalGoals"], df["WinRate"], c=df["Cluster"], cmap="rainbow", edgecolors='k')
    ax_scatter.set_xlabel("Total Goals")
    ax_scatter.set_ylabel("Win Rate")
    ax_scatter.set_title("DBSCAN Clustering of EPL Teams")
    fig_scatter.colorbar(sc, ax=ax_scatter, label="Cluster")

    canvas_scatter = FigureCanvasTkAgg(fig_scatter, master=dbscan_window)
    canvas_scatter.draw()
    canvas_scatter.get_tk_widget().pack(fill="both", expand=True)


In [None]:
def visualize_final_prediction():
    """Open a window that predicts match statistics for two user-entered teams.

    Reads 'EPL.csv', derives per-team home/away averages, fits one
    DecisionTreeRegressor per target pair (goals, shots, shots on target,
    fouls) and builds a small Tk form. Pressing "Predict" looks up both team
    names and writes the predicted outcome into the window, or
    "Invalid teams!" when either name is not present in the data.

    The statistics and models are prepared once, when the window is opened,
    instead of on every button press: the data frame and all random seeds are
    fixed, so refitting per click rebuilt the same trees at a much higher cost.
    """
    df = pd.read_csv('EPL.csv')

    # Target name -> (home column, away column) the regressor for it predicts.
    target_columns = {
        'goals': ('FullTimeHomeTeamGoals', 'FullTimeAwayTeamGoals'),
        'shots': ('HomeTeamShots', 'AwayTeamShots'),
        'shots_on_target': ('HomeTeamShotsOnTarget', 'AwayTeamShotsOnTarget'),
        'fouls': ('HomeTeamFouls', 'AwayTeamFouls'),
    }

    def mean_or_zero(matches, column):
        """Return the mean of `column` over `matches`, or 0 when there are none."""
        return matches[column].mean() if len(matches) > 0 else 0

    def compute_team_stats(df):
        """Return {team: home/away win rate and per-match averages} for every team."""
        team_stats = {}

        for team in pd.unique(df[['HomeTeam', 'AwayTeam']].values.ravel('K')):
            home_matches = df[df['HomeTeam'] == team]
            away_matches = df[df['AwayTeam'] == team]
            home_games = len(home_matches)
            away_games = len(away_matches)

            home_wins = (home_matches['FullTimeResult'] == 'H').sum()
            away_wins = (away_matches['FullTimeResult'] == 'A').sum()

            team_stats[team] = {
                'home_win_pct': home_wins / home_games if home_games > 0 else 0,
                'away_win_pct': away_wins / away_games if away_games > 0 else 0,
                'home_avg_goals': mean_or_zero(home_matches, 'FullTimeHomeTeamGoals'),
                'away_avg_goals': mean_or_zero(away_matches, 'FullTimeAwayTeamGoals'),
                'home_avg_shots': mean_or_zero(home_matches, 'HomeTeamShots'),
                'away_avg_shots': mean_or_zero(away_matches, 'AwayTeamShots'),
                'home_avg_shots_on_target': mean_or_zero(home_matches, 'HomeTeamShotsOnTarget'),
                'away_avg_shots_on_target': mean_or_zero(away_matches, 'AwayTeamShotsOnTarget'),
                'home_avg_fouls': mean_or_zero(home_matches, 'HomeTeamFouls'),
                'away_avg_fouls': mean_or_zero(away_matches, 'AwayTeamFouls'),
            }

        return team_stats

    def feature_row(home, away):
        """Build the 10-value model input from the home side's and away side's stats.

        Used for both training and inference so the column order cannot drift
        between the two.
        """
        return [
            home['home_win_pct'], away['away_win_pct'],
            home['home_avg_goals'], away['away_avg_goals'],
            home['home_avg_shots'], away['away_avg_shots'],
            home['home_avg_shots_on_target'], away['away_avg_shots_on_target'],
            home['home_avg_fouls'], away['away_avg_fouls'],
        ]

    team_stats = compute_team_stats(df)

    # One feature row per match, plus a [home, away] label pair per target.
    features = []
    labels = {name: [] for name in target_columns}

    for _, row in df.iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']

        if home_team in team_stats and away_team in team_stats:
            features.append(feature_row(team_stats[home_team], team_stats[away_team]))
            for name, (home_col, away_col) in target_columns.items():
                labels[name].append([row[home_col], row[away_col]])

    X = np.array(features)

    # Each regressor is fitted on the same 90% split as before (same seed and
    # sample count give the same indices); the held-out 10% is never used.
    regressors = {}
    for name, rows in labels.items():
        X_train, _, y_train, _ = train_test_split(
            X, np.array(rows), test_size=0.1, random_state=42
        )
        regressors[name] = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

    def predict_match(home_team, away_team):
        """Return the formatted prediction text, or "Invalid teams!" for unknown names."""
        if home_team not in team_stats or away_team not in team_stats:
            return "Invalid teams!"

        # Already 2-D with a single row, which is the shape predict() expects.
        input_features = np.array([feature_row(team_stats[home_team], team_stats[away_team])])

        predicted = {}
        for name, regressor in regressors.items():
            home_value, away_value = regressor.predict(input_features)[0]
            predicted[name] = (int(round(home_value)), int(round(away_value)))

        home_goals, away_goals = predicted['goals']
        home_shots, away_shots = predicted['shots']
        home_shots_on_target, away_shots_on_target = predicted['shots_on_target']
        home_fouls, away_fouls = predicted['fouls']

        # The outcome is derived from the rounded goal prediction.
        if home_goals > away_goals:
            result_pred = "H"
        elif away_goals > home_goals:
            result_pred = "A"
        else:
            result_pred = "D"

        return f"""
            Predicted Outcome: {result_pred}
            Expected Full-Time Goals: {home_team} {home_goals} - {away_goals} {away_team}
            Expected Shots: {home_team} {home_shots} - {away_shots} {away_team}
            Expected Shots on Target: {home_team} {home_shots_on_target} - {away_shots_on_target} {away_team}
            Expected Fouls: {home_team} {home_fouls} - {away_fouls} {away_team}
            """

    def predict():
        """Button callback: read both entries and show the prediction in the label."""
        # Strip stray whitespace so "Arsenal " still matches "Arsenal".
        home_team = team1_entry.get().strip()
        away_team = team2_entry.get().strip()
        result_label.config(text=predict_match(home_team, away_team))

    input_window = tk.Toplevel()
    input_window.title("Final Prediction Input")
    tk.Label(input_window, text="Team 1:").grid(row=0, column=0, padx=5, pady=5)
    team1_entry = tk.Entry(input_window)
    team1_entry.grid(row=0, column=1, padx=5, pady=5)
    tk.Label(input_window, text="Team 2:").grid(row=1, column=0, padx=5, pady=5)
    team2_entry = tk.Entry(input_window)
    team2_entry.grid(row=1, column=1, padx=5, pady=5)
    result_label = tk.Label(input_window, text="", justify="left", anchor="w")
    result_label.grid(row=3, column=0, columnspan=2, sticky="w", padx=5, pady=5)

    tk.Button(input_window, text="Predict", command=predict).grid(row=2, columnspan=2, pady=10)


In [26]:
# Registry of dropdown labels and the function that renders each method,
# listed in the order they should appear.
visualization_functions = dict([
    ("Apriori & FP Growth", visualize_apriori_fp_growth),
    ("Bayes", visualize_bayes),
    ("ID3", visualize_id3),
    ("CART", visualize_cart),
    ("C4.5", visualize_c45),
    ("KNN", visualize_knn),
    ("K-Means", visualize_kmeans),
    ("Agglomerative", visualize_agglomerative),
    ("DBSCAN", visualize_dbscan),
    ("Final Prediction using 2 Teams", visualize_final_prediction),
])

In [27]:
def run_selected_method():
    """Run the visualization currently chosen in the dropdown.

    Reads the label from the module-level ``selected_method`` variable and
    calls the matching entry of ``visualization_functions``. Shows an error
    dialog and returns when the label is not a registered method.
    """
    # The selection lives in the `selected_method` variable; calling .get() on
    # this function object itself raises AttributeError.
    method = selected_method.get()
    if method not in visualization_functions:
        messagebox.showerror("Error", "Please select a valid method from the dropdown.")
        return
    # Call the corresponding function
    visualization_functions[method]()

In [29]:
# Build the main application window.
root = tk.Tk()
root.title("Data Analytics Visualizer")
root.geometry("400x250")

# Methods offered in the dropdown, keyed by display label.
# NOTE: this rebinds `visualization_functions`, so only the entry listed here
# is selectable from this window.
visualization_functions = {
    "Final Match Prediction": visualize_final_prediction
}


def run_selected_method():
    """Run the method chosen in the dropdown, or show an error if none is valid."""
    selected = selected_method.get()
    if selected in visualization_functions:
        visualization_functions[selected]()
    else:
        # Use the explicitly imported `messagebox`; `tk.messagebox` only
        # resolves when the submodule happens to have been imported already.
        messagebox.showerror("Error", "Please select a valid visualization method.")


# Label for dropdown
label = tk.Label(root, text="Select a Method to Visualize:")
label.pack(pady=10)

# Dropdown menu for selecting method; the placeholder text is not a valid
# key, so pressing "Visualize" without choosing shows the error dialog.
selected_method = tk.StringVar()
selected_method.set("Select a method")
dropdown = tk.OptionMenu(root, selected_method, *visualization_functions.keys())
dropdown.pack(pady=10)

# Button to run the selected method
run_button = tk.Button(root, text="Visualize", command=run_selected_method)
run_button.pack(pady=20)

# Start the tkinter main event loop (blocks until the window is closed)
root.mainloop()