In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Read parameters from JSON file
# The run is configured by a JSON object with three optional keys:
#   "dataset_path"    - path of the spreadsheet to cluster (default "")
#   "model"           - which clusterer to train (default "kmeans")
#   "hyperparameters" - model-specific settings (default {})
PARAMS_FILE = "train_params.json"
if not os.path.exists(PARAMS_FILE):
    print("❌ Error: JSON parameters file not found.")
    # Raise SystemExit directly rather than calling exit(): `exit` is a
    # convenience injected by `site`/IPython, not a guaranteed builtin, and
    # under a Jupyter kernel it requests a kernel shutdown instead of raising,
    # so the statements below could still run with `dataset_path` undefined.
    raise SystemExit(1)

# JSON is UTF-8 by specification; do not depend on the platform locale.
with open(PARAMS_FILE, "r", encoding="utf-8") as f:
    params = json.load(f)

# `or` fallbacks also cover keys that are present but explicitly null, which
# `dict.get(key, default)` alone would pass through as None.
dataset_path = params.get("dataset_path") or ""
selected_model = params.get("model", "kmeans").lower()
hyperparameters = params.get("hyperparameters") or {}

# ✅ Ensure dataset exists
if not os.path.exists(dataset_path):
    print(f"❌ Error: Dataset file '{dataset_path}' not found.")
    raise SystemExit(1)

# ✅ Load dataset
# The file is always parsed as an Excel workbook through openpyxl.
df = pd.read_excel(dataset_path, engine="openpyxl")

# Fail with the same style of message as the other checks instead of a raw
# KeyError when the text column is absent.
if "Description" not in df.columns:
    print("❌ Error: Dataset has no 'Description' column.")
    raise SystemExit(1)

# TfidfVectorizer rejects NaN and non-string documents, so blank cells become
# empty strings and any other value is coerced to text. This is a separate
# Series: the 'Description' column stored in `df` is left untouched.
descriptions = df['Description'].fillna("").astype(str)

# Unigram + bigram TF-IDF features, capped at 2000 terms.
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X = tfidf.fit_transform(descriptions)

# Two-component projection stored on the frame as PCA1/PCA2. PCA is fed a
# dense copy of X; the seed keeps the projection repeatable between runs if
# a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())
df['PCA1'] = X_pca[:, 0]
df['PCA2'] = X_pca[:, 1]

def _save_cluster_plot(data, title, filename, folder):
    """Scatter the PCA1/PCA2 columns of `data` coloured by 'Cluster' and save as PNG.

    Parameters
    ----------
    data : DataFrame holding 'PCA1', 'PCA2' and 'Cluster' columns.
    title : text placed above the axes.
    filename : name of the image file created inside `folder`.
    folder : directory that receives the image; it must already exist.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=data,
                    palette='viridis', s=100, ax=ax)
    ax.set_title(title)
    fig.savefig(f"{folder}/{filename}")
    # Close this specific figure so repeated runs do not accumulate open figures.
    plt.close(fig)


model = None
output_folder = "static/plots"
os.makedirs(output_folder, exist_ok=True)
os.makedirs("mlmodels", exist_ok=True)

# Each branch fits one clusterer, stores its labels in df['Cluster'], picks the
# pickle path used when the model is saved, and renders the scatter plot.
if selected_model == "kmeans":
    print("🔹 Training K-Means...")
    n_clusters = int(hyperparameters.get("n_clusters", 5))
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['Cluster'] = model.fit_predict(X)
    model_filename = "mlmodels/kmeans_model.pkl"
    _save_cluster_plot(df, f'K-Means Clustering ({n_clusters} Clusters)',
                       "kmeans_plot.png", output_folder)

elif selected_model == "dbscan":
    print("🔹 Training DBSCAN...")
    eps = float(hyperparameters.get("eps", 0.5))
    min_samples = int(hyperparameters.get("min_samples", 5))
    model = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    df['Cluster'] = model.fit_predict(X)
    model_filename = "mlmodels/dbscan_model.pkl"
    _save_cluster_plot(df, f'DBSCAN Clustering (eps={eps}, min_samples={min_samples})',
                       "dbscan_plot.png", output_folder)

elif selected_model == "hierarchical":
    print("🔹 Training Hierarchical Clustering...")
    n_clusters = int(hyperparameters.get("n_clusters", 5))
    model = AgglomerativeClustering(n_clusters=n_clusters)
    # This branch fits on a dense copy of the TF-IDF matrix.
    df['Cluster'] = model.fit_predict(X.toarray())
    model_filename = "mlmodels/hierarchical_model.pkl"
    _save_cluster_plot(df, f'Hierarchical Clustering ({n_clusters} Clusters)',
                       "hierarchical_plot.png", output_folder)

else:
    print(f"❌ Unknown model: {selected_model}")
    # Raise SystemExit directly: `exit` is injected by `site`/IPython and,
    # under a Jupyter kernel, does not stop the cell from running the save
    # step below with `model_filename` undefined.
    raise SystemExit(1)

# ✅ Persist the fitted artifacts
# The TF-IDF vectorizer is written first, then the clustering model, each as
# its own joblib pickle.
for artifact, target_path in (
    (tfidf, "mlmodels/tfidf_vectorizer.pkl"),
    (model, model_filename),
):
    joblib.dump(artifact, target_path)
print(f"✅ Model '{selected_model}' trained and saved as '{model_filename}'")

# ✅ Export the frame, including the PCA and cluster columns added earlier,
# as CSV without the index column.
df.to_csv('clustered_dataset.csv', index=False)

print("✅ Training complete. Model and plots saved successfully.")


🔹 Training K-Means...


✅ Model 'kmeans' trained and saved as 'mlmodels/kmeans_model.pkl'
✅ Training complete. Model and plots saved successfully.
