# In [None]:
import math
import os
import warnings

# Cap the OpenMP thread count BEFORE numpy / scikit-learn are imported:
# the native threading runtimes typically read OMP_NUM_THREADS when they are
# first loaded, so setting it after the imports may have no effect.
os.environ['OMP_NUM_THREADS'] = '4'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Silence the scikit-learn KMeans memory-leak notice matched by this message.
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak")


# Load the dataset
df = pd.read_csv("Ramcocements.csv")

# Strip surrounding whitespace from the header names first, so the previews
# below and every later column look-up see the cleaned names.
df.columns = df.columns.str.strip()

# View first 5 rows
print(df.head())

# Check structure. DataFrame.info() writes its report itself and returns
# None, so wrapping it in print() would add a stray "None" line.
df.info()

# Check missing values
print(df.isnull().sum())
# Columns used as clustering inputs.
features = [
    "Sales_Revenue",
    "Units_Sold",
    "Profit_Margin_Percent",
    "Discount_Percent",
    "Return_Rate_Percent",
    "Customer_Rating",
]

X = df[features]

# EDA: a bare `df.shape` / `df.columns` only renders in a notebook cell, so
# print them explicitly to get the same information when run as a script.
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# Scaling: standardise every feature before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# KMeans: n_init is pinned (matching the later kmeans_model) so the result
# does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(X_scaled)
# Evaluation: silhouette of the current cluster labels on the scaled features
silhouette = silhouette_score(X_scaled, df["Cluster"])
print("Silhouette Score:", silhouette)

# Visualization: units sold against sales revenue, coloured by cluster
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="Units_Sold",
    y="Sales_Revenue",
    hue="Cluster",
    palette="Set1",
)
plt.show()
# KDE plot for every int64/float64 column, laid out on a two-column grid
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
n_cols = 2
n_rows = math.ceil(len(numeric_cols) / n_cols)

kde_fig = plt.figure(figsize=(12, 5 * n_rows))

for position, column in enumerate(numeric_cols, start=1):
    # One axes per column, filled row by row (positions are 1-based).
    axis = kde_fig.add_subplot(n_rows, n_cols, position)
    sns.kdeplot(data=df, x=column, fill=True, ax=axis)
    axis.set_title(f"KDE Plot - {column}")
    axis.set_xlabel(column)
    axis.set_ylabel("Density")

plt.tight_layout()
plt.show()
# Correlation heatmap over the int64/float64 columns
numeric_df = df.select_dtypes(include=['int64', 'float64'])
correlations = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap - Numerical Features")
plt.show()
# PCA: project the scaled features onto two principal components
pca = PCA(n_components=2)

principal_components = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(
    data=principal_components,
    columns=["PCA1", "PCA2"]
)

# A bare `pca_df.head()` only renders in a notebook; print it so the preview
# is also shown when the file runs as a script.
print(pca_df.head())

plt.figure(figsize=(8, 6))

# Points are coloured by their own PCA1 coordinate (see the title).
scatter = plt.scatter(
    pca_df["PCA1"],
    pca_df["PCA2"],
    c=pca_df["PCA1"],
    cmap="viridis",
    s=70
)

plt.colorbar(scatter, label="PCA1")
plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.title("PCA Projection (Color by PCA1 Intensity)")
plt.show()
# Elbow method: fit KMeans for a range of k and plot the inertia of each fit.
inertia = []
# k can never exceed the number of samples, so cap the range for small data.
range_val = range(1, min(15, len(X_scaled) + 1))
for k in range_val:
    # Seeded and with a fixed n_init so the curve is reproducible run to run.
    elbow_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    elbow_model.fit(X_scaled)
    inertia.append(elbow_model.inertia_)

# Legacy (misspelled) name kept as an alias in case later cells refer to it.
intertia = inertia

plt.plot(range_val, inertia, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()
# Import necessary libraries
from sklearn.cluster import KMeans
import pandas as pd

# Exploratory 4-cluster model on the scaled features (X_scaled and pca_df are
# assumed to be defined already). Seeded so its labels are reproducible.
KMeans_model = KMeans(n_clusters=4, random_state=42, n_init=10)
KMeans_model.fit(X_scaled)

# Keep the 4-cluster labels next to the PCA coordinates as column 'cluster'.
pca_df_KMeans = pca_df.assign(cluster=KMeans_model.labels_)

# The plot below is coloured by the labels already stored in df["Cluster"],
# not by the 'cluster' column above. Copy them by position so a non-default
# index on df cannot misalign the rows and introduce NaN labels.
pca_df_KMeans["Cluster"] = df["Cluster"].to_numpy()

plt.figure(figsize=(8, 8))

sns.scatterplot(
    x="PCA1",
    y="PCA2",
    hue="Cluster",
    data=pca_df_KMeans,
    palette=["red", "green", "blue"],  # one colour per plotted cluster label
    s=80
)

plt.title("Clustering using K-Means Algorithm")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.show()
n_clusters = 3

# Train KMeans and overwrite df["Cluster"] with this model's labels.
kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["Cluster"] = kmeans_model.fit_predict(X_scaled)

# Get cluster centers (scaled)
cluster_centers_scaled = kmeans_model.cluster_centers_

# Inverse transform to original values
cluster_centers_original = scaler.inverse_transform(cluster_centers_scaled)

# Convert to DataFrame, one row per cluster and one column per feature
cluster_centers = pd.DataFrame(
    data=cluster_centers_original,
    columns=features
)
cluster_centers.index.name = "Cluster"

# A bare `cluster_centers` only renders in a notebook; print it so the table
# is also shown when the file runs as a script.
print(cluster_centers)
# Add cluster column directly (Best Practice). df already carries a "Cluster"
# column, so concatenating another one would produce two columns with the
# same name; assign the labels in place and copy instead.
df["Cluster"] = kmeans_model.labels_

cluster_df = df.copy()

print(type(cluster_df))
print(cluster_df.shape)
print(cluster_df.head())
# Plot the same columns that were used for clustering.
numeric_cols = list(features)

# One row of histograms per feature, with one panel per cluster.
for col in numeric_cols:
    g = sns.FacetGrid(cluster_df, col="Cluster", height=4)
    g.map_dataframe(sns.histplot, x=col, bins=20)

    # y slightly above 1 places the title above the panels.
    g.fig.suptitle(f"Distribution of {col} by Cluster", y=1.02)

    plt.tight_layout()
    plt.show()

# Kept at module level: inside the loop body this import would be re-run on
# every iteration and never run at all if the column list were empty.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Inputs are the clustering features; the target is the assigned cluster.
X = cluster_df[features]
y = cluster_df["Cluster"]

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline tree with default settings.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# A bare `X_train` only renders in a notebook; print it so the training
# features are also shown when the file runs as a script.
print(X_train)
# ------------------------------
# Decision Tree Model
# ------------------------------

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Entropy-criterion tree fitted on the training split
dt_model = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_model.fit(X_train, y_train)

# Predicted cluster for each held-out row
y_pred = dt_model.predict(X_test)

# ------------------------------
# Evaluation on the test split
# ------------------------------

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n")
print(conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n")
print(report)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
import joblib

# Round-trip the fitted tree through disk, then score the reloaded copy.
tree_path = "ramco_decision_tree.pkl"
joblib.dump(dt_model, tree_path)
loaded_model = joblib.load(tree_path)

accuracy_pct = loaded_model.score(X_test, y_test) * 100
print("Accuracy:", accuracy_pct, "%")
# ----------------------------------
# Saving All Trained Models & Data
# ----------------------------------

import os
import joblib

# First set of files in the working directory.
for artifact, target in (
    (kmeans_model, "ramco_kmeans_model.pkl"),
    (scaler, "ramco_scaler.pkl"),
    (dt_model, "ramco_decision_tree_model.pkl"),
):
    joblib.dump(artifact, target)

# Clustered dataset, written without the index column.
cluster_df.to_csv("Ramco_Product_Segmentation_Data.csv", index=False)

print("All models and clustered dataset saved successfully.")

# "_v1" copies in the working directory.
for artifact, target in (
    (kmeans_model, "ramco_kmeans_v1.pkl"),
    (scaler, "ramco_scaler_v1.pkl"),
    (dt_model, "ramco_decision_tree_v1.pkl"),
):
    joblib.dump(artifact, target)

# "_v1" copies inside the models folder, created if it does not exist.
os.makedirs("models", exist_ok=True)
for artifact, target in (
    (kmeans_model, "models/ramco_kmeans_v1.pkl"),
    (scaler, "models/ramco_scaler_v1.pkl"),
    (dt_model, "models/ramco_decision_tree_v1.pkl"),
):
    joblib.dump(artifact, target)

print("Models saved successfully in 'models' folder.")