In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [11]:
unsupervisedmodels_comparison = pd.read_excel("Expanded_ST_R_With_GAN.xlsx")

In [12]:
X = unsupervisedmodels_comparison.drop(columns=['months_until_stockout'], errors='ignore')

In [13]:
X = X.select_dtypes(include=[np.number])

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
models = {
    'KMeans (k=3)': KMeans(n_clusters=3, random_state=42),
    'Agglomerative Clustering': AgglomerativeClustering(n_clusters=3),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5)
}

In [16]:
results = []
for name, model in models.items():
    try:
        labels = model.fit_predict(X_scaled)
        if len(set(labels)) > 1 and len(set(labels)) < len(X):  
            silhouette = silhouette_score(X_scaled, labels)
            dbi = davies_bouldin_score(X_scaled, labels)
        else:
            silhouette = -1  
            dbi = np.inf
    except Exception as e:
        silhouette = -1
        dbi = np.inf
    
    results.append({
        'Model': name,
        'Silhouette Score': silhouette,
        'Davies-Bouldin Index': dbi
    })



In [17]:
unsupervisedmodel_results = pd.DataFrame(results).sort_values(by='Silhouette Score', ascending=False)

print("\nðŸ“Š Unsupervised Model Comparison (Higher Silhouette, Lower DBI is Better):")
print(unsupervisedmodel_results)

top_2 = unsupervisedmodel_results.head(2)
print(f"\nâœ… Top 2 Recommended Unsupervised Models:")
for i, row in top_2.iterrows():
    print(f"   {row['Model']} â†’ Silhouette: {row['Silhouette Score']:.3f} | DBI: {row['Davies-Bouldin Index']:.3f}")


ðŸ“Š Unsupervised Model Comparison (Higher Silhouette, Lower DBI is Better):
                      Model  Silhouette Score  Davies-Bouldin Index
0              KMeans (k=3)          0.143395              1.677539
1  Agglomerative Clustering          0.128226              2.115917
2                    DBSCAN         -1.000000                   inf

âœ… Top 2 Recommended Unsupervised Models:
   KMeans (k=3) â†’ Silhouette: 0.143 | DBI: 1.678
   Agglomerative Clustering â†’ Silhouette: 0.128 | DBI: 2.116


In [3]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
risk_model_df = pd.read_excel("Expanded_ST_R_With_GAN.xlsx")

In [19]:
num_risk_samples = 100
safe_samples = risk_model_df[risk_model_df['Left Stock'] >= risk_model_df['Safety Stock']]

In [20]:
if len(safe_samples) < num_risk_samples:
    synthetic_base = resample(safe_samples, n_samples=num_risk_samples, replace=True, random_state=42)
else:
    synthetic_base = safe_samples.sample(n=num_risk_samples, random_state=42)

In [21]:
synthetic_risk_samples = synthetic_base.copy()
synthetic_risk_samples['Left Stock'] = synthetic_risk_samples['Safety Stock'] * 0.5
synthetic_risk_samples['Stock_Risk_Flag'] = 1

In [22]:
risk_model_df = pd.concat([risk_model_df, synthetic_risk_samples], ignore_index=True)

print("Updated class distribution:")
print(risk_model_df['Stock_Risk_Flag'].value_counts())

Updated class distribution:
Stock_Risk_Flag
0    297
1    100
Name: count, dtype: int64


In [23]:
X = risk_model_df.drop(columns=['Stock_Risk_Flag'])
y = risk_model_df['Stock_Risk_Flag']
X = X.select_dtypes(include=[np.number])  # use only numeric columns

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [27]:
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0)
    })

In [28]:
model_performance_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)

print("\nðŸ“Š Supervised Learning Model Performance Comparison:")
print(model_performance_df)

top_model = model_performance_df.iloc[0]
print(f"\nâœ… Top Recommended Classification Model:")
print(f"   {top_model['Model']} â†’ Accuracy: {top_model['Accuracy']:.2f} | Precision: {top_model['Precision']:.2f} | Recall: {top_model['Recall']:.2f} | F1 Score: {top_model['F1 Score']:.2f}")


ðŸ“Š Supervised Learning Model Performance Comparison:
                 Model  Accuracy  Precision  Recall  F1 Score
3    Gradient Boosting     0.875   0.916667    0.55  0.687500
1        Decision Tree     0.825   0.687500    0.55  0.611111
0  Logistic Regression     0.825   1.000000    0.30  0.461538
2        Random Forest     0.750   0.500000    0.15  0.230769

âœ… Top Recommended Classification Model:
   Gradient Boosting â†’ Accuracy: 0.88 | Precision: 0.92 | Recall: 0.55 | F1 Score: 0.69


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [30]:
expiry_loss_df = pd.read_excel("Expanded_ST_R_With_GAN.xlsx")

In [35]:
target_col = 'expiry_loss_flag'
drop_cols = [target_col, 'Opportunity lost from expired drugs', 'Drug_Name_Label', 'Category_Expired']

In [36]:
X = expiry_loss_df.drop(columns=drop_cols, errors='ignore').dropna()
y = expiry_loss_df.loc[X.index, target_col]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [38]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [39]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred, zero_division=0), 6),
        "Recall": round(recall_score(y_test, y_pred, zero_division=0), 2),
        "F1 Score": round(f1_score(y_test, y_pred, zero_division=0), 6)
    })

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False).reset_index(drop=True)

display_df = results_df.copy()
display_df[['Accuracy', 'Precision', 'Recall', 'F1 Score']] = display_df[['Accuracy', 'Precision', 'Recall', 'F1 Score']].round(3)

print("\nðŸ“Š Supervised Learning Model Performance Comparison:")
print(display_df.to_string(index=True))

top_model = display_df.iloc[0]
print("\nâœ… Top Recommended Classification Model:")
print(f"   {top_model['Model']} â†’ Accuracy: {top_model['Accuracy']:.3f} | "
      f"Precision: {top_model['Precision']:.3f} | Recall: {top_model['Recall']:.3f} | "
      f"F1 Score: {top_model['F1 Score']:.3f}")


ðŸ“Š Supervised Learning Model Performance Comparison:
                 Model  Accuracy  Precision  Recall  F1 Score
0        Random Forest      0.80      0.844    0.73     0.783
1    Gradient Boosting      0.76      0.771    0.73     0.750
2  Logistic Regression      0.76      0.788    0.70     0.743
3        Decision Tree      0.72      0.700    0.76     0.727

âœ… Top Recommended Classification Model:
   Random Forest â†’ Accuracy: 0.800 | Precision: 0.844 | Recall: 0.730 | F1 Score: 0.783


In [41]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [42]:
regression_data = pd.read_excel("Expanded_ST_R_With_GAN.xlsx")

In [43]:
X = regression_data.drop(columns=['expiry_loss_flag'])
y = regression_data['expiry_loss_flag']

In [44]:
X = X.select_dtypes(include=[np.number])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor()
}

In [48]:
regression_results = []
for name, model in regression_models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    regression_results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2 Score': r2_score(y_test, y_pred)
    })

In [56]:
regression_results_df = pd.DataFrame(regression_results).sort_values(by='R2 Score', ascending=False).reset_index(drop=True)

display_df = regression_results_df.copy()
display_df[['MAE', 'MSE', 'RMSE', 'R2 Score']] = display_df[['MAE', 'MSE', 'RMSE', 'R2 Score']].round(3)

print("\nðŸ“ˆ Supervised Regression Model Performance Comparison:")
print(display_df.to_string(index=True))

top_regressor = display_df.iloc[0]
print(f"\nâœ… Top Recommended Regression Model:")
print(f"   {top_regressor['Model']} â†’ RÂ²: {top_regressor['R2 Score']:.3f} | MAE: {top_regressor['MAE']:.3f} | RMSE: {top_regressor['RMSE']:.3f}")


ðŸ“ˆ Supervised Regression Model Performance Comparison:
                         Model    MAE    MSE   RMSE  R2 Score
0      Random Forest Regressor  0.218  0.095  0.309     0.612
1  Gradient Boosting Regressor  0.224  0.113  0.336     0.542
2            Linear Regression  0.261  0.140  0.374     0.431
3      Decision Tree Regressor  0.183  0.183  0.428     0.253

âœ… Top Recommended Regression Model:
   Random Forest Regressor â†’ RÂ²: 0.612 | MAE: 0.218 | RMSE: 0.309
