In [None]:
import os
import ast
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib import rcParams
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN, RandomOverSampler,BorderlineSMOTE, KMeansSMOTE
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Global matplotlib text settings: use the serif family, with
# Times New Roman as the serif face, for every figure in this notebook.
rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

In [None]:
# Load the imputed dataset and discard the columns that are excluded from
# this analysis.
columns_to_drop = [
    'Extrusion_Rate_Lengthwise_(mm/s)',
    'Extrusion_Rate_Volume-wise_(mL/s)',
    'chamber Temperature  (°C)',
]
data = pd.read_excel('imputed_d_xgbf_1000iteration.xlsx').drop(columns=columns_to_drop)

# Divide the cell density by 1e6 (going by the column name, this turns
# cells/mL into millions of cells/mL) so its magnitude is closer to the
# other features.
data['Cell_Density_(cells/mL)'] = data['Cell_Density_(cells/mL)'].div(1e6)

In [None]:
# `df` is a second name for the same DataFrame object as `data` (no copy).
df = data

# Feature matrix: every column except the three target columns.
target_columns = ['Printability', 'Cell Response', 'Scaffold Quality (P*C)']
targetless = df.drop(columns=target_columns)

In [None]:
# For each target column: plot the class distribution in a 2-D PCA projection,
# oversample the minority classes with SMOTE, save the resampled dataset to
# CSV, record class counts before/after, and plot the resampled distribution
# in the same PCA projection.

# Column order of the per-target class-count summary (`class_counts`).
class_count_columns = ['Target', 'Class', 'Count Before Over Sampeling', 'Count After Over Sampeling']

targets = ['Printability', 'Cell Response', 'Scaffold Quality (P*C)']

# The CSV exports below go into this folder; create it up front so a fresh
# checkout does not fail on the first `to_csv`.
output_dir = 'smote'
os.makedirs(output_dir, exist_ok=True)

# The feature matrix is the same for every target, so standardise it and fit
# the 2-component PCA once instead of on every loop iteration.
scaler = StandardScaler()
pca = PCA(n_components=2)
X = scaler.fit_transform(targetless)
X_pca = pca.fit_transform(X)


def _scatter_by_class(ax, coords, labels, title):
    """Draw one scatter series per class on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    coords : 2-D array whose first two columns are the x / y coordinates.
    labels : array-like of class labels, one per row of `coords`.
    title : title text for the panel.
    """
    labels = np.asarray(labels)
    for cls in np.unique(labels):
        mask = labels == cls
        ax.scatter(coords[mask, 0], coords[mask, 1], label=f'Class {cls}', alpha=0.5, s=100)
    ax.set_title(title, fontsize=30, fontweight='bold')
    ax.set_xlabel('PCA Component 1', fontsize=20)
    ax.set_ylabel('PCA Component 2', fontsize=20)
    ax.tick_params(axis='both', labelsize=18)
    ax.legend(title='Class', fontsize=16)


# Top row: original data; bottom row: data after oversampling.
fig, axes = plt.subplots(2, 3, figsize=(42, 24))

# Per-target count tables are collected here and concatenated once after the
# loop (avoids repeatedly concatenating onto an initially empty frame).
class_count_frames = []

for i, target in enumerate(targets, 1):
    y = df[target]
    classes_before = y.value_counts().reset_index()
    classes_before.columns = ['Class', 'Count Before Over Sampeling']

    _scatter_by_class(
        axes[0, i - 1], X_pca, y,
        f'{target} Data Distribution Before Over Sampeling',
    )

    # NOTE(review): k_neighbors=2 for this target is presumably needed because
    # its smallest class has too few samples for SMOTE's default neighbour
    # count -- confirm against the class counts.
    if target == 'Scaffold Quality (P*C)':
        smote = SMOTE(random_state=42, k_neighbors=2)
    else:
        smote = SMOTE(random_state=42)

    # SMOTE is run on the unscaled features so the exported CSV stays in the
    # original units.
    # NOTE(review): neighbour distances are therefore computed on unscaled
    # features -- confirm this is intended.
    X_resampled, y_resampled = smote.fit_resample(targetless, y)

    # Reuse the scaler fitted on the original data (transform, not
    # fit_transform) so the resampled points are projected into the same PCA
    # space as the "before" panel and the two rows are directly comparable.
    X_resampled_pca = pca.transform(scaler.transform(X_resampled))

    y_resampled_df = pd.Series(y_resampled, name=target)
    resampled_df = pd.concat([X_resampled, y_resampled_df], axis=1)

    file_name = f"{target.replace(' ', '_').replace('*', 'x')}_resampled_df.csv"
    resampled_df.to_csv(f"{output_dir}/{file_name}", index=False)
    print(f"Saved resampled dataset for {target} as {file_name}")

    classes_after = pd.Series(y_resampled).value_counts().reset_index()
    classes_after.columns = ['Class', 'Count After Over Sampeling']

    class_count = pd.merge(classes_before, classes_after, on='Class', how='outer')
    class_count['Target'] = target
    class_count_frames.append(class_count)

    _scatter_by_class(
        axes[1, i - 1], X_resampled_pca, y_resampled,
        f'{target} Data Distribution After Over Sampeling',
    )

# One row per (target, class) with the counts before and after oversampling.
class_counts = pd.concat(class_count_frames, ignore_index=True)[class_count_columns]

fig.tight_layout()
plt.show()