<a href="https://colab.research.google.com/github/souravsl/project/blob/main/simulator_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
df = pd.read_csv('training_data.csv')
df.head(5)

Unnamed: 0,Label,SMILES,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,COC(=O)N(C)c1c(N)nc(nc1N)c2nn(Cc3ccccc3F)c4ncc...,1.821,1266.407,22.121,16.781,16.781,14.901,9.203,9.203,...,0,0,0,0,0,0,0,0,0,0
1,0,C[C@H](N(O)C(=O)N)c1cc2ccccc2s1,2.363,490.434,11.707,8.752,9.569,7.592,4.854,5.67,...,0,0,0,0,0,0,0,1,0,1
2,0,C[N+](C)(C)CC(=O)[O-],3.551,93.092,6.784,5.471,5.471,3.417,2.42,2.42,...,0,0,0,0,0,0,0,0,0,0
3,1,CC(C)n1c(\C=C\[C@H](O)C[C@H](O)CC(=O)O)c(c2ccc...,2.076,1053.003,21.836,16.995,16.995,14.274,9.926,9.926,...,0,0,0,0,0,0,0,0,0,0
4,1,C\C(=C(\C#N)/C(=O)Nc1ccc(cc1)C(F)(F)F)\O,2.888,549.823,14.629,9.746,9.746,8.752,5.04,5.04,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df["Label"].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,359
1,118


In [6]:
columns = df.columns
print(columns.value_counts)

<bound method IndexOpsMixin.value_counts of Index(['Label', 'SMILES', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v',
       'Chi1', 'Chi1n', 'Chi1v',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=198)>


In [9]:
rows = len(df)+1
print(rows)

478


In [13]:
class RowDeletionSimulator:
  def __init__(self,df,target_col):
    self.df = df.copy()
    self.target_col = target_col

  def stratified(self, delete_ratio=0.2):
      result = []
      for cls,group in self.df.groupby(self.target_col):
        keep_n = int(len(group)*(1-delete_ratio))
        result.append(group.sample(keep_n))
      return pd.concat(result).reset_index(drop=True)

  def skew(self, skew_class, target_ratio=None, delete_ratio=None):
        """
        Skew the dataset by removing rows from skew_class.

        You can use either:
        - target_ratio: desired minority:majority ratio (e.g., 1/4, 1/7)
        - delete_ratio: direct % to delete from skew_class
        """

        df = self.df.copy()
        cls_counts = df[self.target_col].value_counts()

        if skew_class not in cls_counts:
            raise ValueError(f"{skew_class} not found in dataset.")

        # Option 1: If delete_ratio is given → direct deletion
        if delete_ratio is not None:
            remove_n = int(cls_counts[skew_class] * delete_ratio)
            keep_n = cls_counts[skew_class] - remove_n

        # Option 2: If target_ratio is given → calculate needed deletions
        elif target_ratio is not None:
            majority_class = cls_counts.idxmax()
            majority_count = cls_counts.max()

            desired_minority = int(majority_count * target_ratio)
            keep_n = max(desired_minority, 1)  # avoid zero

        else:
            raise ValueError("Provide either target_ratio OR delete_ratio.")

        # Perform skewed deletion
        new_parts = []
        for cls, group in df.groupby(self.target_col):
            if cls == skew_class:
                keep_group = group.sample(keep_n, random_state=42)
            else:
                keep_group = group
            new_parts.append(keep_group)

        return pd.concat(new_parts).reset_index(drop=True)


  def generate_all(self, mild_ratio=1/4, high_ratio=1/7, skew_class=None):
        """
        Returns:
        Normal dataset
        Mild skew dataset
        High skew dataset
        """
        normal = self.df.copy()
        mild = self.skew(skew_class, target_ratio=mild_ratio)
        high = self.skew(skew_class, target_ratio=high_ratio)

        return normal, mild, high



In [14]:
sim = RowDeletionSimulator(df, target_col="Label")

In [21]:
strat_res=sim.stratified(delete_ratio=0.5)
skew_res = sim.skew(0,0.25)
normal, mild, high = sim.generate_all(
    mild_ratio=1/4,
    high_ratio=1/7,
    skew_class=1
)
print(strat_res)
print(skew_res)
print(mild)
print(high)


     Label                                             SMILES  BalabanJ  \
0        0    Nc1nc(CC(=O)Nc2ccc(CCNC[C@H](O)c3ccccc3)cc2)cs1     1.384   
1        0                                  CN(C)C(=N)N=C(N)N     3.928   
2        0  CC[C@@H]1\C=C(\C)/C[C@H](C)C[C@H](OC)[C@H]2O[C...     1.764   
3        0  CC1(C)O[C@@H]2C[C@@H]3[C@@H]4C[C@H](F)C5=CC(=O...     1.660   
4        0   Clc1ccc([C@@H]2CS\C(=C(\C#N)/n3ccnc3)\S2)c(Cl)c1     1.899   
..     ...                                                ...       ...   
233      1                       Nc1nnc(c(N)n1)c2cccc(Cl)c2Cl     2.633   
234      1                    COCCc1ccc(OC[C@H](O)CNC(C)C)cc1     2.286   
235      1   CCN[C@@H]1C[C@H](C)S(=O)(=O)c2sc(cc12)S(=O)(=O)N     2.645   
236      1  COc1ccc2nccc([C@@H](O)[C@@H]3C[C@@H]4CCN3C[C@@...     1.687   
237      1             CN1CCN(CCCN2c3ccccc3Sc4ccc(Cl)cc24)CC1     1.493   

      BertzCT    Chi0   Chi0n   Chi0v    Chi1   Chi1n   Chi1v  ...  \
0     880.936  19.769  15.468