# Introduction

Documentation [here](https://joeltanwr.github.io/DirtyDF/html/)

In [1]:
import sys
sys.path.insert(0, "../ddf/")

# Stainers

In [3]:
import importlib

In [5]:
import pandas as pd
import numpy as np

In [7]:
import stainer as SE

In [6]:
importlib.reload(SE)

In [8]:
df = pd.DataFrame([(0, 'Cat'), (1, 'Dog'), (2, 'Rabbit'), (3, 'Cat'), (4, 'Cat'), (5, 'Dog')], columns=('id', 'class'))

In [9]:
df

Unnamed: 0,id,class
0,0,Cat
1,1,Dog
2,2,Rabbit
3,3,Cat
4,4,Cat
5,5,Dog


In [13]:
shuffle_stainer = SE.ShuffleStainer()
rng = np.random.default_rng(42)

In [14]:
new_df, row_map, col_map = shuffle_stainer.transform(df, rng)
new_df

Unnamed: 0,id,class
0,3,Cat
1,2,Rabbit
2,5,Dog
3,4,Cat
4,1,Dog
5,0,Cat


In [15]:
dupli_stainer = SE.RowDuplicateStainer(deg=0.3)

In [16]:
new_df, row_map, col_map= dupli_stainer.transform(df, rng, row_idx=list(df.index))

In [17]:
new_df

Unnamed: 0,id,class
0,0,Cat
1,0,Cat
2,1,Dog
3,2,Rabbit
4,3,Cat
5,4,Cat
6,5,Dog


In [18]:
rng = np.random.default_rng(42) # reinitialize random generator
df2 = pd.DataFrame(zip(range(100), rng.choice(['Cat','Dog','Rabbit'], 100), rng.choice(['Cow', 'Sheep', 'Goat', 'Horse'], 100)),
                  columns=('id', 'class', 'class2'))


In [19]:
inflection_stainer=SE.InflectionStainer()

In [20]:
new_df2, row_map2, col_map2 = inflection_stainer.transform(df2, rng)
new_df2.head()

Unnamed: 0,id,class,class2
0,0,Cat,Horse
1,1,Rabbit,Cow
2,2,Dog,Horse
3,3,Dog,Cow
4,4,Dog,Horse


In [21]:
df=pd.DataFrame(zip(range(1, 6), np.random.choice(range(1, 101), 5, replace=True), 
                    np.random.choice(range(101, 200), 5, replace=True)),
               columns=('id', 'class1', 'class2'))
df

Unnamed: 0,id,class1,class2
0,1,95,133
1,2,11,103
2,3,39,178
3,4,11,120
4,5,7,166


In [22]:
numer_stainer=SE.FTransformStainer(deg=1)

In [23]:
new_df, a, b=numer_stainer.transform(df, rng, row_idx=[], col_idx=[1,2])

In [24]:
numer_stainer.get_history()

('Function Transform',
 'Converted column class1 with transformation cubert. \n Converted column class2 with transformation inverse. \n ',
 0.0025861263275146484)

In [25]:
null_stainer=SE.NullifyStainer(deg=0.5)

In [26]:
new_df, a, b = null_stainer.transform(df, rng, row_idx=[0,1,2,3], col_idx=[0,1,2])

In [27]:
new_df

Unnamed: 0,id,class1,class2
0,1.0,95.0,
1,,11.0,103.0
2,,39.0,178.0
3,,,
4,5.0,7.0,166.0


In [28]:
null_stainer.get_history()

('Nullify',
 'Replaced 6 values to become empty in specificed rows/cols.',
 0.001814126968383789)

In [29]:
bin_stainer=SE.BinningStainer()

In [30]:
new_df, a, b=bin_stainer.transform(df, rng, col_idx=[2])

In [31]:
cat_df = df2.copy()
cat_df['class']=cat_df['class']+" "+cat_df['class2']
cat_df

Unnamed: 0,id,class,class2
0,0,Cat Horse,Horse
1,1,Rabbit Cow,Cow
2,2,Dog Horse,Horse
3,3,Dog Cow,Cow
4,4,Dog Horse,Horse
...,...,...,...
95,95,Cat Cow,Cow
96,96,Cat Sheep,Sheep
97,97,Rabbit Cow,Cow
98,98,Dog Sheep,Sheep


In [32]:
col_stain=SE.ColumnSplitter(col_idx=[1])

In [143]:
cat_df.iloc[:, 1].str.split(" ")

0      [Cat, Horse]
1     [Rabbit, Cow]
2      [Dog, Horse]
3        [Dog, Cow]
4      [Dog, Horse]
          ...      
95       [Cat, Cow]
96     [Cat, Sheep]
97    [Rabbit, Cow]
98     [Dog, Sheep]
99     [Cat, Horse]
Name: class, Length: 100, dtype: object

# Dirty DF

In [5]:
import DirtyDF as ddf

In [15]:
animal = pd.DataFrame([(0, 'Cat'), (1, 'Dog'), (2, 'Rabbit'), (3, 'Cat'), (4, 'Cat'), (5, 'Dog')],
                  columns=('id', 'class'))
animal['numeric']=np.random.choice(range(1, 101), 6)
animal["class"] = animal["class"].astype("category")

In [21]:
animal.head(1)

Unnamed: 0,id,class,numeric
0,0,Cat,9


In [19]:
animal_ddf = ddf.DirtyDF(animal, seed = 123)

In [44]:
dupli_stainer=SE.RowDuplicateStainer(deg=0.5)

In [48]:
dupli_stainer.get_col_type()

'all'

In [43]:
new_df, a, b=dupli_stainer.transform(animal, rng)
new_df.shape

(9, 3)

In [54]:
# To shuffle rows of the df. No need to specify which rows or columns
shuffle_stainer=SE.ShuffleStainer() 
# Duplicate certain rows. Set it to duplicate 50% of the dataframe rows. Eligible on all columns
dupli_stainer=SE.RowDuplicateStainer(deg=0.5)
# Alter string columns
inflect_stainer=SE.InflectionStainer()

In [55]:
animal_ddf_mult=animal_ddf.add_stainers([shuffle_stainer, dupli_stainer, inflect_stainer])

In [56]:
animal_ddf_mult.summarise_stainers()

1. Shuffle
2. Add Duplicates
3. Inflection


In [57]:
animal_ddf_mult_out = animal_ddf_mult.run_all_stainers() #does the same as above

In [58]:
animal_ddf_mult_out.print_history()

1. Shuffle 
 Order of rows randomized 
 Time taken: 0.0016717910766601562 

2. Add Duplicates 
 Added Duplicate Rows for 3 rows. 
  Each duplicated row should appear a maximum of 2 times. 
  Rows added: 3 
 Time taken: 0.0026259422302246094 

3. Inflection 
 Categorical inflections on:
{'class': {'Cat': ['cat', 'CAT', 'Cat', 'Cats'], 'Rabbit': ['rabbit', 'Rabbits', 'RABBIT', 'Rabbit'], 'Dog': ['Dog', 'Dogs', 'dog', 'DOG']}} 
 Time taken: 0.022428274154663086 



In [59]:
animal_ddf_mult_out.get_df()

Unnamed: 0,id,class,numeric
0,4,Cats,2
1,0,CAT,9
2,0,Cats,9
3,2,Rabbits,75
4,2,Rabbit,75
5,3,CAT,2
6,1,DOG,55
7,1,DOG,55
8,5,DOG,81


In [60]:
animal_ddf.get_df()

Unnamed: 0,id,class,numeric
0,0,Cat,9
1,1,Dog,55
2,2,Rabbit,75
3,3,Cat,2
4,4,Cat,2
5,5,Dog,81
