---

In [1]:
# Standard library
import multiprocessing

# Third-party
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML, display
from joblib import Parallel, delayed  # used for the parallel group-wise apply

# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Raise pandas' display limits (rows, columns, line width) in a single call.
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

In [2]:
# Load the iris dataset into a DataFrame via seaborn
iris_df = sns.load_dataset('iris')

Find more datasets here: **https://github.com/mwaskom/seaborn-data**

In [3]:
# Repeat the DataFrame 50,000 times (150 rows -> 7.5M rows) to get a large test set.
# NOTE: this cell reassigns iris_df from itself, so re-running it multiplies the rows again.
iris_df = pd.concat([iris_df]*50000, ignore_index=True)

In [4]:
# Sanity-check the enlarged frame: dimensions, per-species row counts, first rows.
# All three results are displayed because ast_node_interactivity is set to "all" above.
iris_df.shape
iris_df['species'].value_counts()
iris_df.head()

(7500000, 5)

virginica     2500000
versicolor    2500000
setosa        2500000
Name: species, dtype: int64

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


**7.5 M records should do the trick**

----
Define a custom function to create an arbitrary feature: **feature1** = petal_length * petal_width * mean_sepal_width (the mean of sepal_width within each species)

In [5]:
def feature_creator(df):
    """Return a copy of ``df`` with the derived column ``feature1`` added.

    ``feature1`` is ``petal_length * petal_width`` multiplied by the mean of
    ``sepal_width`` taken over all rows of ``df``. When the function is called
    once per group (e.g. per species), that mean is the group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``petal_length``, ``petal_width`` and
        ``sepal_width`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus ``feature1``; ``df``
        itself is left unmodified.
    """
    # Compute the mean once and reuse it for both the feature and the printout.
    mean_sepal_width = df['sepal_width'].mean()

    # assign() returns a new frame, so the caller's group is never mutated.
    result = df.assign(
        feature1=df['petal_length'] * df['petal_width'] * mean_sepal_width
    )

    # Kept for demonstration: shows the mean used for this call.
    print(mean_sepal_width)

    return result

---

In [8]:
# define function to apply function in parallel to each group
def applyParallel(dfGrouped, func):
    """Apply ``func`` to each group of ``dfGrouped`` in parallel and concatenate the results.

    Parameters
    ----------
    dfGrouped : iterable of (name, group) pairs
        Typically a pandas ``GroupBy`` object. Only the group is passed to
        ``func``; the group name is ignored.
    func : callable
        Called as ``func(group)``. Its return values are handed to
        ``pd.concat``.

    Returns
    -------
    The result of ``pd.concat`` over the per-group return values.
    """
    # Leave one core free, but never go below one worker: on a single-core
    # machine cpu_count() - 1 is 0, which joblib rejects with a ValueError.
    n_workers = max(1, multiprocessing.cpu_count() - 1)
    retLst = Parallel(n_jobs=n_workers)(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

In [9]:
# Baseline: sequential group-wise apply, one feature_creator call per species
iris_final = iris_df.groupby('species').apply(feature_creator)

3.4280000000203206
2.7700000000107075
2.973999999985015


In [None]:
# Parallel apply - a speed-up would only be expected for a computationally
# heavy custom function applied over a large number of groups.
# NOTE: this overwrites the iris_final produced by the regular apply.
iris_final = applyParallel(iris_df.groupby('species'), feature_creator)

----