# Parallelization of Data Frame creation

A simple example of how to distribute the creation of new columns for a pandas DataFrame across multiple CPU cores.

In [36]:
# imports
import pandas as pd
import multiprocessing
import numpy as np

In [40]:
# config
# One worker per CPU reported by the operating system.
number_of_cores = multiprocessing.cpu_count()

# Create DataFrame: one tuple per row, column names given separately.
test_dataframe = pd.DataFrame(
    [
        ('Jim', 'Halpert'),
        ('Pam', 'Bisley'),
        ('Micheal', 'Scott'),
        ('Dwight', 'Shrute'),
    ],
    columns=['old_col_01', 'old_col_02'],
)

In [41]:
# functions

# core function: creates content for new columns
def create_new_content(row):
    """Build the values of the two new columns for a single row.

    Reads the ``old_col_01`` and ``old_col_02`` attributes of ``row`` and
    returns a two-element ``pd.Series``: the first value lower-cased, the
    second value upper-cased.
    """
    return pd.Series([row.old_col_01.lower(), row.old_col_02.upper()])

# main function: adds multiple new columns to the dataframe, based on existing columns
def add_columns(dataframe):
    """Add ``new_col_01`` and ``new_col_02`` to ``dataframe`` and return it.

    Every row is passed to ``create_new_content``; the two values it returns
    become that row's entries in the new columns. The frame is modified in
    place and the same object is returned.

    A frame without rows is handled separately: ``apply(axis=1)`` has no rows
    to call the row function on and can return a frame shaped like the input,
    so assigning its result to exactly two new columns is not reliable.
    In that case two empty object-dtype columns are added instead.
    """
    new_columns = ['new_col_01', 'new_col_02']

    # Empty input (e.g. a surplus chunk from a row-wise split): nothing to
    # compute, just make sure the output has the same columns as any other chunk.
    if len(dataframe) == 0:
        for column in new_columns:
            dataframe[column] = pd.Series(index=dataframe.index, dtype=object)
        return dataframe

    # apply(axis=1) yields one Series per row, which pandas expands into a
    # two-column frame that lines up with the two target column names.
    dataframe[new_columns] = dataframe.apply(create_new_content, axis=1)

    return dataframe
    
# helper function: parallelizes the process
def parallize_dataframe(dataframe, func, n_cores=None):
    """Apply ``func`` to row-wise chunks of ``dataframe`` using a worker pool.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to process. It is cut into consecutive blocks of rows.
    func : callable
        Called once per chunk with a DataFrame. Its return values are handed
        to ``pd.concat``.
    n_cores : int, optional
        Upper bound for the number of chunks and pool workers. When omitted,
        ``multiprocessing.cpu_count()`` is used.

    Returns
    -------
    The result of ``pd.concat`` over the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores is None:
        n_cores = multiprocessing.cpu_count()
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    # Never create more chunks (or workers) than there are rows, so ``func`` is
    # not handed empty frames. An empty input is still passed on as one chunk.
    n_chunks = max(1, min(n_cores, len(dataframe)))

    # Split the row positions instead of the frame itself and slice with iloc;
    # the chunk boundaries are the ones np.array_split would pick for the frame.
    row_groups = np.array_split(np.arange(len(dataframe)), n_chunks)
    df_split = [dataframe.iloc[rows] for rows in row_groups]

    # The context manager releases the pool even when ``func`` raises.
    with multiprocessing.Pool(n_chunks) as pool:
        results = pool.map(func, df_split)

    return pd.concat(results)

In [42]:
# Run add_columns on chunks of the test frame, one chunk per configured core,
# and keep the concatenated result.
df = parallize_dataframe(
    dataframe=test_dataframe,
    func=add_columns,
    n_cores=number_of_cores,
)

df

Unnamed: 0,old_col_01,old_col_02,new_col_01,new_col_02
0,Jim,Halpert,jim,HALPERT
1,Pam,Bisley,pam,BISLEY
2,Micheal,Scott,micheal,SCOTT
3,Dwight,Shrute,dwight,SHRUTE
