# Parallelization of Data Frame creation

A simple example of how to distribute the creation of new columns for a pandas DataFrame across multiple CPU cores.

In [None]:
# imports
import pandas as pd
import multiprocessing
import numpy as np

In [None]:
# config
# One worker per CPU reported by the operating system.
number_of_cores = multiprocessing.cpu_count()

# Create DataFrame
# Sample data given row by row: (old_col_01, old_col_02).
test_dataframe = pd.DataFrame(
    [
        ('Jim', 'Halpert'),
        ('Pam', 'Bisley'),
        ('Micheal', 'Scott'),
        ('Dwight', 'Shrute'),
    ],
    columns=['old_col_01', 'old_col_02'],
)

In [None]:
# functions

# core function: creates the content for the new columns from a single row
def create_new_content(row):
    """Build the values of the two new columns for a single row.

    Args:
        row: Object exposing string attributes ``old_col_01`` and
            ``old_col_02`` (e.g. one row of the DataFrame as a Series).

    Returns:
        A two-element ``pd.Series``: ``old_col_01`` lower-cased followed by
        ``old_col_02`` upper-cased.
    """
    # Echo the row being processed.
    print(row)

    return pd.Series([row.old_col_01.lower(), row.old_col_02.upper()])

# main function: adds multiple new columns to the dataframe, based on existing columns
def add_columns(dataframe):
    """Add ``new_col_1`` and ``new_col_2`` to *dataframe*, computed row by row.

    Every row is passed to ``create_new_content``; the two values it returns
    become that row's entries in the new columns.

    Args:
        dataframe: pandas DataFrame whose rows provide the fields that
            ``create_new_content`` reads. May be empty.

    Returns:
        The same DataFrame object (modified in place) with the two new
        columns appended.
    """
    new_columns = ['new_col_1', 'new_col_2']

    # A chunk without rows (e.g. more workers than rows) has nothing to apply
    # over; add the columns directly so every chunk ends up with the same
    # set of columns.
    if len(dataframe.index) == 0:
        for column in new_columns:
            dataframe[column] = pd.Series(index=dataframe.index, dtype=object)
        return dataframe

    # axis=1 hands each *row* to the function. The default (axis=0) passes
    # whole columns, which do not have the old_col_* fields.
    dataframe[new_columns] = dataframe.apply(create_new_content, axis=1)

    return dataframe
    
# helper function: parallelizes the process
def parallize_dataframe(dataframe, func, n_cores):
    """Apply *func* to row-wise chunks of *dataframe* in parallel processes.

    The frame is split into contiguous row chunks, each chunk is sent to a
    worker process running ``func(chunk)``, and the returned frames are
    concatenated in their original order.

    Args:
        dataframe: pandas DataFrame to process.
        func: Callable taking a DataFrame chunk and returning a DataFrame.
            It is sent to worker processes, so it must be picklable
            (e.g. a module-level function).
        n_cores: Maximum number of worker processes / chunks; must be >= 1.

    Returns:
        A DataFrame built by concatenating the per-chunk results.

    Raises:
        ValueError: If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    n_rows = len(dataframe)
    # Never create more chunks than rows, so workers are not handed empty
    # frames; an empty input is still processed as one (empty) chunk.
    n_chunks = max(1, min(n_cores, n_rows))

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame is deprecated in recent pandas/NumPy versions.
    position_groups = np.array_split(np.arange(n_rows), n_chunks)
    chunks = [dataframe.iloc[positions] for positions in position_groups]

    # The context manager releases the pool even if func raises in a worker.
    # pool.map blocks until every result is available.
    with multiprocessing.Pool(n_chunks) as pool:
        results = pool.map(func, chunks)

    return pd.concat(results)

In [None]:
# Split test_dataframe into chunks, run add_columns on each chunk in a pool of
# number_of_cores worker processes, and evaluate to the concatenated result.
parallize_dataframe(test_dataframe, add_columns, number_of_cores)