# Goal 

Ways to apply a transformation to columns (all of them or a subset) in PySpark. 

### Purpose
Fundamental understanding and implementation

In [2]:
import math;

In [3]:
import findspark
findspark.init()

import pyspark;
from pyspark.ml.feature import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
import pyspark.sql.functions as f
import functools

In [4]:
# All options must be supplied on the builder *before* getOrCreate().
# Calling spark.builder.config(...) after the session exists only records the
# option on the builder; it does not reconfigure the session already running.
spark = (
    SparkSession.builder
    .master('local')
    .appName('playground')
    .config('spark.driver.memory', '5G')
    .config('spark.executor.memory', '16G')
    .config('spark.executor.cores', '4')
    .getOrCreate()
)

<pyspark.sql.session.SparkSession.Builder at 0x1118e4400>

## 1. Using Python's functools REDUCE

In [8]:
# Deliberately tiny DataFrame so the effect of each transformation is easy to follow

raw_df = spark.createDataFrame(
    data=[(10, 100), (10000, 1000000)],
    schema=["Column1", "Column2"],  # column names
)

raw_df.show()

+-------+-------+
|Column1|Column2|
+-------+-------+
|     10|    100|
|  10000|1000000|
+-------+-------+



In [6]:
# functools.reduce(function, iterable, initial value):
#   - raw_df is the starting accumulator,
#   - each column name is fed to the function in turn,
#   - the DataFrame returned by one step is the accumulator for the next.
using_reduce_df = functools.reduce(
    # Overwrite the named column with its base-10 logarithm.
    lambda acc_df, name: acc_df.withColumn(name, f.log10(f.col(name))),
    raw_df.columns,  # the iterable: names of the columns to transform
    raw_df,          # the initial value: the untransformed DataFrame
)

using_reduce_df.show()

+-------+-------+
|Column1|Column2|
+-------+-------+
|    1.0|    2.0|
|    4.0|    6.0|
+-------+-------+



## 2. All columns using LIST COMPREHENSION

In [10]:
# Build one log10 expression per column with a list comprehension and select
# them all at once; the result is stored in a separate DataFrame.
using_comp_df = raw_df.select(
    *[f.log10(f.col(column_name)) for column_name in raw_df.columns]
)
using_comp_df.show()

+--------------+--------------+
|LOG10(Column1)|LOG10(Column2)|
+--------------+--------------+
|           1.0|           2.0|
|           4.0|           6.0|
+--------------+--------------+



As you can see, these column names are a little cumbersome to work with later (think visualizations, further downstream transformations, etc.).  
So we can modify the select above a little bit to give each column a friendlier name.

In [21]:
# Same log10-per-column select, but every expression is aliased back to its
# source column name so the result is not labelled "LOG10(...)".
# Tip: alias to column_name + "<any suffix>" instead to customise the names.
using_comp_df_2 = raw_df.select(
    *[
        f.log10(f.col(column_name)).alias(column_name)
        for column_name in raw_df.columns
    ]
)
using_comp_df_2.show()

+-------+-------+
|Column1|Column2|
+-------+-------+
|    1.0|    2.0|
|    4.0|    6.0|
+-------+-------+



In [22]:
# Shut down the Spark session started at the top of the notebook.
spark.stop()