# Loops

## Union

In [0]:
#-------------------- DATAFRAMES --------------------#
from random import choice

# Column names shared by the three sample dataframes
columns = [
    "id", "label", "label_wrong", "value",
    "bool1", "bool2", "bool3", "bool4", "bool5",
]
data_list1, data_list2, data_list3 = [], [], []

# Build rows with ids 1..15 and route them in runs of five:
# ids 1-5 -> data_list1, ids 6-10 -> data_list2, ids 11-15 -> data_list3
for i in range(1, 16):
    # Five independent random booleans, one per bool1..bool5 column
    flags = tuple(choice([True, False]) for _ in range(5))
    row = (i, f'label{i}', 'wrong', i * 1000, *flags)
    if i <= 5:
        data_list1.append(row)
    elif i <= 10:
        data_list2.append(row)
    else:
        data_list3.append(row)

# One dataframe per row list, all with the same column names
df1 = spark.createDataFrame(data=data_list1, schema=columns)
df2 = spark.createDataFrame(data=data_list2, schema=columns)
df3 = spark.createDataFrame(data=data_list3, schema=columns)

#-------------------- UNION --------------------#
# libs
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, BooleanType

# Explicit schema of the combined dataframe; it includes the extra
# `dataframe` column that records which source each row came from
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("dataframe", StringType(), True),
        StructField("label", StringType(), True),
        StructField("label_wrong", StringType(), True),
        StructField("value", IntegerType(), True),
        # bool1 .. bool5, all nullable booleans
        *(StructField(f"bool{n}", BooleanType(), True) for n in range(1, 6)),
    ]
)

# Empty dataframe used as the starting point of the union loop
df = spark.createDataFrame(data=[], schema=schema)

dfs_names = ['df1', 'df2', 'df3']
dfs_list = [df1, df2, df3]

# Tag every source dataframe with its name, then append it to `df`.
# unionByName pairs columns by name, so the position at which withColumn
# adds `dataframe` does not need to match the schema above.
for name, dataframe in zip(dfs_names, dfs_list):
    tagged = dataframe.withColumn('dataframe', lit(name))
    df = df.unionByName(tagged)

display(df)

id,dataframe,label,label_wrong,value,bool1,bool2,bool3,bool4,bool5
1,df1,label1,wrong,1000,False,False,True,True,True
2,df1,label2,wrong,2000,False,True,False,False,False
3,df1,label3,wrong,3000,True,False,True,True,True
4,df1,label4,wrong,4000,True,False,False,True,True
5,df1,label5,wrong,5000,False,False,False,False,True
6,df2,label6,wrong,6000,False,True,False,True,True
7,df2,label7,wrong,7000,False,True,True,True,True
8,df2,label8,wrong,8000,True,True,True,True,False
9,df2,label9,wrong,9000,True,True,True,True,True
10,df2,label10,wrong,10000,False,False,True,True,True


## Split dataframe (to dict)

In [0]:
#-------------------- SPLIT DATAFRAME | DICT --------------------#
# Map each source name to the rows of `df` whose `dataframe` column
# holds that name
df_dict = {
    name: df.filter(df["dataframe"] == name)
    for name in dfs_names
}

print(df_dict.keys())

dict_keys(['df1', 'df2', 'df3'])


## Create list & add column

In [0]:
#-------------------- LIST TO COLUMN --------------------#
from pyspark.sql.functions import monotonically_increasing_id 

# One single-field tuple per row of `df`: new_label1 .. new_labelN
label_list = [(f'new_label{n}',) for n in range(1, df.count() + 1)]

# Both sides are collapsed to one partition and given a generated `new_id`,
# which is then used as the join key to attach one label to each row.
# NOTE(review): the pairing relies on both sides keeping their row order
# through repartition(1) — confirm this holds for the data in use.
labels_indexed = (
    spark.createDataFrame(data=label_list, schema=['new_label'])
    .repartition(1)
    .withColumn("new_id", monotonically_increasing_id())
)
rows_indexed = (
    df.repartition(1)
    .withColumn("new_id", monotonically_increasing_id())
)

# The helper key is dropped once the join is done
df_label = (
    labels_indexed
    .join(rows_indexed, on=['new_id'], how='inner')
    .drop('new_id')
)
display(df_label)

new_label,id,dataframe,label,label_wrong,value,bool1,bool2,bool3,bool4,bool5
new_label1,1,df1,label1,wrong,1000,False,False,True,True,True
new_label2,2,df1,label2,wrong,2000,False,True,False,False,False
new_label3,3,df1,label3,wrong,3000,True,False,True,True,True
new_label4,4,df1,label4,wrong,4000,True,False,False,True,True
new_label5,5,df1,label5,wrong,5000,False,False,False,False,True
new_label6,6,df2,label6,wrong,6000,False,True,False,True,True
new_label7,7,df2,label7,wrong,7000,False,True,True,True,True
new_label8,8,df2,label8,wrong,8000,True,True,True,True,False
new_label9,9,df2,label9,wrong,9000,True,True,True,True,True
new_label10,10,df2,label10,wrong,10000,False,False,True,True,True


## Percentage

In [0]:
#-------------------- PERCENTAGE --------------------#
import pyspark.sql.functions as f 
from pyspark.sql.functions import col

# `Window` is used below but was never imported in this notebook, which
# made the cell fail with NameError
from pyspark.sql import Window

# Columns that get a companion "<name>_p" percentage-of-total column
percentage_columns = ['value']

# partitionBy() with no arguments defines a single window over all rows,
# so f.sum(...).over(total_window) is the grand total of the column
total_window = Window.partitionBy()

df_perc = df_label
for column in df_perc.columns:
    if column in percentage_columns:
        # Share of the column total, as a percentage rounded to 2 decimals
        share = col(column) / f.sum(column).over(total_window)
        df_perc = df_perc.withColumn(f'{column}_p', f.round(share * 100, 2))
display(df_perc)

new_label,id,dataframe,label,label_wrong,value,bool1,bool2,bool3,bool4,bool5,value_p
new_label1,1,df1,label1,wrong,1000,False,False,True,True,True,0.83
new_label2,2,df1,label2,wrong,2000,False,True,False,False,False,1.67
new_label3,3,df1,label3,wrong,3000,True,False,True,True,True,2.5
new_label4,4,df1,label4,wrong,4000,True,False,False,True,True,3.33
new_label5,5,df1,label5,wrong,5000,False,False,False,False,True,4.17
new_label6,6,df2,label6,wrong,6000,False,True,False,True,True,5.0
new_label7,7,df2,label7,wrong,7000,False,True,True,True,True,5.83
new_label8,8,df2,label8,wrong,8000,True,True,True,True,False,6.67
new_label9,9,df2,label9,wrong,9000,True,True,True,True,True,7.5
new_label10,10,df2,label10,wrong,10000,False,False,True,True,True,8.33


## Col names: replace with map
Useful when transforming column names that share a prefix or suffix.

In [0]:
#-------------------- COL NAMES: REPLACE --------------------#
# Column names of df_label, minus the ones that must not be renamed.
# Kept as a real list (the previous map object was exhausted after one use).
excluded_columns = ['new_label', 'label_wrong']
list_label = [name for name in df_label.columns if name not in excluded_columns]

# Swap the substring 'label' for 'tag' in every remaining column name
list_label_replace = [name.replace('label', 'tag') for name in list_label]
print(list_label_replace)

['id', 'dataframe', 'tag', 'value', 'bool1', 'bool2', 'bool3', 'bool4', 'bool5']
