### Objective: Extract summarized date features from the Bosch assembly-line data

In [1]:
#Import Packages: 

from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
import zipfile
import pandas as pd
import numpy as np
import vaex
from pyspark.sql import functions as F
import re

In [2]:
#Create the Spark session (local mode, using every available core):

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark Project')
    .getOrCreate()
)
spark

In [3]:
#Read the training date csv file, treating its first row as the header:
date = spark.read.option('header', True).csv('../data/train_date.csv')

In [4]:
#Register the dataframe as a temporary view so it can be queried with SQL commands:
date.createOrReplaceTempView(name='date')

In [5]:
#Preview the first 10 rows; columns at positions 1-19 only (position 0 is skipped)
(
    date
    .limit(10)
    .toPandas()
    .iloc[:, 1:20]
)

Unnamed: 0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,L0_S0_D21,L0_S0_D23,L0_S1_D26,L0_S1_D30,L0_S2_D34,L0_S2_D38,L0_S2_D42,L0_S2_D46,L0_S2_D50
0,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24
1,,,,,,,,,,,,,,,,,,,
2,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7
3,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.21,1149.21,1149.21,1149.21,1149.21
4,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,,,,,
5,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,,,,,
6,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,
8,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64
9,,,,,,,,,,,,,,,,,,,


In [6]:
#Print the column names and the types Spark assigned to them when reading the csv:
date.printSchema()

root
 |-- Id: string (nullable = true)
 |-- L0_S0_D1: string (nullable = true)
 |-- L0_S0_D3: string (nullable = true)
 |-- L0_S0_D5: string (nullable = true)
 |-- L0_S0_D7: string (nullable = true)
 |-- L0_S0_D9: string (nullable = true)
 |-- L0_S0_D11: string (nullable = true)
 |-- L0_S0_D13: string (nullable = true)
 |-- L0_S0_D15: string (nullable = true)
 |-- L0_S0_D17: string (nullable = true)
 |-- L0_S0_D19: string (nullable = true)
 |-- L0_S0_D21: string (nullable = true)
 |-- L0_S0_D23: string (nullable = true)
 |-- L0_S1_D26: string (nullable = true)
 |-- L0_S1_D30: string (nullable = true)
 |-- L0_S2_D34: string (nullable = true)
 |-- L0_S2_D38: string (nullable = true)
 |-- L0_S2_D42: string (nullable = true)
 |-- L0_S2_D46: string (nullable = true)
 |-- L0_S2_D50: string (nullable = true)
 |-- L0_S2_D54: string (nullable = true)
 |-- L0_S2_D58: string (nullable = true)
 |-- L0_S2_D62: string (nullable = true)
 |-- L0_S2_D66: string (nullable = true)
 |-- L0_S3_D70: string 

In [7]:
def summarize_date(df):
    """Yield summary statistics for every column of ``df`` except the first.

    Each column after position 0 is cast to integer, rows that are null in
    that column are dropped, and the mean, standard deviation, minimum and
    maximum are aggregated and collected to pandas.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame whose first column is skipped and whose remaining columns are
        summarized one at a time.

    Yields
    ------
    tuple
        ``(column_name, values)`` where ``values`` is the ``.values`` of a
        pandas frame with the columns ``mean``, ``std``, ``min`` and ``max``.

    Notes
    -----
    One Spark aggregation is launched per column, so the total cost grows with
    the number of columns in ``df``.
    """
    for i in df.schema.names[1:]:
        # Resolve the column on the frame that was passed in. The previous
        # version read the notebook-global ``date`` here, so the function only
        # worked when it was called with that exact dataframe.
        # NOTE(review): the IntegerType cast looks like it discards the
        # fractional part of values such as 82.24 -- confirm this is intended.
        column_as_int = df[i].cast(IntegerType())
        df_date = (
            df.select(column_as_int)
            .dropna()
            .agg(
                F.mean(i).alias('mean'),
                F.stddev(i).alias('std'),
                F.min(i).alias('min'),
                F.max(i).alias('max'),
            )
            .toPandas()
        )
        if not df_date.empty:
            yield i, df_date.values

In [8]:
%%time
#Store the mean, standard deviation, minimum and maximum time in one dictionary each,
#keyed by column name.
#Example: the key L0_S0_D1 maps to a list holding that column's statistic, so every
#dictionary value is a nested list.

mean_time, std_time, min_time, max_time = {}, {}, {}, {}

for column_name, stat_rows in summarize_date(date):
    #Each row is laid out as (mean, std, min, max)
    mean_time[column_name] = [row[0] for row in stat_rows]
    std_time[column_name] = [row[1] for row in stat_rows]
    min_time[column_name] = [row[2] for row in stat_rows]
    max_time[column_name] = [row[3] for row in stat_rows]

CPU times: user 16.2 s, sys: 2.58 s, total: 18.8 s
Wall time: 3h 49min 17s


In [9]:
#Convert the key-value pairs into a dataframe (one row per key, one column per statistic)
df_date = (
    pd.DataFrame(
        data={
            'mean_time': mean_time,
            'std_dev': std_time,
            'min_time': min_time,
            'max_time': max_time,
        }
    )
    .reset_index()
)

In [10]:
#Rename columns; the first one (the former index) becomes 'production_line'
df_date.columns = [
    'production_line',
    'mean_time',
    'std_dev',
    'min_time',
    'max_time',
]

In [11]:
#View the summary dataframe (each statistic is still wrapped in a list at this point):
df_date

Unnamed: 0,production_line,mean_time,std_dev,min_time,max_time
0,L0_S0_D1,[881.7354829327073],[506.7221924026253],[0.0],[1713.0]
1,L0_S0_D3,[881.7354829327073],[506.7221924026253],[0.0],[1713.0]
2,L0_S0_D5,[881.7354829327073],[506.7221924026253],[0.0],[1713.0]
3,L0_S0_D7,[881.7354829327073],[506.7221924026253],[0.0],[1713.0]
4,L0_S0_D9,[881.7354829327073],[506.7221924026253],[0.0],[1713.0]
...,...,...,...,...,...
1151,L3_S51_D4255,[1022.6031944931749],[433.92253497004367],[0.0],[1457.0]
1152,L3_S51_D4257,[1022.6031944931749],[433.92253497004367],[0.0],[1457.0]
1153,L3_S51_D4259,[1022.6031944931749],[433.92253497004367],[0.0],[1457.0]
1154,L3_S51_D4261,[1022.6031944931749],[433.92253497004367],[0.0],[1457.0]


In [12]:
#Explode every column so the nested lists are flattened:
df_date_exp = (
    df_date
    .set_index('production_line')
    .apply(pd.Series.explode)
    .reset_index()
)

In [13]:
#View the dataframe after using explode:
df_date_exp

Unnamed: 0,production_line,mean_time,std_dev,min_time,max_time
0,L0_S0_D1,881.735,506.722,0,1713
1,L0_S0_D3,881.735,506.722,0,1713
2,L0_S0_D5,881.735,506.722,0,1713
3,L0_S0_D7,881.735,506.722,0,1713
4,L0_S0_D9,881.735,506.722,0,1713
...,...,...,...,...,...
1151,L3_S51_D4255,1022.6,433.923,0,1457
1152,L3_S51_D4257,1022.6,433.923,0,1457
1153,L3_S51_D4259,1022.6,433.923,0,1457
1154,L3_S51_D4261,1022.6,433.923,0,1457


In [14]:
#Save the summary as csv for further processing (the dataframe index is written as well):
df_date_exp.to_csv(path_or_buf='date_summary.csv')