### Objective: Extract Summarized Numerical features of Bosch assembly line

In [1]:
#Import Packages: 

from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
import zipfile
import pandas as pd
import numpy as np
import vaex
from pyspark.sql import functions as F
import re

In [2]:
#Create Spark session:

# Build (or reuse) a Spark session running locally with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark Project')
    .getOrCreate()
)
spark

In [3]:
#Read csv files:
#category_keys = spark.read.csv('../data/components_summary_with_keys.csv', header = True)
#date  = spark.read.csv('../data/train_date.csv', header = True)
# Load the numeric training measurements; the first CSV row supplies the column names.
numeric = spark.read.csv(
    path='../data/train_numeric.csv',
    header=True,
)

In [4]:
#Create a temporary view so the data can be queried with SQL commands:
#category.createOrReplaceTempView('category')
#date.createOrReplaceTempView('date')
# Registers (or replaces) the view named 'numeric' backed by the `numeric` DataFrame.
numeric.createOrReplaceTempView('numeric')

In [5]:
# Preview: limit to 10 rows before converting to pandas so only that sample is collected.
numeric.limit(10).toPandas()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,...,,,,,,,,,,0
1,6,,,,,,,,,,...,,,,,,,,,,0
2,7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,...,,,,,,,,,,0
3,9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,...,,,,,,,,,,0
4,11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,...,,,,,,,,,,0
5,13,0.003,0.019,0.294,0.312,0.031,0.161,0.022,0.088,-0.005,...,,,,,,,,,,0
6,14,,,,,,,,,,...,,,,,,,,,,0
7,16,,,,,,,,,,...,,,,,,,,,,0
8,18,-0.016,-0.041,-0.179,-0.179,-0.056,0.161,-0.007,-0.032,-0.082,...,,,,,,,,,,0
9,23,,,,,,,,,,...,,,,,,,,,,0


In [6]:
# Show column names and types. The CSV was read without a schema, and the
# columns are reported as strings in the output below.
numeric.printSchema()

root
 |-- Id: string (nullable = true)
 |-- L0_S0_F0: string (nullable = true)
 |-- L0_S0_F2: string (nullable = true)
 |-- L0_S0_F4: string (nullable = true)
 |-- L0_S0_F6: string (nullable = true)
 |-- L0_S0_F8: string (nullable = true)
 |-- L0_S0_F10: string (nullable = true)
 |-- L0_S0_F12: string (nullable = true)
 |-- L0_S0_F14: string (nullable = true)
 |-- L0_S0_F16: string (nullable = true)
 |-- L0_S0_F18: string (nullable = true)
 |-- L0_S0_F20: string (nullable = true)
 |-- L0_S0_F22: string (nullable = true)
 |-- L0_S1_F24: string (nullable = true)
 |-- L0_S1_F28: string (nullable = true)
 |-- L0_S2_F32: string (nullable = true)
 |-- L0_S2_F36: string (nullable = true)
 |-- L0_S2_F40: string (nullable = true)
 |-- L0_S2_F44: string (nullable = true)
 |-- L0_S2_F48: string (nullable = true)
 |-- L0_S2_F52: string (nullable = true)
 |-- L0_S2_F56: string (nullable = true)
 |-- L0_S2_F60: string (nullable = true)
 |-- L0_S2_F64: string (nullable = true)
 |-- L0_S3_F68: string 

In [7]:
def summarize_num(df, label_col='Response', top_n=1000):
    """Yield, per feature column, its most frequent (value, label) pairs.

    For every column of ``df`` after the first one, rows with a null in the
    selected columns are dropped, the remaining rows are counted per distinct
    (feature value, label) pair, and at most ``top_n`` pairs are collected
    into pandas, ordered by label and then by count (both descending).

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input frame. Its first column is skipped (in this notebook that is
        the ``Id`` column).
    label_col : str, default 'Response'
        Name of the label column that each feature is paired with.
    top_n : int, default 1000
        Maximum number of (value, label) pairs collected per feature.

    Yields
    ------
    pandas.DataFrame
        One frame per feature with three columns: the feature, the label and
        ``count``. Features that have no remaining rows are skipped.

    Notes
    -----
    * ``toPandas()`` is called inside the loop, so every feature column is
      evaluated separately; on a wide frame this is slow.
    * ``label_col`` is not excluded from the iteration: when it is one of the
      columns after the first, it is summarised against itself and yielded
      like any other feature.
    """
    for feature in df.schema.names[1:]:
        top_pairs = (
            df.select(feature, label_col)
            .dropna()
            .groupBy(feature, label_col)
            .count()
            .orderBy(label_col, 'count', ascending=False)
            .limit(top_n)
            .toPandas()
        )
        if not top_pairs.empty:
            yield top_pairs

In [8]:
%%time
#Store the test values, responses and counts in three dictionaries keyed by feature name; each value is a list.
#Example: key L0_S0_F0 maps to the test values [0.003, 0.03...], the responses [1, 0...] and the counts [100, 130...]


# One dictionary per summary field, each keyed by the feature (column) name.
production_line, response, count = {}, {}, {}
for summary in summarize_num(numeric):
    feature = summary.columns[0]
    rows = summary.values
    production_line[feature] = [row[0] for row in rows]  # first column: test values
    response[feature] = [row[1] for row in rows]         # second column: response
    count[feature] = [row[2] for row in rows]            # third column: count

CPU times: user 11.8 s, sys: 1.72 s, total: 13.5 s
Wall time: 2h 12min 10s


In [9]:
#Combine the three dictionaries into one dataframe with a row per feature;
#reset_index() moves the feature names out of the index into a regular column.
df_numeric = pd.DataFrame(
    dict(
        production_line=production_line,
        response=response,
        count=count,
    )
).reset_index()

In [10]:
#Rename column names:
# After reset_index() the first column holds the feature names, while the column
# built from the `production_line` dictionary holds the test values - hence the
# first column becomes 'production_line' and the second 'test_values'.
df_numeric.columns = ['production_line', 'test_values', 'response', 'count']

In [12]:
#Save file as csv for further processing:
# NOTE(review): test_values, response and count hold lists at this point, so they are
# presumably written as their string representation - confirm before reading the file back.
df_numeric.to_csv('numeric_summary.csv')

In [13]:
#Next we will calculate mean, std on test values and count grouped by production_line and response values:
#This will help us understand the central tendency of the data. 
# We will also check the distribution to see if test values are normally distributed. 
#pd.to_numeric(df_numeric['test_values'][0]).mean()

In [14]:
#Explode the nested lists (test_values, response and count) into one row per value
#so that the mean and std of the test values can be computed.
df_numeric_exp = (
    df_numeric
    .set_index('production_line')
    .apply(lambda column: column.explode())
    .reset_index()
)

In [15]:
#Assess the datatypes of the exploded frame:
df_numeric_exp.info()
#All four columns are reported as object dtype, so test_values and count
#need to be converted to numeric values before aggregating.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407304 entries, 0 to 407303
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   production_line  407304 non-null  object
 1   test_values      407304 non-null  object
 2   response         407304 non-null  object
 3   count            407304 non-null  object
dtypes: object(4)
memory usage: 12.4+ MB


In [16]:
#Exploded view: drop the rows that describe the Response column itself.
#
#summarize_num() also processes the Response column, so the exploded frame ends with
#rows whose production_line is 'Response'. They only show the imbalance between
#inspections passed (0) and failed (1), so they are removed here to analyse just the
#production line statistics.
#
#The rows are selected by name rather than by slicing off the last two rows, so that
#re-running this cell does not drop further rows. .copy() returns an independent
#frame, so later column assignments do not operate on a slice of the original.
df_numeric_exp = df_numeric_exp[df_numeric_exp['production_line'] != 'Response'].copy()

In [17]:
#Convert test_values and count to numeric values.
#assign() returns a new frame with the two columns replaced (column order is kept),
#instead of writing into a frame that was produced by slicing; this avoids
#SettingWithCopyWarning and makes the cell safe to re-run.
df_numeric_exp = df_numeric_exp.assign(
    test_values=pd.to_numeric(df_numeric_exp['test_values']),
    count=pd.to_numeric(df_numeric_exp['count']),
)

In [18]:
# Per (production_line, response) group: mean and std of the collected test values,
# plus the mean of their occurrence counts.
df_numeric_grouped = (
    df_numeric_exp
    .groupby(['production_line', 'response'])
    .agg({'test_values': ['mean', 'std'], 'count': 'mean'})
)

In [24]:
# Inspect the grouped summary; at this point the column header still has two levels.
df_numeric_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,test_values,test_values,count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean
production_line,response,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
L0_S0_F0,0,-0.099422,0.222564,5778.051724
L0_S0_F0,1,-0.016532,0.151396,45.670886
L0_S0_F10,0,-0.130318,0.302040,30466.090909
L0_S0_F10,1,-0.179889,0.242620,200.444444
L0_S0_F12,0,0.113607,0.243375,23937.642857
...,...,...,...,...
L3_S51_F4258,1,0.000500,0.000707,151.000000
L3_S51_F4260,0,0.129337,0.175427,744.387500
L3_S51_F4260,1,0.002000,0.002000,100.666667
L3_S51_F4262,0,0.051629,0.167102,1701.457143


In [25]:
# Replace the two-level column header produced by agg() with single, flat names
# (same order: test_values mean, test_values std, count mean).
df_numeric_grouped.columns  = ['test_values_mean' , 'test_values_std', 'count_mean'] 

In [26]:
# Inspect the grouped summary again, now with the flattened column names.
df_numeric_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,test_values_mean,test_values_std,count_mean
production_line,response,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L0_S0_F0,0,-0.099422,0.222564,5778.051724
L0_S0_F0,1,-0.016532,0.151396,45.670886
L0_S0_F10,0,-0.130318,0.302040,30466.090909
L0_S0_F10,1,-0.179889,0.242620,200.444444
L0_S0_F12,0,0.113607,0.243375,23937.642857
...,...,...,...,...
L3_S51_F4258,1,0.000500,0.000707,151.000000
L3_S51_F4260,0,0.129337,0.175427,744.387500
L3_S51_F4260,1,0.002000,0.002000,100.666667
L3_S51_F4262,0,0.051629,0.167102,1701.457143


In [28]:
# Save the grouped summary to 'numeric_grouped.csv' in the working directory.
df_numeric_grouped.to_csv('numeric_grouped.csv')