# Data Exploration

### Import Packages
Import the necessary packages that will be used in the exploration.

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

### Data Loader
Load the data file into the notebook using pandas.

In [None]:
# Active dataset: siemens_amberg_labeled.
# File_NAME is appended directly to a dataset directory when the CSV is read,
# which is why it carries its own leading "/".
File_NAME = "/siemens_amberg_labeled.csv"
SIEMENS_DATA_PATH= "./data/siemens_amberg_labeled"

# Alternative dataset: sehoaoi_labeled (its File_NAME override is left disabled).
# File_NAME = "/sehoaoi_labeled.csv"
SEHOAOI_DATA_PATH= "./data/sehoaoi_labeled"

# Image sub-directory; it is concatenated onto a dataset path, followed by
# "<imageIndex>.png", when example images are displayed.
IMAGE_PATH = "/exactInspImage/"

In [None]:
# Read the labelled CSV of the Siemens dataset, then print a summary of its
# columns, dtypes and non-null counts.
df = pd.read_csv(f"{SIEMENS_DATA_PATH}{File_NAME}")

df.info()

In [None]:
# Keep only the image identifier and the label columns listed below.
data_to_keep = [
    "imageIndex",
    "typ",
    "errText",
    "errorClassCombined",
    "errorClassText",
    "is_good",
]
df = df.loc[:, data_to_keep]

df.info()

df.head()

## Clean All 'AOI Gut Pruefung' Data 

In [None]:
# Drop every row labelled "AOI Gut Pruefung". The column check keeps this cell
# from raising when 'errorClassText' is not part of the frame.
if 'errorClassText' in df.columns:
    df = df.loc[df['errorClassText'].ne('AOI Gut Pruefung')]

df.head()

## Create a new column from errorClassText

In [None]:
# Flag pseudo-defects: 'is_PseudoFehler' is 1 where 'errorClassText' equals
# "PseudoFehler" and 0 otherwise (stored as integers, not booleans).
# The comparison is vectorised, so no Python-level lambda runs per row.
df['is_PseudoFehler'] = df['errorClassText'].eq("PseudoFehler").astype("int64")

# Distribution of the new flag; being the last expression, this is what the
# cell renders.
PseudoFehler_counts = df['is_PseudoFehler'].value_counts()
PseudoFehler_counts


In [None]:
# Tally how many rows carry each value of the pseudo-defect flag and show it.
PseudoFehler_counts = df.loc[:, 'is_PseudoFehler'].value_counts()
PseudoFehler_counts

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

## Save the DataFrame to a CSV file

In [None]:
# Destination of the exported frame, relative to the working directory.
# NOTE(review): the frame was read via SIEMENS_DATA_PATH, yet this file name
# says "sehoaoi" — confirm this is the intended output name.
csv_file_path = './data/sehoaoi_data_original.csv'

# index=False leaves the DataFrame index out of the written file.
df.to_csv(csv_file_path, index=False)

print(f"DataFrame is saved to {csv_file_path}")

### Display Function

In [None]:
def display_image(file_name, title):
    """Show an image file inline with a title and without axes.

    Args:
        file_name: Path of the image to open with PIL.
        title: Text placed above the rendered image.

    Returns:
        None. Any failure while opening or drawing the image is reported on
        stdout instead of being raised, so one bad file does not abort a loop
        over many images.
    """
    try:
        # The context manager guarantees the file handle is released even if
        # drawing fails or the format keeps the file open after loading
        # (e.g. multi-frame images). imshow reads the pixel data immediately,
        # so the image may be closed before the figure is shown.
        with Image.open(file_name) as img:
            plt.imshow(img)
        plt.title(title)
        plt.axis('off')  # Turn off axis numbers and ticks
        plt.show()
    except Exception as e:
        # Deliberately broad: this is a best-effort viewer for exploration.
        print(f"An error occurred while opening {file_name}: {e}")

## Type of data 
### typ

In [None]:
# Distinct 'typ' values, then the number of rows carrying each of them.
unique_values = df.loc[:, 'typ'].unique()
print(df.loc[:, 'typ'].value_counts())

In [None]:
# Show one example image per distinct 'typ'.
# drop_duplicates keeps the first row of every value in a single pass. It also
# yields a row for missing values, whereas filtering with `== NaN` matches
# nothing and would make `.iloc[0]` raise IndexError.
first_rows = df.drop_duplicates(subset='typ')
for typ, image_file in zip(first_rows['typ'], first_rows['imageIndex']):
    # Build the path once so the printed path is exactly the file that is opened.
    image_path = f"{SIEMENS_DATA_PATH}{IMAGE_PATH}{image_file}.png"
    print(image_path)
    display_image(image_path, f"Type: {typ}")

## Type of error Text
### errText

In [None]:
# Distinct 'errText' values, then the number of rows carrying each of them.
unique_values = df.loc[:, 'errText'].unique()
print(df.loc[:, 'errText'].value_counts())

| German                   | English                    | Number |
|--------------------------|----------------------------|--------|
| Pin                      | Pin                        | 1621   |
| Meniskus                 | Meniscus                   | 1411   |
| Lötung                   | Soldering                  | 1110   |
| Lotperle                 | Solder bead                | 524    |
| Klemmträger verlötet     | Clamp carrier soldered     | 444    |
| no solder                | no solder                  | 89     |


In [None]:
# Show one example image per distinct 'errText'.
# drop_duplicates keeps the first row of every value in a single pass. It also
# yields a row for missing values, whereas filtering with `== NaN` matches
# nothing and would make `.iloc[0]` raise IndexError.
first_rows = df.drop_duplicates(subset='errText')
for errText, image_file in zip(first_rows['errText'], first_rows['imageIndex']):
    # Build the path once so the printed path is exactly the file that is opened.
    image_path = f"{SIEMENS_DATA_PATH}{IMAGE_PATH}{image_file}.png"
    print(image_path)
    display_image(image_path, f"Type: {errText}")

## Type of error Class
### errorClassCombined

In [None]:
# Distinct 'errorClassCombined' values, then the number of rows carrying each.
unique_values = df.loc[:, 'errorClassCombined'].unique()
print(df.loc[:, 'errorClassCombined'].value_counts())

| German                          | English                      | Number |
|---------------------------------|------------------------------|--------|
| good                            | good                         | 4264   |
| THT Bauteil fehlt               | THT component missing        | 281    |
| THT Bauteil liegt nicht auf     | THT component not seated     | 247    |
| THT Loetstelle offen            | THT soldering point open     | 211    |
| THT Loetbruecke                 | THT solder bridge            | 189    |
| THT Loetst.Form mangelhaft      | THT soldering form defective | 6      |
| THT Bauteil versetzt            | THT component offset         | 1      |


In [None]:
# Show one example image per distinct 'errorClassCombined'.
# drop_duplicates keeps the first row of every value in a single pass. It also
# yields a row for missing values, whereas filtering with `== NaN` matches
# nothing and would make `.iloc[0]` raise IndexError.
first_rows = df.drop_duplicates(subset='errorClassCombined')
for errorClassCombined, image_file in zip(
    first_rows['errorClassCombined'], first_rows['imageIndex']
):
    # Build the path once so the printed path is exactly the file that is opened.
    image_path = f"{SIEMENS_DATA_PATH}{IMAGE_PATH}{image_file}.png"
    print(image_path)
    display_image(image_path, f"Type: {errorClassCombined}")

## Type of errorClassText
### errorClassText

In [None]:
# Distinct 'errorClassText' values, then the number of rows carrying each.
unique_values = df.loc[:, 'errorClassText'].unique()
print(df.loc[:, 'errorClassText'].value_counts())