# Data wrangling<a id='2_Data_wrangling'></a>

## 2.2 Import Functions<a id='2.2_Introduction'></a>

In [4]:
import numpy as np
import pandas as pd
import os
import re

In [23]:
#import the file and read all lines as strings
def import_dat(batch_file, rd_path):
    """
    Function for importing .dat files. Each line is read in as a string and broken up based on spaces between entries.
    It is expected that the first chunk of every line has the form '<chemical code>;<concentration>' and that every
    following chunk has the form '<feature index>:<value>'. Blank lines are skipped.
    Column names are manually created and entered here, so it only works for the
    Gas Sensor Array Drift Dataset at Different Concentrations Data Set (16 sensors x 8 features per row).

    Inputs:
        batch_file - (str) filename of the specific batch data; the first run of digits in it is used as the batch number
        rd_path - (str) folder location of where the batch file is located
    Outputs:
        batch_data - (DataFrame) batch file data in an easier to read format: the 128 sensor feature columns,
                     followed by 'ChemicalCode' (str), 'Concentration' (float) and 'BatchNumber' (str)
    Raises:
        ValueError - if a chunk cannot be parsed as a number, or if batch_file contains no digits
    """
    fd = [] # initialize a list for the formatted data
    # os.path.join picks the separator of the current OS (a hard-coded '\\' only works on Windows)
    with open(os.path.join(rd_path, batch_file), 'r') as file:
        for line in file: # stream the file line by line instead of loading it all at once
            row_list = line.split() #split the line up to get the column values for a given row
            if not row_list:
                continue # blank line (e.g. trailing newline at the end of the file): nothing to parse

            # extract the predictor variable values: keep only what follows the first ':' of each chunk
            predict_vals = [float(chunk.partition(':')[2]) for chunk in row_list[1:]]
            # extract the two target variables, chemical number & concentration, from '<code>;<concentration>'
            chem_code, _, concentration = row_list[0].partition(';')
            fd.append(predict_vals + [chem_code, float(concentration)]) #add the row to the formatted data list

    col_names_pat = ['DR', '|DR|', 'EMAi0.001', 'EMAi0.01', 'EMAi0.1', 'EMAd0.001', 'EMAd0.01', 'EMAd0.1'] #variable name pattern for each sensor
    col_names = [c + '_' + str(n) for n in range(1, 17) for c in col_names_pat] #use the pattern to create column names for all 16 sensors
    col_names = col_names + ['ChemicalCode', 'Concentration'] #add the two target variable names
    batch_data = pd.DataFrame(fd, columns=col_names)

    # add the batch number as a feature; a scalar is broadcast to every row, so the
    # assignment can never fail on a length mismatch, whatever the file name looks like
    batch_match = re.search(r'\d+', batch_file)
    if batch_match is None:
        raise ValueError('no batch number found in file name {!r}'.format(batch_file))
    batch_data['BatchNumber'] = batch_match.group()

    return batch_data


In [22]:
# Derive the raw data folder from the current working directory: every occurrence of
# the text 'notebooks' in that path is replaced by 'raw_data'.
# NOTE(review): presumably the code is run from a 'notebooks' folder that sits next to
# a 'raw_data' folder - confirm the project layout.
base_fpath = os.getcwd() #the file path to the working directory of the code
rd_path = base_fpath.replace('notebooks', 'raw_data') #raw data file path

# Parse every entry found in the raw data folder with import_dat.
# batch_data is rebound on each iteration, so after the loop it only holds the
# DataFrame of the last entry listed (and is never defined if the folder is empty).
# NOTE(review): if all batches are needed together, the frames have to be collected - confirm intent.
raw_files_list = os.listdir(rd_path)
for rd_file in raw_files_list:
    batch_data = import_dat(rd_file, rd_path)

### 2.2.2 Introduction To Notebook<a id='2.2.2_Introduction_To_Notebook'></a>

#### 2.6.3.1 Unique Resort Names<a id='2.6.3.1_Unique_Resort_Names'></a>