Processing tasks:

- Split the name field into first_name, and last_name
- Remove any zeros prepended to the price field
- Delete any rows which do not have a name
- Create a new field named above_100, which is true if the price is strictly greater than 100

In [1]:
import pandas as pd

In [2]:
dataset1 = pd.read_csv('/data/dataset1.csv')
dataset2 = pd.read_csv('/data/dataset2.csv')

In [3]:
import os

In [4]:
dataset1

Unnamed: 0,name,price
0,William Dixon,109.037280
1,Kristen Horn,262.524652
2,Kimberly Chang,187.007258
3,Mary Ball,283.174648
4,Benjamin Craig,143.871582
...,...,...
4995,Shirley Nguyen,9.011665
4996,Jesse Brown,247.328232
4997,Valerie Owens,238.103714
4998,Alicia Sharp,243.622929


In [5]:
dataset2["price"].iloc[0]

'258.1809089'

## Approach:
1. drop na value from name column and remove empty string rows from name column 
2. use helper functions get_first_last_name and rm_preprended_zeros_price for getting the firstname and lastname, and remove prepended zero prices respectively
3. Find price > 100 and put into a new column
4. save to location inside folder /data/completed/ 

In [6]:
def get_first_last_name(_name):
    """
    Assume all english, separated by white space , get first name and last_name
    
    Params:
    -------
    _name: string 
    
    Returns:
    --------
    a list, representing the [first name, last_name ]
    """
    return _name.split(" ")

In [7]:
def rm_preprended_zeros_price(_price):
    """
    remove whitespaces prepended to price 
    
    Params:
    -------
    _price: obj , assumed either int/float, string
    
    Returns:
    --------
    edited string with whitespaces removed, or float as it is  
    """
    if type(_price) == str:
        return _price.lstrip()
    else:
        return _price


In [8]:
def data_preprocess(fp):
    """
    Given absolute fp for the dataset to be processed,
    Run the processing tasks by:
        1. Delete any rows which do not have a name - drop rows where column name is empty
        2. split name field to first_name and last_name - use helper function - get_first_last_name
        3. rm any zeros prepended to the price field - use helper function - rm_preprended_zeros_price
        4. create new col if price > 100
    
    Params:
    -------
    fp: absolute file path
    
    Returns:
    --------
    dataframe or string 
    """
    # read in the file 
    if os.path.isfile(fp):

        dataset = pd.read_csv(fp)
        # step 1: drop na + rm empty string
        dataset_nona = dataset.dropna(axis = 0, subset = ["name"])
        dataset_nona_no_emp_str = dataset_nona[dataset_nona["name"] != ""]

        # step 2:
        dataset_nona_no_emp_str["name"] = dataset_nona_no_emp_str["name"].apply(lambda x: get_first_last_name(x))

        #step 3:
        dataset_nona_no_emp_str["price"] = dataset_nona_no_emp_str["price"].apply(lambda x: rm_preprended_zeros_price(x))

        #step 4:
        dataset_nona_no_emp_str["above_100"] = dataset_nona_no_emp_str["price"].apply(lambda x: True if float(x) > 100 else False)
      
        return dataset_nona_no_emp_str
    else:
        return "file does not exist!"
    

In [9]:
data_preprocess('/data/dataset1.csv')

Unnamed: 0,name,price,above_100
0,"[William, Dixon]",109.037280,True
1,"[Kristen, Horn]",262.524652,True
2,"[Kimberly, Chang]",187.007258,True
3,"[Mary, Ball]",283.174648,True
4,"[Benjamin, Craig]",143.871582,True
...,...,...,...
4995,"[Shirley, Nguyen]",9.011665,False
4996,"[Jesse, Brown]",247.328232,True
4997,"[Valerie, Owens]",238.103714,True
4998,"[Alicia, Sharp]",243.622929,True


In [10]:
data_preprocess('/data/dataset2.csv')

Unnamed: 0,name,price,above_100
0,"[William, Garcia]",258.1809089,True
1,"[Barbara, Freeman]",141.890534,True
2,"[Rebecca, Zimmerman]",293.373272,True
3,"[Patricia, Velasquez]",249.9479246,True
4,"[Ronnie, Clark]",272.908659,True
...,...,...,...
4995,"[Rachel, Davis]",95.25395533,False
4996,"[Connie, Hamilton]",205.3966853,True
4997,"[Sean, Kaiser]",80.54737145,False
4998,"[Ebony, Rodriguez]",93.20561606,False


### check the output after processing 

In [11]:
dataset2_completed = pd.read_csv('/data/completed/dataset2.csv')

In [12]:
dataset2_completed

Unnamed: 0.1,Unnamed: 0,name,price,above_100
0,0,"['William', 'Garcia']",258.180909,True
1,1,"['Barbara', 'Freeman']",141.890534,True
2,2,"['Rebecca', 'Zimmerman']",293.373272,True
3,3,"['Patricia', 'Velasquez']",249.947925,True
4,4,"['Ronnie', 'Clark']",272.908659,True
...,...,...,...,...
4956,4995,"['Rachel', 'Davis']",95.253955,False
4957,4996,"['Connie', 'Hamilton']",205.396685,True
4958,4997,"['Sean', 'Kaiser']",80.547371,False
4959,4998,"['Ebony', 'Rodriguez']",93.205616,False


In [13]:
dataset1_completed = pd.read_csv('/data/completed/dataset1.csv')

In [14]:
dataset1_completed

Unnamed: 0.1,Unnamed: 0,name,price,above_100
0,0,"['William', 'Dixon']",109.037280,True
1,1,"['Kristen', 'Horn']",262.524652,True
2,2,"['Kimberly', 'Chang']",187.007258,True
3,3,"['Mary', 'Ball']",283.174648,True
4,4,"['Benjamin', 'Craig']",143.871582,True
...,...,...,...,...
4995,4995,"['Shirley', 'Nguyen']",9.011665,False
4996,4996,"['Jesse', 'Brown']",247.328232,True
4997,4997,"['Valerie', 'Owens']",238.103714,True
4998,4998,"['Alicia', 'Sharp']",243.622929,True
