In [7]:
import pandas as pd

In [8]:
def clean_price(df):
    """Strip the pound sign from ``price`` and drop rows without a usable price.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``price`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame containing only the rows whose cleaned
        price is longer than one character, with ``price`` holding the
        cleaned text. The original index labels are kept.
    """
    # regex=False: "£" is a literal character, not a pattern.
    price = df["price"].str.replace("£", "", regex=False).str.strip()

    # NOTE(review): the "> 1" threshold drops empty/NaN prices but also
    # one-character prices such as "5" — confirm that is intended.
    keep = price.str.len() > 1

    # Filter first, then assign: ``assign`` works on a copy, so the caller's
    # frame is untouched and later column writes on the result do not
    # trigger SettingWithCopyWarning.
    return df.loc[keep].assign(price=price.loc[keep])

In [9]:
def fix_product_img_url(df):
    df["product_img_url"] = (
        "https://www.medema.co.uk"
        + df["product_img_url"].str.split("|", expand=True)[0].str[1:]
    )
    return df

In [10]:
def dtype_to_str(df):
    """Return a copy of ``df`` with missing values blanked and all columns as str.

    Missing values are replaced with the empty string *before* the cast so
    they do not turn into the literal text ``"nan"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Any frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape where every column has been cast
        with ``astype(str)``.
    """
    # Non-inplace on purpose: mutating the argument surprises callers and
    # raises SettingWithCopyWarning when ``df`` is a filtered slice.
    return df.fillna("").astype(str)

In [11]:
def transform_data(input_path="raw_data.csv", output_path="output.csv"):
    """Read the raw product CSV, clean it, and write the export CSV.

    Steps: load ``input_path`` as text, blank out missing values
    (``dtype_to_str``), clean the price column (``clean_price``), build
    absolute image URLs (``fix_product_img_url``), keep only the export
    columns, drop duplicate rows, and write ``output_path`` without the
    index.

    Parameters
    ----------
    input_path : str, optional
        CSV file to read. Defaults to ``"raw_data.csv"``.
    output_path : str, optional
        CSV file to write. Defaults to ``"output.csv"``.

    Raises
    ------
    KeyError
        If any of the export columns is absent from the input.
    """
    export_columns = [
        "product_code",
        "title",
        "product_description",
        "currency",
        "price",
        "product_quantity_raw",
        "price_per_product_quantity_raw",
        "tax_details",
        "additional_manufacturer",
        "manufacturers",
        "product_info_url",
        "country",
        "product_img_url",
        "availability",
    ]

    # dtype=str keeps every cell exactly as written in the file. Letting
    # pandas infer dtypes first would strip leading zeros and turn integer
    # codes in a column with gaps into text like "123.0".
    raw = pd.read_csv(input_path, dtype=str)

    # Every step below returns strings, so a single dtype_to_str pass up
    # front is sufficient.
    cleaned = (
        dtype_to_str(raw)
        .pipe(clean_price)
        .pipe(fix_product_img_url)
    )

    # Select the export columns *before* de-duplicating: rows that differ
    # only in non-exported columns would otherwise appear as duplicates in
    # the output file.
    exported = cleaned[export_columns].drop_duplicates()
    exported.to_csv(output_path, index=False)

In [12]:

# Run the whole pipeline: reads "raw_data.csv" and writes "output.csv"
# in the current working directory.
transform_data()