In [None]:
# the packages
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import os
import re
from dotenv import load_dotenv

In [None]:
# Show every column and every row, and never truncate cell text, when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection string read from MYSQL_PROD_CON; a missing variable raises KeyError.
# It is deliberately NOT echoed as cell output: such strings typically embed
# credentials, and cell outputs are saved inside the notebook file.
db_access = os.environ['MYSQL_PROD_CON']

## Load the Data

In [None]:
# Build the SQLAlchemy engine from the connection string in `db_access`;
# `con_db` is what the query cell hands to pandas as its connection.
con_db = create_engine(db_access)

In [None]:
# set the outlet: the value is compared against outlets.name in the query below
name = "Timurasa Indonesia"
# Build the query text (it is executed in a later cell): non-deleted products of
# that outlet whose media row still holds "default-product-pict.jpg".
# NOTE(review): `name` is interpolated straight into the SQL text, so a value
# containing a double quote would break the statement.
# NOTE(review): the WHERE filter on pm.media discards the NULL rows a LEFT JOIN
# would add, so the join effectively behaves as an inner join.
# NOTE(review): GROUP BY p.id with non-aggregated pm columns presumably relies on
# MySQL running without ONLY_FULL_GROUP_BY - confirm on the target server.
q = f"""
SELECT o.name as outlet, 
       p.id,
       p.name as product,
       pm.id AS media_id,
       pm.media AS media_name
FROM products p
JOIN outlets o ON o.id = p.outlet_id
LEFT JOIN product_medias pm ON pm.product_id = p.id
WHERE o.name = "{name}" and p.is_deleted = 0 and pm.media = "default-product-pict.jpg"
group by p.id
"""

In [None]:
# Run the query through the engine and keep the rows in `df_mp`,
# then print the column / dtype / non-null summary as a quick sanity check.
df_mp = pd.read_sql(q, con=con_db)
df_mp.info()

In [None]:
# Optional: load a previously exported CSV instead of querying the database.
# NOTE: this rebinds `df_mp`, so when it runs it replaces the frame loaded by the query cell.
df_mp = pd.read_csv("data/bestmeatshop-images-null-info.csv")
df_mp.info()

## Add media name into the dataframe

In [None]:
# The outlet value becomes the prefix of every generated media name, so it must be
# spelled exactly like the prefix of the image file names; if it is not, overwrite
# the whole outlet column here.
df_mp["outlet"] = "bestmeat"

In [None]:
# file-name conversion function
def convertmedianame(name, outlet):
    """Build the image file name expected for a product.

    Args:
        name: product name; every "/" and "'" is dropped from it.
        outlet: outlet name; every run of underscores or of characters that are
            neither whitespace nor word characters is dropped from it.

    Returns:
        The string "<cleaned outlet>_<cleaned name>.jpg".
    """
    outlet_prefix = re.sub(r'([^\s\w]|_)+', '', outlet)
    # strip both unwanted characters from the product name in a single pass
    product_part = name.translate(str.maketrans("", "", "/'"))
    return "".join((outlet_prefix, "_", product_part, ".jpg"))

In [None]:
# Derive the expected image file name of every product with the conversion function.
# A plain comprehension over the two columns is used instead of a row-wise
# DataFrame.apply: on an empty frame apply(axis=1) hands back a DataFrame, which
# cannot be assigned to a single column, whereas an empty list can.
df_mp['media_name'] = [
    convertmedianame(product, outlet)
    for product, outlet in zip(df_mp['product'], df_mp['outlet'])
]

In [None]:
# Order the rows by media name; `df_mp` is rebound to the sorted copy.
df_mp = df_mp.sort_values(by=["media_name"])

In [None]:
# Display the frame to inspect the generated media names.
df_mp

## Load Filename List

In [None]:
# Suffix of the image folder: images are read from data/images_<name>.
# NOTE: this rebinds `name`, which the query cell used for the outlet name; the
# output file written at the end of the notebook is named after this value too.
name = "bestmeat"
path = f"data/images_{name}"

In [None]:
# Count the files sitting directly in the image folder.
# os.walk yields nothing for a missing folder, which would surface as a bare
# StopIteration from next(); fail with an explicit message instead.
if not os.path.isdir(path):
    raise FileNotFoundError(f"Image folder not found: {path}")
# Only the first (top-level) entry of the walk is consumed here.
path, dirs, files = next(os.walk(path))
file_count = len(files)
print(file_count)

In [None]:
# Empty frame with a single `media_name` column; it is filled with the image
# file names in a later cell.
df_il = pd.DataFrame(columns=["media_name"])

In [None]:
# store the media name of images to dataframe

In [None]:
# list that will hold the file names found in the image folder
media_name = []

In [None]:
# Collect every file name found under `path` (sub-folders included).
# The list is reset here so that re-running this cell does not append the same
# names a second time.
media_name = []
for root, dirs, files in os.walk(path):
    media_name.extend(files)

In [None]:
# Put the collected file names into the frame and order the rows by media name.
df_il = (
    df_il
    .assign(media_name=media_name)
    .sort_values(by=["media_name"])
)

## Check image-data Availability

In [None]:
# products whose expected image file is absent from the folder listing
df_pnm = df_mp[~df_mp["media_name"].isin(df_il["media_name"])]
len(df_pnm)

In [None]:
# Display the products that have no matching image file.
df_pnm

In [None]:
# image files whose name matches no product's expected media name
df_mnp = df_il[~df_il["media_name"].isin(df_mp["media_name"])]
len(df_mnp)

In [None]:
# Display the image files that match no product.
df_mnp

## Additional process

In [None]:
# uncomment this cell if you need to export the products still missing images (e.g. to collect links for them)
# df_pnm["media_name"].to_csv(f"data/product_images_{name}_2.csv", index=False)

In [None]:
# products whose expected image file is present in the folder listing
df_pnm2 = df_mp[df_mp["media_name"].isin(df_il["media_name"])]
len(df_pnm2)

In [None]:
# Remove from disk every image that matched no product (only run when needed:
# the deletion cannot be undone).
for orphan_file in df_mnp["media_name"]:
    orphan_path = os.path.join(path, orphan_file)
    os.remove(orphan_path)

In [None]:
# rows whose product name already appeared earlier in the frame
is_repeated_product = df_pnm2.duplicated(subset="product")
df_test = df_pnm2.loc[is_repeated_product]
df_test

In [None]:
# drop the duplicated data if needed
# df_pnm2 = df_pnm2.drop_duplicates(subset="product", keep="first")

## Create query for images media

In [None]:
# check the updated data in detail
# df_pnm2

In [None]:
# Build one UPDATE statement per product that has an image: the media row with
# the given id gets the generated file name.
dt_sql = [
    "UPDATE food.product_medias SET media = '{}' WHERE id = {};".format(media_file, media_id)
    for media_file, media_id in zip(df_pnm2['media_name'], df_pnm2['media_id'])
]

In [None]:
# Store the queries to a text file, one statement per line.
df_sql = pd.DataFrame()
df_sql['query'] = dt_sql
# np.savetxt encodes with latin1 unless told otherwise, which raises
# UnicodeEncodeError as soon as a media name contains a character outside that
# charset; write UTF-8 explicitly.
np.savetxt(f"data/update_images_{name}.txt", df_sql.values, fmt="%s", encoding="utf-8")