In [2]:
import pandas as pd
import re

In [93]:
trademark_codes = pd.read_csv('./data/csv/trademark_codes.csv') # 1586 rows x 2 columns: code + des
trademark_updated = pd.read_csv('./data/csv/trademark_updated.csv') # 158511 rows x 4 columns: serial No 
trademark = pd.read_csv('./data/csv/trademark.csv') # 158511 rows x 4 columns

In [94]:
# trademark_codes.head(5)
# trademark_updated.head(5)
# trademark_updated['Design Search Code(s)']
# trademark.loc[5, 'Design Search Code(s)']
trademark.head(10)

Unnamed: 0,serial No,Mark Literal Elements,Description of Mark,Design Search Code(s)
0,71000233,,,"02.03.04 - Women, Pilgrims, women in colonial..."
1,71002779,CAMPBELL'S,"The mark consists of a band, the upper half of...",26.11.10 - Rectangles divided once into two s...
2,71012280,AUNT JEMIMA,THE TRADE-MARK CONSISTS OF THE REPRESENTATION ...,"02.03.01 - Women - head, portraiture or busts..."
3,71083280,WHEN IT RAINS IT POURS,,01.15.07 - Snow in a storm; Sleet; Rain; Hail...
4,71124272,,,"02.03.13 - Women, other women wearing folk, h..."
5,71149311,SLEEP-ON,,"02.05.05 - Children, boy(s); Boys 02.05.06 -..."
6,71154587,EVERY BIT A DELIGHT,,26.01.02 - Plain single line circles; Circles...
7,71172379,ICE CREAM MAKERS' FRIEND,THE PICTURE SHOWN IN THE ILLUSTRATION IS FANCI...,"02.01.04 - Robes, men wearing; Shepherds; Rel..."
8,71173691,BRH,,"24.15.10 - More than one arrow; Arrows, more ..."
9,71190472,,,26.05.21 - Triangles that are completely or p...


In [95]:
trademark.loc[2, 'Design Search Code(s)']

'02.03.01 -  Women - head, portraiture or busts facing forward; Portraiture of women facing forward; Heads of women facing forward; Busts of women facing forward  02.03.15 -  Women wearing scarves on their heads; Scarves (women wearing on heads)'

In [None]:
def extract_codes_and_descriptions(text):
    if pd.isna(text):  # Handle NaN/None values
        return {}
        
    # Split by space + code pattern to separate multiple entries
    pattern = r'(\d{2}\.\d{2}\.\d{2})\s*-\s*([^0-9]+?)(?=\s+\d{2}\.\d{2}\.\d{2}|$)'
    matches = re.finditer(pattern, text.strip())
    
    codes_desc = {}
    for match in matches:
        code = match.group(1)
        description = match.group(2).strip()
        codes_desc[code] = description
    
    return codes_desc

# pretrain[pretrain['target_h2'].notna()]
extracted_tmp = extract_codes_and_descriptions(trademark.loc[2, 'Design Search Code(s)'])
# extracted_tmp = extract_codes_and_descriptions(trademark[trademark['serial No'] == "89002325"])
print(extracted_tmp)

{'02.03.01': 'Women - head, portraiture or busts facing forward; Portraiture of women facing forward; Heads of women facing forward; Busts of women facing forward', '02.03.15': 'Women wearing scarves on their heads; Scarves (women wearing on heads)'}


In [98]:
def categorize_codes_with_descriptions(codes_desc):
    h1_codes = {} 
    h2_codes = {}
    h3_codes = {}
    
    for code, desc in codes_desc.items():
        dot_count = code.count('.')
        if dot_count == 0:
            h1_codes[code] = desc
        elif dot_count == 1:
            h2_codes[code] = desc
        elif dot_count == 2:
            h3_codes[code] = desc
    
    return h1_codes, h2_codes, h3_codes

print(categorize_codes_with_descriptions(extracted_tmp))

({}, {}, {'02.03.01': 'Women - head, portraiture or busts facing forward; Portraiture of women facing forward; Heads of women facing forward; Busts of women facing forward', '02.03.15': 'Women wearing scarves on their heads; Scarves (women wearing on heads)'})


In [99]:
def process_trademark_file(input_df, output_file):
    df = input_df.copy()
    string_columns = ['serial No', 'Design Search Code(s)']
    for col in string_columns:
        df[col] = df[col].astype(str).str.strip()
    
    output_data = []
    
    for _, row in df.iterrows():
        serial_no = row['serial No']
        design_codes = row['Design Search Code(s)']
        
        codes_desc = extract_codes_and_descriptions(design_codes)
        h1_codes, h2_codes, h3_codes = categorize_codes_with_descriptions(codes_desc)
        
        output_row = {
            'image_name': f'{serial_no}.jpg',
            'target': ','.join(codes_desc.keys()),
            'target_description': design_codes,
            'target_h1': ','.join(sorted(h1_codes.keys())),
            'target_h1_description': ' | '.join([f"{code} - {desc}" for code, desc in sorted(h1_codes.items())]),
            'target_h2': ','.join(sorted(h2_codes.keys())),
            'target_h2_description': ' | '.join([f"{code} - {desc}" for code, desc in sorted(h2_codes.items())]),
            'target_h3': ','.join(sorted(h3_codes.keys())),
            'target_h3_description': ' | '.join([f"{code} - {desc}" for code, desc in sorted(h3_codes.items())])
        }
        
        output_data.append(output_row)
    
    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_file, index=False)
    
process_trademark_file(trademark, "./data/csv/pretrain.csv")

In [100]:
pretrain = pd.read_csv('./data/csv/pretrain.csv')
pretrain

Unnamed: 0,image_name,target,target_description,target_h1,target_h1_description,target_h2,target_h2_description,target_h3,target_h3_description
0,71000233.jpg,"02.03.04,02.03.14,02.03.21,02.09.06","02.03.04 - Women, Pilgrims, women in colonial...",,,,,"02.03.04,02.03.14,02.03.21,02.09.06","02.03.04 - Women, Pilgrims, women in colonial ..."
1,71002779.jpg,"26.11.10,26.11.21",26.11.10 - Rectangles divided once into two s...,,,,,"26.11.10,26.11.21",26.11.10 - Rectangles divided once into two se...
2,71012280.jpg,"02.03.01,02.03.15","02.03.01 - Women - head, portraiture or busts...",,,,,"02.03.01,02.03.15","02.03.01 - Women - head, portraiture or busts ..."
3,71083280.jpg,"01.15.07,02.05.04,02.09.06,10.03.03,19.07.25,2...",01.15.07 - Snow in a storm; Sleet; Rain; Hail...,,,,,"01.15.07,02.05.04,02.09.06,10.03.03,19.07.25,2...",01.15.07 - Snow in a storm; Sleet; Rain; Hail ...
4,71124272.jpg,"02.03.13,02.03.25,05.03.10,09.01.04","02.03.13 - Women, other women wearing folk, h...",,,,,"02.03.13,02.03.25,05.03.10,09.01.04","02.03.13 - Women, other women wearing folk, hi..."
...,...,...,...,...,...,...,...,...,...
158506,89002321.jpg,"05.05.05,05.05.25,26.11.01",05.05.05 - Daisies 05.05.25 - Other flowers...,,,,,"05.05.05,05.05.25,26.11.01",05.05.05 - Daisies | 05.05.25 - Other flowers;...
158507,89002322.jpg,"05.05.05,05.05.25",05.05.05 - Daisies 05.05.25 - Iris (flower...,,,,,"05.05.05,05.05.25",05.05.05 - Daisies | 05.05.25 - Iris (flower)...
158508,89002325.jpg,01.01.13,01.01.13 - Stars - multiple stars with five p...,,,,,01.01.13,01.01.13 - Stars - multiple stars with five po...
158509,89002386.jpg,"01.15.05,10.01.04,23.01.03,24.15.10,26.01.17",01.15.05 - Vapor; Steam; Smoke 10.01.04 - P...,,,,,"01.15.05,10.01.04,23.01.03,24.15.10,26.01.17",01.15.05 - Vapor; Steam; Smoke | 10.01.04 - Pi...


In [102]:
def fill_hierarchical_levels(df):
    # Create a copy of the input dataframe
    df_filled = df.copy()
    
    def extract_levels(target_str):
        if pd.isna(target_str):
            return [], [], []
            
        codes = target_str.split(',')
        h1_codes = set()
        h2_codes = set()
        h3_codes = codes  # h3 is same as target
        
        for code in codes:
            parts = code.split('.')
            if len(parts) >= 1:
                h1_codes.add(parts[0])
            if len(parts) >= 2:
                h2_codes.add('.'.join(parts[:2]))
                
        return (
            ','.join(sorted(h1_codes)),
            ','.join(sorted(h2_codes)),
            ','.join(sorted(h3_codes))
        )
    
    # Process each row
    for idx, row in df_filled.iterrows():
        if pd.isna(row['target_h1']) or pd.isna(row['target_h2']):  # Only fill if h1 or h2 is NaN
            h1, h2, h3 = extract_levels(row['target'])
            df_filled.at[idx, 'target_h1'] = h1
            df_filled.at[idx, 'target_h2'] = h2
            df_filled.at[idx, 'target_h3'] = h3
    
    return df_filled

# Use the function
pretrain_filled = fill_hierarchical_levels(pretrain)

# Save to new CSV
pretrain_filled.to_csv('./data/csv/pretrain_fill.csv', index=False)

  df_filled.at[idx, 'target_h1'] = h1
  df_filled.at[idx, 'target_h2'] = h2


In [103]:
pretrain_filled

Unnamed: 0,image_name,target,target_description,target_h1,target_h1_description,target_h2,target_h2_description,target_h3,target_h3_description
0,71000233.jpg,"02.03.04,02.03.14,02.03.21,02.09.06","02.03.04 - Women, Pilgrims, women in colonial...",02,,"02.03,02.09",,"02.03.04,02.03.14,02.03.21,02.09.06","02.03.04 - Women, Pilgrims, women in colonial ..."
1,71002779.jpg,"26.11.10,26.11.21",26.11.10 - Rectangles divided once into two s...,26,,26.11,,"26.11.10,26.11.21",26.11.10 - Rectangles divided once into two se...
2,71012280.jpg,"02.03.01,02.03.15","02.03.01 - Women - head, portraiture or busts...",02,,02.03,,"02.03.01,02.03.15","02.03.01 - Women - head, portraiture or busts ..."
3,71083280.jpg,"01.15.07,02.05.04,02.09.06,10.03.03,19.07.25,2...",01.15.07 - Snow in a storm; Sleet; Rain; Hail...,0102101926,,"01.15,02.05,02.09,10.03,19.07,26.11,26.15",,"01.15.07,02.05.04,02.09.06,10.03.03,19.07.25,2...",01.15.07 - Snow in a storm; Sleet; Rain; Hail ...
4,71124272.jpg,"02.03.13,02.03.25,05.03.10,09.01.04","02.03.13 - Women, other women wearing folk, h...",020509,,"02.03,05.03,09.01",,"02.03.13,02.03.25,05.03.10,09.01.04","02.03.13 - Women, other women wearing folk, hi..."
...,...,...,...,...,...,...,...,...,...
158506,89002321.jpg,"05.05.05,05.05.25,26.11.01",05.05.05 - Daisies 05.05.25 - Other flowers...,0526,,"05.05,26.11",,"05.05.05,05.05.25,26.11.01",05.05.05 - Daisies | 05.05.25 - Other flowers;...
158507,89002322.jpg,"05.05.05,05.05.25",05.05.05 - Daisies 05.05.25 - Iris (flower...,05,,05.05,,"05.05.05,05.05.25",05.05.05 - Daisies | 05.05.25 - Iris (flower)...
158508,89002325.jpg,01.01.13,01.01.13 - Stars - multiple stars with five p...,01,,01.01,,01.01.13,01.01.13 - Stars - multiple stars with five po...
158509,89002386.jpg,"01.15.05,10.01.04,23.01.03,24.15.10,26.01.17",01.15.05 - Vapor; Steam; Smoke 10.01.04 - P...,0110232426,,"01.15,10.01,23.01,24.15,26.01",,"01.15.05,10.01.04,23.01.03,24.15.10,26.01.17",01.15.05 - Vapor; Steam; Smoke | 10.01.04 - Pi...
