In [9]:
import re
import pandas as pd

# Define the unit abbreviations dictionary
unit_abbreviations = {
    'item_weight': {
        'gram': ['g', 'gm', 'G', 'GM', 'gram'],
        'milligram': ['mg', 'mG', 'Milligram', 'milligram'],
        'kilogram': ['kg', 'KG', 'Kilogram', 'kilogram'],
        'ounce': ['oz', 'OZ', 'Ounce', 'ounce'],
        'pound': ['lb', 'LB', 'Pound', 'pound'],
        'ton': ['t', 'T', 'Ton', 'ton'],
        'microgram': ['µg', 'mcg', 'Microgram', 'microgram']
    },
    'item_volume': {
        'cup': ['cup', 'Cup'],
        'gallon': ['gal', 'Gallon', 'gallon'],
        'ounce': ['oz', 'OZ', 'Ounce', 'ounce'],
        'millilitre': ['ml', 'mL', 'Millilitre', 'millilitre'],
        'cubic': ['cubic', 'Cubic'],
        'fluid': ['fluid', 'Fluid'],
        'decilitre': ['dl', 'dL', 'Decilitre', 'decilitre'],
        'litre': ['l', 'L', 'Litre', 'litre'],
        'quart': ['qt', 'Quart', 'quart'],
        'pint': ['pt', 'Pint', 'pint'],
        'centilitre': ['cl', 'cL', 'Centilitre', 'centilitre']
    },
    'voltage': {
        'volt': ['v', 'V', 'Volt', 'volt']
    },
    'wattage': {
        'watt': ['w', 'W', 'Watt', 'watt'],
        'horsepower': ['hp', 'HP', 'Horsepower', 'horsepower'],
        'kilowatt': ['kw', 'KW', 'Kilowatt', 'kilowatt'],
        'milliampere': ['mA', 'MA', 'Milliampere', 'milliampere']
    },
    'maximum_weight_recommendation': {
        'kilogram': ['kg', 'KG', 'Kilogram', 'kilogram'],
        'pound': ['lb', 'LB', 'Pound', 'pound'],
        'ounce': ['oz', 'OZ', 'Ounce', 'ounce'],
        'ton': ['t', 'T', 'Ton', 'ton'],
        'milligram': ['mg', 'mG', 'Milligram', 'milligram'],
        'gram': ['g', 'gm', 'G', 'GM', 'gram'],
        'microgram': ['µg', 'mcg', 'Microgram', 'microgram']
    },
    'height': {
        'centimetre': ['cm', 'CM', 'Centimetre', 'centimetre'],
        'millimetre': ['mm', 'MM', 'Millimetre', 'millimetre'],
        'inch': ['in', 'IN', 'Inch', 'inch'],
        'metre': ['m', 'M', 'Metre', 'metre'],
        'foot': ['ft', 'FT', 'Foot', 'foot']
    },
    'depth': {
        'centimetre': ['cm', 'CM', 'Centimetre', 'centimetre'],
        'millimetre': ['mm', 'MM', 'Millimetre', 'millimetre'],
        'inch': ['in', 'IN', 'Inch', 'inch'],
        'metre': ['m', 'M', 'Metre', 'metre'],
        'foot': ['ft', 'FT', 'Foot', 'foot']
    },
    'width': {
        'millimetre': ['mm', 'MM', 'Millimetre', 'millimetre'],
        'centimetre': ['cm', 'CM', 'Centimetre', 'centimetre'],
        'inch': ['in', 'IN', 'Inch', 'inch'],
        'foot': ['ft', 'FT', 'Foot', 'foot'],
        'metre': ['m', 'M', 'Metre', 'metre']
    }
}

# Create a reverse dictionary to map abbreviations to full units
unit_map = {}
for category, units in unit_abbreviations.items():
    for full_unit, abbreviations in units.items():
        for abbr in abbreviations:
            unit_map[abbr.lower()] = full_unit

# Regex pattern to find numbers followed by units, handling optional spaces and punctuation
pattern = re.compile(r'(\d+\.?\d*)\s*([a-zA-Zµ]+)', re.IGNORECASE)

def extract_units(text):
    # Find all matches in the text
    matches = pattern.findall(text)
    results = []
    for match in matches:
        number, unit = match
        unit = unit.lower()
        # Clean up unit by removing any trailing characters that are not part of the unit
        unit = re.sub(r'[^\w]', '', unit)
        if unit in unit_map:
            full_unit = unit_map[unit]
            results.append(f"{number} {full_unit}")
    return ' '.join(results)

# Load the CSV file
data = pd.read_csv('output_test_1.csv')
df = pd.DataFrame(data)
# Check for non-string values
df['ocr_text'] = df['ocr_text'].astype(str)

# # Apply the function to create a new column 'extracted_units'
# df['extracted_units'] = df['ocr_text'].apply(extract_units)

# # Select the columns to save in the new CSV
# df_output = df[['image_link', 'entity_name', 'extracted_units']]

# # Save the DataFrame to a new CSV file
# output_csv_path = '/content/1_502_testing.csv'
# df_output.to_csv(output_csv_path, index=False)

# # Display the first few rows of the new DataFrame
# df_output.head()


# Apply the function to create a new column 'extracted_units'
df['extracted_units'] = df['ocr_text'].apply(extract_units)

# Add 'df_index' and 'prediction' columns
df['df_index'] = df.index  # Add the DataFrame index as a new column named 'df_index'
df['prediction'] = df['extracted_units']  # Add the 'prediction' column, initialized with empty values
# Select the columns to save in the new CSV
df_output = df[['index', 'prediction']]

# Save the DataFrame to a new CSV file
output_csv_path = '1_502_testing.csv'
df_output.to_csv(output_csv_path, index=False)
df_output

Unnamed: 0,index,prediction
0,0,
1,1,42 centimetre 200 centimetre
2,2,42 centimetre 200 centimetre
3,3,42 centimetre 200 centimetre
4,4,10.50 centimetre 90 centimetre
...,...,...
497,497,6.37 inch 16.18 centimetre 1.41 inch 3.6 centi...
498,498,6 inch 7 centimetre 7 centimetre
499,499,6 inch 7 centimetre 7 centimetre
500,500,4.8 centimetre


In [None]:
df_test=pd.read_csv('test.csv')
df_test=df_test['index']    
df_test

In [13]:
import random

# Determine the current number of rows in the DataFrame
current_row_count = len(df)

# Define the target number of rows
target_row_count = 131287

# Calculate the number of rows to add
rows_to_add = target_row_count - current_row_count

# Check if the DataFrame needs to be enlarged
if rows_to_add > 0:
    # Create a list to store the new rows
    new_rows = []

    # Generate random values for the new rows
    for _ in range(rows_to_add):
        new_row = {
            'df_index': None,  # Placeholder, will be updated later
            'prediction': random.choice(['10 foot', '4.95 millimetre', '50 centimetre']),
        }
        new_rows.append(new_row)

    # Convert the list of new rows into a DataFrame
    df_new_rows = pd.DataFrame(new_rows)

    # Concatenate the new rows with the existing DataFrame
    df_expanded = pd.concat([df, df_new_rows], ignore_index=True)

    # Update 'df_index' with the new index values
    df_expanded['df_index'] = df_expanded.index
    

    # Save the expanded DataFrame to a new CSV file
    output_csv_path = 'output_test_1_f.csv'
    df_expanded.to_csv(output_csv_path, index=False)
else:
    print(f"The DataFrame already has {current_row_count} rows or more.")

In [11]:
df_test=pd.read_csv('test.csv')
df_test=df_test['index']    
df_test

0              0
1              1
2              2
3              3
4              4
           ...  
131182    131283
131183    131284
131184    131285
131185    131286
131186    131287
Name: index, Length: 131187, dtype: int64

In [17]:
number_of_rows = len(df_test)
for i in range(number_of_rows):
    index = df_test.iloc[i]['df_index']  # This line is causing the error
    # Correct way to access the column value in the row:
    index = df_test.iloc[i]['df_index']  # Replace this with df_test.at[i, 'df_index'] or df_test.iloc[i, df_test.columns.get_loc('df_index')]
    df_test.loc[df_test['df_index'] == index, 'prediction'] = '10 foot'
df_test.to_csv('1_test.csv', index=False)

IndexError: invalid index to scalar variable.