In [None]:
import pandas as pd
import numpy as np
# Load the merged train/test CSVs and an earlier submission file.
# NOTE(review): absolute "/content/..." paths — presumably a hosted runtime; adjust when running elsewhere.
train = pd.read_csv("/content/train_merged.csv")
test = pd.read_csv("/content/test_merged.csv")
# `sub` carries earlier predictions; its missing `prediction` values are filled in the KNN section.
sub = pd.read_csv("/content/submission_1.csv")

In [None]:
# Show rows 10-19 of the earlier submission to see filled and blank predictions side by side.
sub.iloc[10:20]

Unnamed: 0,index,prediction
10,10,
11,11,30.6 centimetre
12,12,4.3 inch
13,13,4.3 inch
14,14,4.3 inch
15,15,
16,16,
17,17,
18,18,
19,19,240.0 volt


### EDA

In [None]:
# Preview the first rows of the merged test table.
test.head()

Unnamed: 0.1,Unnamed: 0,index,image_link,group_id,entity_name,extracted_text
0,0,0,110EibNyclL.jpg,156839,height,2.63in 6.68cm 91.44cm - 199.39cm 36in - 78in
1,1,1,11TU2clswzL.jpg,792578,width,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
2,2,2,11TU2clswzL.jpg,792578,height,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
3,3,3,11TU2clswzL.jpg,792578,depth,"Size Width Length One Size 42cm/16.54"" 200cm/7..."
4,4,4,11gHj8dhhrL.jpg,792578,depth,"Size Width Length One Size 10.50cm/4.13"" 90cm/..."


In [None]:
# (rows, columns) of the merged test table.
test.shape

(131187, 6)

In [None]:
# Count missing values per column of the test table.
test.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
index,0
image_link,0
group_id,0
entity_name,0
extracted_text,1265


In [None]:
# (rows, columns) of the earlier submission.
sub.shape

(131187, 2)

In [None]:
# Count missing values per column of the earlier submission.
sub.isnull().sum()

Unnamed: 0,0
index,0
prediction,39764


### Preprocessing

In [None]:
# `df` is an alias of `test`, not a copy: the `extracted_text` assignment at the
# end of this cell therefore rewrites the column in `test` as well.
df = test
import re

def build_pattern(units):
    """Return the regex source matching ``<number><optional space><unit>``.

    Group 1 captures the number (digits with an optional decimal part) and
    group 2 captures the unit text. Each unit is passed through ``re.escape``,
    so it is matched literally. Units are placed in the alternation longest
    first (ties keep their input order) so that e.g. ``kg`` is tried before
    ``g``, and the pattern ends with a word-boundary assertion.
    """
    longest_first = sorted(units, key=len, reverse=True)
    alternation = '|'.join(map(re.escape, longest_first))
    return r"(\d+\.?\d*)\s?(" + alternation + r")\b"

# Define units per entity.
# Every entry is matched literally: build_pattern() runs each unit through
# re.escape(), so regex syntax must not be embedded in these strings.
weight_units = ['milligram', 'mg', 'miligram', 'milligrams', 'miligrammes',
                'gram', 'g', 'grams',
                'kilogram', 'kg',
                'microgram', 'µg',
                'ounce', 'oz',
                'pound', 'lb',
                'ton']

length_units = ['millimetre', 'mm', 'millimeter',
                'centimetre', 'cm', 'centimeter',
                # Plain 'm' (not a lookahead fragment, which re.escape() would turn
                # into literal text that never matches). build_pattern() tries longer
                # units first and ends the pattern with \b, so 'mm' is never read as 'm'.
                'metre', 'meter', 'm',
                'foot', 'ft',
                'inch', 'in',
                'yard', 'yd']

voltage_units = ['kilovolt', 'kV', 'kv',
                 'millivolt', 'mV', 'mv',
                 'volt', 'V', 'v']

wattage_units = ['kilowatt', 'kW', 'kw', 'watt', 'W', 'w']

volume_units = ['centilitre', 'cl',
                'cubic foot', 'ft³',
                'cubic inch', 'in³',
                'cup',
                'decilitre', 'dl',
                'fluid ounce', 'fl oz',
                'gallon', 'imperial gallon',
                'litre', 'liter',
                'millilitre', 'ml', 'milliliter',
                'microlitre', 'microliter',
                'pint', 'quart']

# Extraction pattern per entity name; the three dimensions share the length
# units and both weight entities share the weight units.
patterns = {
    'item_weight': build_pattern(weight_units),
    'depth': build_pattern(length_units),
    'width': build_pattern(length_units),
    'height': build_pattern(length_units),
    'voltage': build_pattern(voltage_units),
    'wattage': build_pattern(wattage_units),
    'item_volume': build_pattern(volume_units),
    'maximum_weight_recommendation': build_pattern(weight_units)
}

# Canonical unit name -> every spelling/abbreviation that normalises to it.
# All spellings are lowercase; the flat lookup table is derived below.
_unit_spellings = {
    # weight units
    'milligram': ['mg', 'milligram', 'miligram', 'milligrams', 'miligrammes'],
    'gram': ['g', 'gram', 'grams'],
    'kilogram': ['kg', 'kilogram'],
    'microgram': ['µg', 'microgram'],
    'ounce': ['oz', 'ounce'],
    'pound': ['lb', 'pound'],
    'ton': ['ton'],

    # length units
    'centimetre': ['cm', 'centimetre', 'centimeter'],
    'millimetre': ['mm', 'millimetre', 'millimeter'],
    'metre': ['m', 'metre', 'meter'],
    'foot': ['foot', 'ft'],
    'inch': ['inch', 'in'],
    'yard': ['yard', 'yd'],

    # voltage units
    'kilovolt': ['kv', 'kilovolt'],
    'millivolt': ['mv', 'millivolt'],
    'volt': ['v', 'volt'],

    # wattage units
    'kilowatt': ['kw', 'kilowatt'],
    'watt': ['w', 'watt'],

    # volume units
    'centilitre': ['cl', 'centilitre'],
    'decilitre': ['dl', 'decilitre'],
    'cubic foot': ['ft³', 'cubic foot'],
    'cubic inch': ['in³', 'cubic inch'],
    'cup': ['cup'],
    'fluid ounce': ['fluid ounce', 'fl oz'],
    'gallon': ['gallon'],
    'imperial gallon': ['imperial gallon'],
    'litre': ['litre', 'liter'],
    'millilitre': ['ml', 'millilitre', 'milliliter'],
    'microlitre': ['microlitre', 'microliter'],
    'pint': ['pint'],
    'quart': ['quart'],
}

# Flat spelling -> canonical name table used to normalise matched units.
unit_mappings = {
    spelling: canonical
    for canonical, spellings in _unit_spellings.items()
    for spelling in spellings
}

def extract_quantity(row):
    """Return the first ``"<value> <unit>"`` found in a row's text, or ``''``.

    Reads ``row['extracted_text']`` and ``row['entity_name']``. The regex
    registered in ``patterns`` for that entity is searched case-insensitively;
    the matched number is rendered as a float and the matched unit is
    lower-cased and normalised through ``unit_mappings`` (falling back to the
    lower-cased unit itself). An empty string is returned when the text is
    missing, the entity has no pattern, or nothing matches.
    """
    raw_text = row['extracted_text']
    entity = row['entity_name']

    # Missing text: nothing to search.
    if pd.isna(raw_text):
        return ''

    entity_pattern = patterns.get(entity)
    if not entity_pattern:
        return ''

    found = re.search(entity_pattern, raw_text, re.IGNORECASE)
    if not found:
        return ''

    number, unit_text = found.groups()
    unit_key = unit_text.lower()
    canonical_unit = unit_mappings.get(unit_key, unit_key)
    return f"{float(number)} {canonical_unit}"

# NOTE: an exact-match check against an `entity_value` column (`check_match`) was
# sketched here but is disabled; it is not applied to this frame.

# Replace the text in `extracted_text` with the normalised "<value> <unit>"
# string ('' when nothing matched). Because `df` aliases `test`, this overwrites
# the column in `test` as well.
df['extracted_text'] = df.apply(extract_quantity, axis=1)

In [None]:
# `df` was bound to `test` before the extraction, so this rebinding leaves `test`
# pointing at the same, already-updated frame.
test = df
# Preview the normalised `extracted_text` values.
test.head()

Unnamed: 0.1,Unnamed: 0,index,image_link,group_id,entity_name,extracted_text
0,0,0,110EibNyclL.jpg,156839,height,2.63 inch
1,1,1,11TU2clswzL.jpg,792578,width,42.0 centimetre
2,2,2,11TU2clswzL.jpg,792578,height,42.0 centimetre
3,3,3,11TU2clswzL.jpg,792578,depth,42.0 centimetre
4,4,4,11gHj8dhhrL.jpg,792578,depth,10.5 centimetre


In [None]:
# Per-column count of rows where extraction produced an empty string (no quantity matched).
df[test['extracted_text'] == ''].count()

Unnamed: 0,0
Unnamed: 0,39764
index,39764
image_link,39764
group_id,39764
entity_name,39764
extracted_text,39764


### KNN

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def impute_predicted_knn(df, predicted_col='predicted', n_neighbors=1):
    """
    Fill missing values of ``predicted_col`` with labels copied from the nearest rows.

    A ``KNeighborsClassifier`` is fitted on the rows where ``predicted_col`` is
    present and used to predict the rows where it is missing, so every imputed
    value is a label that already exists in the column. All other columns act
    as features: int64/float64 columns are mean-imputed, object/category
    columns are mode-imputed and one-hot encoded. Columns of any other dtype
    are not passed to the model.

    The frame is modified IN PLACE and the same object is returned; callers
    that keep a reference to the input see the imputed values too.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the data.
    - predicted_col (str): The name of the column to impute.
    - n_neighbors (int): Number of neighbors to use for KNN.

    Returns:
    - pd.DataFrame: ``df`` itself, with missing ``predicted_col`` values filled.

    Raises:
    - ValueError: if ``predicted_col`` is not a column of ``df``, if values are
      missing but no row has a value to learn from, or if no int64/float64/
      object/category column is available as a feature.
    """
    # Check if the predicted column exists
    if predicted_col not in df.columns:
        raise ValueError(f"Column '{predicted_col}' not found in DataFrame.")

    # Compute the missing-value mask once; it is reused for the final write-back
    # so the predictions line up with exactly the rows they were made for.
    missing_mask = df[predicted_col].isnull()

    # If there are no missing values, return the original DataFrame
    if not missing_mask.any():
        print("No missing values to impute.")
        return df

    # Separate rows with and without the predicted value
    df_notnull = df[~missing_mask].copy()
    df_null = df[missing_mask].copy()

    # Fail early with a clear message instead of an error from inside the model.
    if df_notnull.empty:
        raise ValueError(
            f"Cannot impute '{predicted_col}': no rows with a non-missing value to learn from."
        )

    # Features are all columns except the predicted column
    X_train = df_notnull.drop(columns=[predicted_col])
    y_train = df_notnull[predicted_col]
    X_test = df_null.drop(columns=[predicted_col])

    # Identify numeric and categorical columns
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    if not numeric_features and not categorical_features:
        raise ValueError(
            f"Cannot impute '{predicted_col}': no numeric or categorical feature columns available."
        )

    # Define preprocessing for numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
    ])

    # Define preprocessing for categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Create a pipeline that first preprocesses the data and then applies KNN
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier(n_neighbors=n_neighbors))
    ])

    # Fit the pipeline on the non-missing data
    pipeline.fit(X_train, y_train)

    # Predict the missing 'predicted' values
    y_pred = pipeline.predict(X_test)

    # Write the predictions back into the caller's frame (in place).
    df.loc[missing_mask, predicted_col] = y_pred

    return df


# `df` now refers to the earlier submission frame (an alias of `sub`, not a copy).
df = sub

print("Before Imputation:")
print(df)

# Perform KNN imputation. The function fills `prediction` in place and returns the
# frame it was given, so `df`, `sub` and `df_imputed` all refer to one object afterwards.
df_imputed = impute_predicted_knn(df, predicted_col='prediction')

print("\nAfter Imputation:")
print(df_imputed)


Before Imputation:
         index       prediction
0            0        2.63 inch
1            1  42.0 centimetre
2            2  42.0 centimetre
3            3  42.0 centimetre
4            4  10.5 centimetre
...        ...              ...
131182  131283      500.0 pound
131183  131284              NaN
131184  131285              NaN
131185  131286              NaN
131186  131287              NaN

[131187 rows x 2 columns]

After Imputation:
         index       prediction
0            0        2.63 inch
1            1  42.0 centimetre
2            2  42.0 centimetre
3            3  42.0 centimetre
4            4  10.5 centimetre
...        ...              ...
131182  131283      500.0 pound
131183  131284      500.0 pound
131184  131285      500.0 pound
131185  131286      500.0 pound
131186  131287      500.0 pound

[131187 rows x 2 columns]


In [None]:
# Spot-check a slice of rows (positions 1000-1049) after imputation.
df_imputed[1000:1050]

Unnamed: 0,index,prediction
1000,1001,120.0 volt
1001,1002,16.0 pound
1002,1003,3.74 inch
1003,1004,3.74 inch
1004,1005,3.74 inch
1005,1006,61.0 centimetre
1006,1007,61.0 centimetre
1007,1008,0.38 pound
1008,1009,31.9 inch
1009,1010,8.5 inch


In [None]:
# Count missing values per column after imputation.
df_imputed.isnull().sum()

Unnamed: 0,0
index,0
prediction,0


In [None]:
# Display the imputed frame (the notebook shows a truncated view).
df_imputed

Unnamed: 0,index,prediction
0,0,2.63 inch
1,1,42.0 centimetre
2,2,42.0 centimetre
3,3,42.0 centimetre
4,4,10.5 centimetre
...,...,...
131182,131283,500.0 pound
131183,131284,500.0 pound
131184,131285,500.0 pound
131185,131286,500.0 pound


In [None]:
# Read the columns from `df`; it holds the imputed values because the KNN step
# filled the frame in place (`df` and `df_imputed` are the same object).
y_pred = df['prediction']
index = df['index']


# Assemble the two-column submission frame.
submission = pd.DataFrame({
    'index': index, # Using index as ID
    'prediction': y_pred
})

# Write without the pandas row index so the file contains only `index` and `prediction`.
submission.to_csv('submission_3.csv', index=False)

### Final Try

In [None]:
import pandas as pd
# Load the un-merged train/test CSVs. This rebinds `train` and `test`, replacing
# the merged frames loaded at the top of the notebook.
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

In [None]:
# Preview the `image_link` values of the training set.
train['image_link'].head()

Unnamed: 0,image_link
0,https://m.media-amazon.com/images/I/61I9XdN6OF...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...
3,https://m.media-amazon.com/images/I/612mrlqiI4...
4,https://m.media-amazon.com/images/I/617Tl40LOX...


In [None]:
# Preview the `image_link` values of the test set.
test['image_link'].head()

Unnamed: 0,image_link
0,https://m.media-amazon.com/images/I/110EibNycl...
1,https://m.media-amazon.com/images/I/11TU2clswz...
2,https://m.media-amazon.com/images/I/11TU2clswz...
3,https://m.media-amazon.com/images/I/11TU2clswz...
4,https://m.media-amazon.com/images/I/11gHj8dhhr...


In [None]:
# Distinct image links of each split, as sets.
train_image_links, test_image_links = (
    set(frame['image_link']) for frame in (train, test)
)

# Links that occur in both splits, and how many there are.
common_image_links = train_image_links & test_image_links
num_common_links = len(common_image_links)

print(f"Number of common 'image_link' values: {num_common_links}")


Number of common 'image_link' values: 0
