In [8]:
import pandas as pd
from pathlib import Path

# Source file, and the sibling file the preprocessed result is written to
# (same directory, "_preprocessed" appended to the stem).
input_path = Path('ket_qua_similarity.csv')
output_path = input_path.with_name(f'{input_path.stem}_preprocessed.csv')

# Load the raw table; 'utf-8-sig' also accepts a leading BOM if one is present.
similarity_df = pd.read_csv(
    input_path,
    encoding='utf-8-sig',
)

# Helper to split Item into class (first word) and product (rest)
def split_item(value: object) -> tuple[str, str]:
    """Split an item label into its first word and the remaining text.

    Args:
        value: Cell value to split. It is converted with ``str()`` and
            stripped of surrounding whitespace before splitting.

    Returns:
        ``(first_word, rest)``. ``rest`` is ``''`` when the text holds a
        single word. ``('', '')`` is returned for missing values
        (``pd.isna``) and for blank / whitespace-only text.
    """
    nothing = ('', '')
    if pd.isna(value):
        return nothing

    text = str(value).strip()
    if not text:
        return nothing

    # Splitting on whitespace at most once keeps the remainder intact while
    # discarding the whitespace run that separates it from the first word.
    first_word, *remainder = text.split(maxsplit=1)
    return first_word, (remainder[0] if remainder else '')

def _split_item_column(frame: pd.DataFrame, item_col: str, suffix: str) -> pd.DataFrame:
    """Expand ``frame[item_col]`` into ``class_<suffix>`` / ``product_<suffix>`` columns.

    Args:
        frame: Table that may contain ``item_col``.
        item_col: Name of the column holding the item labels.
        suffix: Text appended to the two generated column names.

    Returns:
        A two-column DataFrame. When ``item_col`` exists it shares
        ``frame``'s index; otherwise it has the two columns and no rows.
    """
    new_cols = [f'class_{suffix}', f'product_{suffix}']
    if item_col not in frame.columns:
        return pd.DataFrame({name: [] for name in new_cols})
    # Build the frame from a list of (class, product) tuples with explicit
    # column names and index. This avoids creating one Series per row via
    # `.apply(pd.Series)` and still yields both named columns for zero rows.
    pairs = frame[item_col].map(split_item).tolist()
    return pd.DataFrame(pairs, columns=new_cols, index=frame.index)


# Split Item1 and Item2 into class / product columns
class_prod_1 = _split_item_column(similarity_df, 'Item1', '1')
class_prod_2 = _split_item_column(similarity_df, 'Item2', '2')

# Build final dataframe: drop Item1, Item2, Purpose, Similarity if present
similarity_df = similarity_df.drop(
    columns=['Item1', 'Item2', 'Purpose', 'Similarity'],
    errors='ignore',  # columns absent from the file are skipped, not an error
)

# Column-wise concat; rows are aligned on the index
similarity_df = pd.concat([similarity_df, class_prod_1, class_prod_2], axis=1)

# Save
similarity_df.to_csv(output_path, index=False, encoding='utf-8-sig')

# Preview
similarity_df.head()


Unnamed: 0,Nature,class_1,product_1,class_2,product_2
0,4.0,1,Chemicals for industrial use,1,Chemical additives for detergents
1,4.0,1,Chemical products used in the manufacture of p...,1,"Plastics in the form of granules, powders, mas..."
2,4.0,1,Polymers,1,Unprocessed plastics
3,0.0,2,Paints,19,Construction materials
4,0.0,3,Cosmetics,1,chemical additives for detergents


In [9]:

# Drop rows whose Nature is missing (NaN) or blank after stripping whitespace
if 'Nature' in similarity_df.columns:
    nature = similarity_df['Nature']
    has_nature = nature.notna() & (nature.astype(str).str.strip() != '')
    similarity_df = similarity_df[has_nature].copy()

# Save once, whether or not any filtering happened
similarity_df.to_csv(output_path, index=False, encoding='utf-8-sig')

# Preview after filtering; kept as the final expression so the notebook renders it
similarity_df.head()
