Label a Subset of the Dataset in CoNLL Format


In [1]:
import pandas as pd
import os

# Read the cleaned Telegram data CSV into a DataFrame
df = pd.read_csv('../data/cleaned_telegram_data.csv')

# Show the leading rows so the column layout is visible before labeling
print("Dataset Preview:", df.head(), sep="\n")


Dataset Preview:
         Channel Title    Channel Username    ID  \
0  Sheger online-store  @Shageronlinestore  5328   
1  Sheger online-store  @Shageronlinestore  5327   
2  Sheger online-store  @Shageronlinestore  5326   
3  Sheger online-store  @Shageronlinestore  5325   
4  Sheger online-store  @Shageronlinestore  5323   

                                             Message  \
0  💥3pcs silicon brush spatulas\n\n⚡እስከ 260°c ሙቀት...   
1  💥Mandoline Slicer\n\n👉 ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n👉  ...   
2  💥Table Desk Edge Guard Strip\n       💯 High Qu...   
3  💥Table Desk Edge Guard Strip\n       💯 High Qu...   
4  💥Only baby 3in1 double bottle milk warmer,ster...   

                        Date                          Media Path  \
0  2024-09-20 11:50:02+00:00  photos/@Shageronlinestore_5328.jpg   
1  2024-09-20 08:11:40+00:00                                 NaN   
2  2024-09-20 05:23:18+00:00                                 NaN   
3  2024-09-20 05:21:14+00:00  photos/@Shageronlinestore_5325.

In [2]:
# Function to label entities in a message
def label_entities(message):
    """Tokenize a message on whitespace and tag each token with a BIO label.

    This is a keyword-based sample labeler (refine the keyword lists as
    needed). Each token is assigned one of three entity types or none:

    * PRICE   - the token contains "ብር" or "ዋጋ"
    * LOC     - the token is exactly "አዲስ" or "አበባ"
    * PRODUCT - the token is exactly "መሸጫ" or "ተመን"

    Labels follow the BIO scheme: the first token of a run of same-type
    tokens gets ``B-<TYPE>``, directly following tokens of the same type get
    ``I-<TYPE>``, and everything else gets ``O``. A token that starts with
    "ዋጋ" always opens a new PRICE span.

    Args:
        message: The message text. Anything that is not a ``str`` (for
            example ``None`` or the float NaN that pandas uses for missing
            cells) is treated as an empty message.

    Returns:
        A list of ``(token, label)`` tuples in token order; empty when the
        message has no tokens or is not a string.
    """
    # Missing values would otherwise fail on .split() with AttributeError.
    if not isinstance(message, str):
        return []

    labeled_data = []
    previous_entity = None  # entity type of the preceding token; None means "O"

    # Sample tokenization and labeling logic (you may need to refine this)
    for token in message.split():
        # Example conditions for labeling (customize this)
        if "ብር" in token or "ዋጋ" in token:
            entity = "PRICE"
        elif token in ("አዲስ", "አበባ"):  # Example location keywords
            entity = "LOC"
        elif token in ("መሸጫ", "ተመን"):  # Example product keywords
            entity = "PRODUCT"
        else:
            entity = None

        if entity is None:
            labeled_data.append((token, "O"))
        else:
            # An I- tag is only valid right after a B-/I- tag of the same
            # type, so a span starts whenever the entity type changes.
            starts_span = entity != previous_entity or (
                entity == "PRICE" and token.startswith("ዋጋ")
            )
            prefix = "B" if starts_span else "I"
            labeled_data.append((token, f"{prefix}-{entity}"))

        previous_entity = entity

    return labeled_data


In [4]:
# Label a subset of the dataset (at most 50 messages).
# Rows with a missing message (NaN) are dropped first: they contain no tokens
# and are not strings, so they cannot be tokenized.
messages_to_label = df['Message'].dropna().head(50)
subset_size = len(messages_to_label)  # number of messages actually labeled
coNLL_lines = []

for message in messages_to_label:
    # One "token label" line per token of the message
    coNLL_lines.extend(
        f"{token} {label}" for token, label in label_entities(message)
    )
    coNLL_lines.append("")  # Add a blank line to separate sentences/messages


In [7]:
# Preview the CoNLL-formatted data: a header followed by the first 10 lines,
# each on its own output line
print("Sample CoNLL Data:", *coNLL_lines[:10], sep="\n")


Sample CoNLL Data:
💥3pcs O
silicon O
brush O
spatulas O
⚡እስከ O
260°c O
ሙቀት O
መቆቆም O
የሚችል O
ዋጋ-550ብር✅ B-PRICE
🏢 O
አድራሻ O
ቁ.1👉 O
ስሪ O
ኤም O
ሲቲ O
ሞል O
ሁለተኛ O
ፎቅ O
ቢሮ O


In [5]:
# Write the labeled lines to a plain-text CoNLL file, one entry per line
output_file_path = '../data/labeled_data.conll'
with open(output_file_path, 'w', encoding='utf-8') as conll_file:
    conll_file.writelines(entry + '\n' for entry in coNLL_lines)

print(f"Labeled data saved to: {output_file_path}")


Labeled data saved to: ../data/labeled_data.conll
