In [4]:
!pip install pandas numpy scikit-learn torch transformers matplotlib seaborn


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the two tab-separated Hindi files and combine them into one frame.
def _read_tsv_stripped(path):
    """Read one tab-separated file and strip surrounding whitespace from its column names."""
    frame = pd.read_csv(path, sep='\t')
    # Headers must be normalized BEFORE concatenation: pd.concat aligns columns by
    # label, so 'task_1 ' and 'task_1' would otherwise become two NaN-padded columns.
    frame.columns = frame.columns.str.strip()
    return frame


dataset1 = _read_tsv_stripped('datasets/hindi/hasoc_hindi_dataset/hasoc2019_hi_test_gold_2919.tsv')
dataset2 = _read_tsv_stripped('datasets/hindi/hasoc_hindi_dataset/hindi_dataset.tsv')

# Combine datasets (row index is rebuilt as 0..n-1)
df_hindi = pd.concat([dataset1, dataset2], ignore_index=True)

print("Dataset shape:", df_hindi.shape)
print("\nFirst few rows:")
print(df_hindi.head())

# Column names were already stripped per file above; list them for inspection
print("\nDataset columns:", df_hindi.columns.tolist())

# Define mapping from task labels to our classes
def map_to_classes(row):
    """
    Collapse a row's task_1 / task_2 annotations into one of
    'hate', 'normal' or 'offensive'.

    Both values are converted to str, stripped and upper-cased before
    comparison. A missing 'task_1' key is treated as 'NOT' and a missing
    'task_2' key as 'NONE'.

    Rules:
    - task_1 is anything other than HOF (NOT, unknown values) → normal
    - task_1 is HOF and task_2 is OFFN → offensive
    - task_1 is HOF and task_2 is anything else (PRFN, NONE, ...) → hate
    """
    def _normalized(column, fallback):
        # Absent column → fallback; otherwise compare on the trimmed, upper-cased string.
        if column not in row:
            return fallback
        return str(row[column]).strip().upper()

    coarse = _normalized('task_1', 'NOT')
    fine = _normalized('task_2', 'NONE')

    # Only HOF rows can be hate/offensive; every other task_1 value is normal.
    if coarse != 'HOF':
        return 'normal'

    # Within HOF, OFFN is the only task_2 value that does not map to hate.
    return 'offensive' if fine == 'OFFN' else 'hate'

# Build the 'label' column by calling map_to_classes on every row (axis=1 passes
# each row as a Series), yielding one of 'hate', 'normal' or 'offensive' per row.
df_hindi['label'] = df_hindi.apply(map_to_classes, axis=1)

# Clean text column (remove URLs, special characters, etc.)
def clean_hindi_text(text):
    """
    Normalize one raw post before tokenization.

    Steps, in order: remove URLs, remove @mentions, remove '#' characters
    (the hashtag word itself is kept), remove the standalone retweet marker
    'RT', and collapse every run of whitespace into a single space.

    Args:
        text: Raw text value; may be missing (None/NaN) or a non-string,
            in which case it is converted with str().

    Returns:
        The cleaned string, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions and hashtags (keep the text after #)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)

    # Remove RT (retweet). The leading \b keeps this from matching the tail of
    # ordinary words: without it "SMART people" was mangled into "SMApeople".
    text = re.sub(r'\bRT\s+', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

import re

# Normalize every raw post into a new 'clean_text' column
df_hindi['clean_text'] = df_hindi['text'].apply(clean_hindi_text)

# Report how the mapped classes are distributed (before blank rows are dropped)
print("\nHindi dataset label distribution:")
label_counts = df_hindi['label'].value_counts()
print(label_counts)

print(f"\nNumber of unique labels: {df_hindi['label'].nunique()}")
print(f"Unique labels: {df_hindi['label'].unique()}")

# Flag rows whose cleaned text is blank, report how many there are,
# then keep only the rows that still carry text
is_blank = df_hindi['clean_text'].str.strip() == ''
empty_texts = df_hindi[is_blank]
print(f"\nRows with empty text: {len(empty_texts)}")
df_hindi = df_hindi[~is_blank]

# Randomize row order with a fixed seed and rebuild a 0..n-1 index
print("\nShuffling the dataset...")
df_shuffled = shuffle(df_hindi, random_state=42).reset_index(drop=True)

print(f"Final dataset size: {len(df_shuffled)}")

# Assign integer ids to the class names in alphabetical order
unique_labels = sorted(df_shuffled['label'].unique())
print(f"\nUnique labels in dataset: {unique_labels}")

label_map = dict(zip(unique_labels, range(len(unique_labels))))
reverse_label_map = dict(enumerate(unique_labels))

print(f"Label mapping: {label_map}")

df_shuffled['label_encoded'] = df_shuffled['label'].map(label_map)

# Show the class balance after filtering, by name and by encoded id
print("\nFinal label distribution:")
print(df_shuffled['label'].value_counts())
print(f"\nEncoded label distribution:")
print(df_shuffled['label_encoded'].value_counts())

  from .autonotebook import tqdm as notebook_tqdm


Dataset shape: (5983, 5)

First few rows:
         text_id                                               text task_1  \
0  hasoc_hi_5061  वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...    NOT   
1  hasoc_hi_2090  #कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...    HOF   
2  hasoc_hi_2960  पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...    HOF   
3   hasoc_hi_864  जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...    NOT   
4    hasoc_hi_54  नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...    NOT   

  task_2 task_3  
0   NONE   NONE  
1   OFFN    TIN  
2   OFFN    TIN  
3   NONE   NONE  
4   NONE   NONE  

Dataset columns: ['text_id', 'text', 'task_1', 'task_2', 'task_3']

Hindi dataset label distribution:
label
normal       2909
hate         2201
offensive     873
Name: count, dtype: int64

Number of unique labels: 3
Unique labels: ['normal' 'offensive' 'hate']

Rows with empty text: 2

Shuffling the dataset...
Final dataset size: 5981

Unique labels in dataset: ['hate', 'normal

In [2]:

# Write the shuffled, label-encoded frame to CSV in the working directory;
# index=False leaves the row index out of the file.
df_shuffled.to_csv('hindi_processed_data.csv', index=False)


In [3]:
# Preview the first rows of the processed frame (head() defaults to 5 rows).
df_shuffled.head()


Unnamed: 0,text_id,text,task_1,task_2,task_3,label,clean_text,label_encoded
0,hasoc_hi_3825,आज ये बात पक्की हो गई कि #भडवा कलर sorry #भगवा...,NOT,NONE,NONE,normal,आज ये बात पक्की हो गई कि भडवा कलर sorry भगवा क...,1
1,hasoc_hi_5196,My Speech today on President Address Debate in...,NOT,NONE,NONE,normal,My Speech today on President Address Debate in...,1
2,hasoc_hi_3655,"नहीं माने राहुल: नए अध्यक्ष की तलाश शुरू, दो द...",NOT,NONE,NONE,normal,"नहीं माने राहुल: नए अध्यक्ष की तलाश शुरू, दो द...",1
3,hasoc_hi_895,कश्मीर का नाम सुनते ही आतंकवाद और पत्थरबाज,HOF,OFFN,TIN,offensive,कश्मीर का नाम सुनते ही आतंकवाद और पत्थरबाज,2
4,hasoc_hi_6627,@KaDevender मोदी जी पागल है जो जल संरक्षण के प...,HOF,PRFN,TIN,hate,मोदी जी पागल है जो जल संरक्षण के पीछे पागल हुए...,0
