In [3]:
import re
import pandas as pd
import nltk
from sklearn.utils import resample

# Fetch the NLTK resources used by this notebook, in a fixed order
for nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_package)

# Locations of the training and test Excel workbooks
file_path = '/content/drive/MyDrive/ML-AT-train.xlsx'
test_file_path = '/content/drive/MyDrive/ML-AT-test.xlsx'

# Read the training workbook into a DataFrame
data = pd.read_excel(file_path)

# Preprocessing Malayalam text
# Anything outside the Malayalam Unicode block (U+0D00-U+0D7F) that is not
# whitespace. This also covers zero-width (non-)joiners, so no separate
# ZWNJ-stripping step is required.
_NON_MALAYALAM_RE = re.compile(r'[^\u0D00-\u0D7F\s]')
# ASCII digits are already gone after the whitelist; for str patterns `\d`
# matches any Unicode decimal digit, so this removes Malayalam digits too.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# Ordered spoken-variant normalizations. Consonant collapses come FIRST so
# that their output (e.g. "ജെ" -> "ചെ") is still seen by the vowel-sign rules
# below; otherwise "ജെ" and "ചെ" would normalize to different strings and the
# function would not be idempotent.
_MALAYALAM_REPLACEMENTS = (
    # Compound / voiced consonants
    ("ക്ഷ", "ക"),
    ("ജ", "ച"),
    # Independent vowels
    ("ഏ", "എ"),
    ("ഓ", "ഒ"),
    ("ഇ", "എ"),
    ("ഉ", "ഒ"),
    # Common diphthongs
    ("കൌ", "കോ"),
    ("ചൌ", "ചോ"),
    ("പൌ", "പോ"),
    # Short -> long "e" vowel sign after selected consonants
    ("കെ", "കേ"),
    ("ചെ", "ചേ"),
    ("ടെ", "ടേ"),
    ("തെ", "തേ"),
    ("നെ", "നേ"),
    ("പെ", "പേ"),
    ("മെ", "മേ"),
    ("വെ", "വേ"),
    ("ലെ", "ലേ"),
    ("റ്റെ", "റ്റേ"),
    ("ണെ", "ണേ"),
)


def preprocess_malayalam_text(text):
    """Clean and normalize a Malayalam transcript.

    Steps, in order:
      1. Drop every character that is neither in the Malayalam Unicode block
         nor whitespace, then drop remaining (Malayalam) digits.
      2. Collapse runs of whitespace to a single space and trim the ends.
      3. Apply the ordered spoken-variant replacements.

    Args:
        text: The raw transcript. ``None`` and float NaN are treated as an
            empty transcript; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing values (None / float NaN) would otherwise make re.sub raise
        # TypeError; treat them as an empty transcript instead.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = _NON_MALAYALAM_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    for variant, canonical in _MALAYALAM_REPLACEMENTS:
        text = text.replace(variant, canonical)

    return text

# Tokenize Malayalam text using NLTK
def tokenize_text(text):
    """Split *text* with NLTK's word tokenizer and re-join the tokens with single spaces."""
    return ' '.join(nltk.word_tokenize(text))

# Apply preprocessing and tokenization to the data
# Clean and tokenize every training transcript
data['Transcript'] = data['Transcript'].apply(preprocess_malayalam_text)
data['Transcript'] = data['Transcript'].apply(tokenize_text)

# 'Class Label Short' is used as the label as-is (no numeric encoding).

# Balance the class distribution by upsampling every class to the size of the
# largest one. All original rows are kept and only the shortfall is drawn with
# replacement: bootstrapping a whole class (including the largest) would
# silently discard some original rows and replace them with duplicates.
class_counts = data['Class Label Short'].value_counts()
max_class_size = class_counts.max()
upsampled_data = []

for label, class_size in class_counts.items():
    class_data = data[data['Class Label Short'] == label]
    upsampled_data.append(class_data)

    shortfall = int(max_class_size - class_size)
    if shortfall > 0:
        # Extra rows sampled with replacement from this class only
        upsampled_data.append(
            resample(class_data, replace=True, n_samples=shortfall, random_state=42)
        )

# Combine the classes and shuffle deterministically
balanced_data = pd.concat(upsampled_data)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the preprocessed training data as Text/Label columns
dataset = balanced_data[["Transcript", "Class Label Short"]].rename(
    columns={"Transcript": "Text", "Class Label Short": "Label"}
)
dataset.to_csv('/content/drive/MyDrive/processed_train_data.csv', index=False)

# Load the test data and apply the same cleaning and tokenization
test_data = pd.read_excel(test_file_path)
test_data['Transcript'] = test_data['Transcript'].apply(preprocess_malayalam_text)
test_data['Transcript'] = test_data['Transcript'].apply(tokenize_text)

# Save the preprocessed test data (transcripts only)
dataset_t = test_data[["Transcript"]]
dataset_t.to_csv('/content/drive/MyDrive/processed_test_data.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
