In [None]:
pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1


In [None]:
import re
import pandas as pd
import stanza
from sklearn.utils import resample

In [None]:
# Fetch the default Telugu ('te') models, then build a pipeline that runs
# only the tokenizer. `nlp` is the callable used for tokenization below.
stanza.download(lang='te')
nlp = stanza.Pipeline(lang='te', processors='tokenize')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: te (Telugu) ...


Downloading https://huggingface.co/stanfordnlp/stanza-te/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/te/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: te (Telugu):
| Processor | Package |
-----------------------
| tokenize  | mtg     |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


In [None]:
# Tokenization is done by the Stanza pipeline (`nlp`), so no NLTK resources
# are downloaded or registered here.

# File paths
file_path = '/content/drive/MyDrive/TE-AT-train.xlsx - Sheet1.csv'
test_file_path = '/content/drive/MyDrive/TE-AT-test.xlsx - Sheet1.csv'

# Load training data
data = pd.read_csv(file_path)

# Preprocessing Telugu text
def preprocess_telugu_text(text):
    """Clean one transcript down to normalised Telugu script.

    Steps, in order: drop zero-width non-joiners, drop every character that
    is neither in the Telugu Unicode block (U+0C00-U+0C7F) nor whitespace,
    drop digits, collapse whitespace, then map a fixed set of letters and
    conjuncts to simpler spellings.

    Args:
        text: Raw transcript. Non-string values (e.g. the NaN that pandas
            yields for an empty CSV cell, or None) are treated as an empty
            transcript instead of raising inside ``re.sub``.

    Returns:
        The cleaned string; '' when no Telugu text remains.
    """
    if not isinstance(text, str):
        return ''

    # Step 1: Normalize Unicode
    text = re.sub(r'\u200c', '', text)  # Remove Zero-Width Non-Joiner (ZWNJ) if present

    # Step 2: Remove non-Telugu characters, special characters, and numbers
    text = re.sub(r'[^\u0C00-\u0C7F\s]', '', text)  # Retain only Telugu script and spaces
    # ASCII digits are already gone at this point; `\d` also matches the
    # Telugu digits that live inside the retained block, so this strips those.
    text = re.sub(r'\d+', '', text)

    # Step 3: Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 4: Handle commonly used spoken variants (expanded replacements).
    # Applied in insertion order as plain substring replacements.
    replacements = {
        "ఐ": "అయి",  # Normalize vowels
        "ఔ": "అవు",
        "ఎఁ": "ఎ",
        "ఒఁ": "ఒ",
        "ఘ": "గ",  # Simplify aspirated consonants
        "ఛ": "చ",
        "ఝ": "జ",
        "థ": "త",
        "ధ": "ద",
        "ఱ": "ర",  # Normalize rare characters to common ones
        "ఋ": "రూ",  # Normalize vowels
        "ౠ": "రూ",
        "క్ష": "క",  # Normalize compound consonants
        "జ్ఞ": "జ",
        "ఙ": "న"  # Simplify nasals
    }
    for key, value in replacements.items():
        text = text.replace(key, value)

    return text

# Tokenize Telugu text using Stanza
def tokenize_text(text):
    """Tokenize text with the module-level Stanza pipeline `nlp`.

    Args:
        text: Text to tokenize (typically the output of
            preprocess_telugu_text).

    Returns:
        The text of every word in every sentence, joined by single spaces.
        An empty or whitespace-only string yields '' without calling the
        pipeline.
    """
    # Preprocessing can reduce a transcript to '', so skip the model call
    # when there is nothing to tokenize.
    if isinstance(text, str) and not text.strip():
        return ''
    doc = nlp(text)  # Tokenize the text using Stanza
    return ' '.join(word.text for sentence in doc.sentences for word in sentence.words)

# Clean every training transcript, then tokenize the cleaned text;
# the result replaces the 'Transcript' column.
data['Transcript'] = (
    data['Transcript']
    .apply(preprocess_telugu_text)
    .apply(tokenize_text)
)

# The 'Class Label Short' column is used as the label as-is (no numeric encoding).

# Balance the class distribution by upsampling. Every original row is kept;
# each class smaller than the largest one is topped up with rows drawn from
# itself (with replacement) until it has max_class_size rows. Sampling only
# the shortfall avoids bootstrapping whole classes, which would randomly
# drop original rows in favour of duplicates.
class_counts = data['Class Label Short'].value_counts()
max_class_size = class_counts.max()
upsampled_data = []

for label, class_size in class_counts.items():
    class_data = data[data['Class Label Short'] == label]
    upsampled_data.append(class_data)

    shortfall = int(max_class_size - class_size)
    if shortfall > 0:
        extra_rows = resample(class_data, replace=True, n_samples=shortfall, random_state=42)
        upsampled_data.append(extra_rows)

# Concatenate the per-class frames, then shuffle with a fixed seed so the
# row order is reproducible.
balanced_data = pd.concat(upsampled_data)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only the transcript and label columns, rename them to Text/Label,
# and write the balanced training set to CSV without the index.
dataset = balanced_data[["Transcript", "Class Label Short"]].rename(
    columns={"Transcript": "Text", "Class Label Short": "Label"}
)
dataset.to_csv('/content/drive/MyDrive/processed_train_data_telugu1.csv', index=False)

# Read the test CSV and run its transcripts through the same cleaning and
# tokenization steps as the training data.
test_data = pd.read_csv(test_file_path)
test_data['Transcript'] = (
    test_data['Transcript']
    .apply(preprocess_telugu_text)
    .apply(tokenize_text)
)

# Write only the processed transcript column, without the index.
# NOTE(review): the output filename ends in "telug" — confirm this is intended.
dataset_t = test_data[["Transcript"]]
dataset_t.to_csv('/content/drive/MyDrive/processed_test_data_telug.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
