<a href="https://colab.research.google.com/github/rajavavek/DAugSindhi/blob/main/train_sindhi_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install simpletransformers

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import random

def swap_words(sentence):
    """Exchange two randomly chosen words of a sentence.

    The sentence is split on whitespace; two distinct positions are drawn with
    ``random.sample`` and the words at those positions trade places. The
    result is re-joined with single spaces.

    Args:
        sentence: Text to augment.

    Returns:
        The sentence with two words swapped, or the original string untouched
        when it contains fewer than two words.
    """
    tokens = sentence.split()
    token_count = len(tokens)
    if token_count >= 2:
        first, second = random.sample(range(token_count), 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
        return ' '.join(tokens)
    # Nothing to swap: hand back the input exactly as received.
    return sentence

def rs(df):
    """Random-swap augmentation.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        A new DataFrame equal to `df` except that every value of 'text' has
        been passed through `swap_words`. `df` itself is not modified.
    """
    swapped_text = df['text'].apply(swap_words)
    return df.assign(text=swapped_text)

def remove_word(sentence):
    """Delete one randomly chosen word from a sentence.

    The sentence is split on whitespace, one position is drawn uniformly at
    random, and the word at that position is dropped. The remaining words are
    re-joined with single spaces.

    Args:
        sentence: Text to augment.

    Returns:
        The sentence with one word removed, or the original string untouched
        when it contains fewer than two words.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence
    # Delete by position rather than by value: list.remove() always drops the
    # FIRST equal word, so a repeated word could never be removed from a later
    # position and the deletion was not uniform over positions.
    del words[random.randrange(len(words))]
    return ' '.join(words)

def rd(df):
    """Random-deletion augmentation.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        A new DataFrame equal to `df` except that every value of 'text' has
        been passed through `remove_word`. `df` itself is not modified.
    """
    shortened_text = df['text'].apply(remove_word)
    return df.assign(text=shortened_text)


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Cache for the loaded GPT-2 objects so the weights are loaded once per session
# instead of once per call (llm_expand calls llm_augment_text for every row).
_GPT2_CACHE = {}


def _get_gpt2():
    """Return the shared (model, tokenizer) pair, loading it on first use."""
    if 'gpt2' not in _GPT2_CACHE:
        _GPT2_CACHE['gpt2'] = (
            GPT2LMHeadModel.from_pretrained('gpt2'),
            GPT2Tokenizer.from_pretrained('gpt2'),
        )
    return _GPT2_CACHE['gpt2']


def llm_augment_text(prompt, num_augmentations=1, length=100, temperature=0.8):
    """Extend `prompt` with text sampled from GPT-2.

    Args:
        prompt: Text used as the start of the generation.
        num_augmentations: Number of sequences to sample. Only the first one
            is returned.
        length: Passed to `generate` as `max_length`.
            NOTE(review): `max_length` counts the prompt tokens as well, so a
            prompt that already tokenizes to `length` tokens or more leaves no
            room for new text — confirm this is acceptable for the inputs used.
        temperature: Sampling temperature passed to `generate`.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped.
    """
    model, tokenizer = _get_gpt2()

    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Sample the continuations. GPT-2 defines no pad token; passing the EOS id
    # explicitly is what generate() falls back to anyway, minus the per-call
    # log message.
    output = model.generate(
        input_ids=input_ids,
        do_sample=True,
        max_length=length,
        num_return_sequences=num_augmentations,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated output into text
    augmentations = [
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output
    ]

    return augmentations[0]

def llm_expand(df):
    """LLM-expansion augmentation.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        A new DataFrame equal to `df` except that every value of 'text' has
        been replaced by the output of `llm_augment_text` (called with its
        default settings). `df` itself is not modified.
    """
    expanded_text = df['text'].apply(llm_augment_text)
    return df.assign(text=expanded_text)

In [None]:
import pandas as pd

# Read every sheet of the workbook into a {sheet name: DataFrame} dictionary.
# The context manager closes the workbook handle as soon as parsing is done.
with pd.ExcelFile('/content/drive/MyDrive/256_PROJECT/256_input.xlsx') as excel_file:
    dfs = {
        sheet_name: excel_file.parse(sheet_name)
        for sheet_name in excel_file.sheet_names
    }

# List the sheets that were loaded
for sheet_name, df in dfs.items():
    print(f"Sheet Name: {sheet_name}")
    print()


In [None]:
# Drop the Urdu sheets from the working set. pop() with a default keeps the
# cell re-runnable: a second execution is a no-op instead of a KeyError.
dfs.pop("Urdu - 3 class(Train)", None)
dfs.pop("Urdu - 3 class(Test)", None)

In [None]:
# Show which sheets remain, one name per line followed by a blank line.
for sheet_name, df in dfs.items():
    print(f"Sheet Name: {sheet_name}", end="\n\n")

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def preprocess(train_name, test_name):
    """Build cleaned train/test frames from two sheets held in `dfs`.

    For each sheet: rows containing missing values are dropped, the index is
    reset, the 'label' column is replaced by its LabelEncoder code rendered as
    a string, and 'text' is cast to string. The encoded 'label' column ends up
    as the last column, as before. The frames stored in `dfs` are not modified.

    Args:
        train_name: Key in `dfs` of the training sheet.
        test_name: Key in `dfs` of the test sheet.

    Returns:
        A `(train, test)` tuple of DataFrames.

    Raises:
        KeyError: If either name is not a key of `dfs`.
        ValueError: If the test sheet contains a label that never occurs in
            the training sheet.
    """
    train = dfs[train_name].dropna().reset_index(drop=True)
    test = dfs[test_name].dropna().reset_index(drop=True)

    label_encoder = LabelEncoder()
    train_codes = label_encoder.fit_transform(train['label'])
    # Reuse the mapping learned on train. Re-fitting on the test sheet gives
    # different codes to the same class whenever the two label sets differ
    # (e.g. a class absent from the test sheet shifts every later code).
    test_codes = label_encoder.transform(test['label'])

    def _finalise(frame, codes):
        # drop + assign moves the encoded 'label' to the last column position.
        return (
            frame.drop(columns='label')
            .assign(label=codes)
            .astype({'label': str, 'text': str})
        )

    return _finalise(train, train_codes), _finalise(test, test_codes)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def calculate_metrics(actual, predicted):
    """Score predictions against the true labels.

    Args:
        actual: True labels.
        predicted: Predicted labels, aligned with `actual`.

    Returns:
        A 5-tuple `(accuracy, macro precision, macro recall, macro F1,
        micro F1)`.
    """
    macro = {'average': 'macro'}
    return (
        accuracy_score(actual, predicted),
        precision_score(actual, predicted, **macro),
        recall_score(actual, predicted, **macro),
        f1_score(actual, predicted, **macro),
        f1_score(actual, predicted, average='micro'),
    )

In [None]:
import traceback

def _train_and_report(tag, train, test, model_args):
    """Train a fresh 'bert-base-multilingual-cased' classifier on `train` and print its metrics on `test` under `tag`."""
    from simpletransformers.classification import ClassificationModel

    model = ClassificationModel(
        'bert',
        'bert-base-multilingual-cased',
        num_labels=len(train["label"].unique()),
        args=model_args,
        use_cuda=True,
    )
    model.train_model(train, eval_df=test)
    predictions, _raw_outputs = model.predict(test["text"].values.tolist())
    print("\n\n================================\n")
    print(f"{tag}: {calculate_metrics(test['label'].values.tolist(), predictions)}")


def do_it(train_name, test_name):
    """Run the baseline and the three augmentation experiments for one dataset.

    The sheets are prepared with `preprocess`, then a classifier is trained
    and scored four times: on the plain training data ("W/O"), and after
    adding the output of `rs` ("W RS"), `rd` ("W RD") and `llm_expand`
    ("W LLM Expand"). Each result is printed as the tuple returned by
    `calculate_metrics`.

    Any exception is caught and its traceback printed, so a failure in one
    dataset does not abort the notebook.

    Args:
        train_name: Key in `dfs` of the training sheet.
        test_name: Key in `dfs` of the test sheet.
    """
    try:
        from simpletransformers.classification import ClassificationArgs

        train, test = preprocess(train_name, test_name)

        model_args = ClassificationArgs()
        model_args.num_train_epochs = 3
        model_args.train_batch_size = 16
        model_args.eval_batch_size = 32
        model_args.labels_list = list(train["label"].unique())
        model_args.max_seq_length = 512
        model_args.overwrite_output_dir = True
        # Add more configuration options as needed

        _train_and_report("W/O", train, test, model_args)

        # NOTE(review): augmentations accumulate — each step augments the
        # training set produced by the previous step, so "W RD" also contains
        # the RS rows, and so on. This matches the original flow; confirm it
        # is the intended experimental design.
        for tag, augment in (("W RS", rs), ("W RD", rd), ("W LLM Expand", llm_expand)):
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported way to stack the augmented rows under the originals.
            train = pd.concat([train, augment(train)], ignore_index=True)
            _train_and_report(tag, train, test, model_args)

    except Exception:
        traceback.print_exc()


In [None]:
# 3-class Sindhi experiment: train on the Train sheet, score on the Test sheet.
do_it(
    train_name="Sindhi - 3 Class (Train)",
    test_name="Sindhi - 3 Class (Test)",
)

In [None]:
# 2-class Sindhi experiment. The original passed the Train sheet as both
# arguments, so the reported metrics were measured on the training data.
# NOTE(review): the test-sheet name is inferred from the 3-class naming —
# confirm it against the sheet names printed after loading the workbook.
do_it("Sindhi - 2 Class (Train)", "Sindhi - 2 Class (Test)")