Tamil: using *bert-base-multilingual-cased*


In [None]:
!pip install transformers indic-nlp-library advertools

import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle

# Load the Tamil training data and give the columns the fixed names used below.
file_path = "/content/tam_training_data_hum_ai (1).csv"
dataset_df = pd.read_csv(file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']

# Tamil stopwords from advertools. Kept as a frozenset because the only use is
# the per-token `in` test in preprocess_tamil_text, which a list scans linearly.
stopwords = frozenset(adv.stopwords['tamil'])

# Built once at definition time: creating the factory and normalizer on every
# call repeated identical setup work for each row passed through .apply().
_TAMIL_NORMALIZER = IndicNormalizerFactory().get_normalizer("ta")

def preprocess_tamil_text(text):
    """Preprocess Tamil text by normalizing, tokenizing, and removing stopwords.

    Args:
        text: Raw transcript. Missing values (None/NaN, e.g. an empty cell)
            yield an empty string instead of raising inside the normalizer.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    text = _TAMIL_NORMALIZER.normalize(str(text))
    tokens = indic_tokenize.trivial_tokenize(text, lang="ta")
    # `stopwords` is the module-level Tamil stopword collection.
    return ' '.join(token for token in tokens if token not in stopwords)

# Add a cleaned copy of every transcript; the raw 'transcript' column is kept.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)
# Last expression of the cell, so the frame is rendered as the cell output.
dataset_df



Unnamed: 0,id,transcript,class_label,cleaned_transcript
0,TAM_HUAI_TR_001,இந்த சோப்பின் மணம் மிகவும் புத்துணர்ச்சியூட்டு...,AI,சோப்பின் மணம் புத்துணர்ச்சியூட்டும் வகையில் .
1,TAM_HUAI_TR_002,தோலை நன்கு சுத்தம் செய்ய இது மிகவும் சிறப்பானது.,AI,தோலை நன்கு சுத்தம் செய்ய சிறப்பானது .
2,TAM_HUAI_TR_003,"இதைப் பயன்படுத்திய பிறகு, தோல் மிக மென்மையாக உ...",AI,"இதைப் பயன்படுத்திய , தோல் மென்மையாக ."
3,TAM_HUAI_TR_004,இந்த சோப்பில் இயற்கையான மூலப்பொருட்கள் பயன்படு...,AI,சோப்பில் இயற்கையான மூலப்பொருட்கள் பயன்படுத்தப்...
4,TAM_HUAI_TR_005,"சிறிது சோப்பு போதும், அதிக நுரை உருவாகிறது.",AI,"சிறிது சோப்பு போதும் , நுரை உருவாகிறது ."
...,...,...,...,...
803,TAM_HUAI_TR_804,இந்த லிப்ஸ்டிக் எனக்கு பேய் மாதிரி இருக்கு,HUMAN,லிப்ஸ்டிக் பேய் மாதிரி இருக்கு
804,TAM_HUAI_TR_805,இதே போட்டோ அழகா இருக்கு,HUMAN,இதே போட்டோ அழகா இருக்கு
805,TAM_HUAI_TR_806,சோப்பு வாசனை நல்லா இருக்கு,HUMAN,சோப்பு வாசனை நல்லா இருக்கு
806,TAM_HUAI_TR_807,எண்ணெய்ன பிசுக்கு போகவே மாட்டேங்குது,HUMAN,எண்ணெய்ன பிசுக்கு போகவே மாட்டேங்குது


In [None]:
# Map the string class labels to integer ids.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])

# Persist the fitted encoder so predicted ids can be mapped back to label strings later.
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)

print(f"Label encoder saved to {label_encoder_path}")

# 80/20 split with a fixed seed; no `stratify` argument is passed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

def extract_embeddings(model_name, texts, batch_size=16, max_length=128, mask_padding=False):
    """Extract embeddings for the given texts using a pre-trained transformer model.

    Each text is represented by the mean of the model's last hidden layer
    along dim 1 (presumably the token axis).

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        texts: List of strings to embed.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.
        mask_padding: If True, positions where the tokenizer's attention mask
            is 0 (padding) are excluded from the mean, so a text's vector no
            longer depends on the longest text in its batch. Defaults to False,
            which keeps the original plain mean over all positions so existing
            classifiers trained on those vectors stay compatible.

    Returns:
        A numpy array with one row per input text (empty if `texts` is empty).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            hidden = model(**encoded_inputs).last_hidden_state
            if mask_padding:
                # Zero out padded positions and divide by the real token count.
                mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            else:
                # Original behaviour: padded positions are averaged in as well.
                pooled = hidden.mean(dim=1)
            embeddings.extend(pooled.cpu().numpy())
    return np.array(embeddings)

# Embed both splits with multilingual BERT. Each call loads the tokenizer and
# model again inside extract_embeddings.
X_train_embeddings = extract_embeddings("bert-base-multilingual-cased", X_train.tolist())
X_test_embeddings = extract_embeddings("bert-base-multilingual-cased", X_test.tolist())

Label encoder saved to tamil_label_encoder.pkl


In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels. num_classes is passed explicitly so both
# arrays always have one column per encoder class, even if a split happens to
# miss the highest class id.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Feed-forward classifier on top of the fixed sentence embeddings. The input
# size is declared with an Input layer; passing input_dim to the first Dense
# layer is what triggers the Keras warning about Sequential models.
model = Sequential([
    Input(shape=(X_train_embeddings.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# NOTE(review): the held-out test split doubles as validation data here, so the
# per-epoch val_* numbers and the final test accuracy describe the same rows.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=100, batch_size=32
)

loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")

# Convert the predicted class scores to class ids for the per-class report.
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 38ms/step - accuracy: 0.7039 - loss: 0.7573 - val_accuracy: 0.9691 - val_loss: 0.2501
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9304 - loss: 0.2212 - val_accuracy: 0.9691 - val_loss: 0.1763
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9482 - loss: 0.1389 - val_accuracy: 0.9691 - val_loss: 0.1433
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9702 - loss: 0.1017 - val_accuracy: 0.9568 - val_loss: 0.1634
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9640 - loss: 0.1122 - val_accuracy: 0.8519 - val_loss: 0.3011
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9678 - loss: 0.0769 - val_accuracy: 0.9074 - val_loss: 0.2364
Epoch 7/100
[1m21/21[0m [

In [None]:
from tensorflow.keras.models import load_model

# Save the trained Tamil classifier to an .h5 file in the working directory.
model.save("tamil_classification_model.h5")
print("Model saved as 'tamil_classification_model.h5'")



Model saved as 'tamil_classification_model.h5'


Malayalam: using *bert-base-multilingual-cased*

In [None]:

# Load the Malayalam training data; `dataset_df` is rebound from the Tamil frame.
file_path = "/content/mal_training_data_hum_ai.csv"
dataset_df = pd.read_csv(file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']
# Bare expression in the middle of the cell: evaluated but not displayed.
dataset_df

stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]

# Built once at definition time: creating the factory and normalizer on every
# call repeated identical setup work for each row passed through .apply().
_MALAYALAM_NORMALIZER = IndicNormalizerFactory().get_normalizer("ml")

def preprocess_malayalam_text(text):
    """Preprocess Malayalam text by normalizing, tokenizing, and removing stopwords.

    Args:
        text: Raw transcript. Missing values (None/NaN, e.g. an empty cell)
            yield an empty string instead of raising inside the normalizer.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    text = _MALAYALAM_NORMALIZER.normalize(str(text))
    tokens = indic_tokenize.trivial_tokenize(text, lang="ml")
    # `stop` is the module-level Malayalam stopword list.
    return ' '.join(token for token in tokens if token not in stop)

# Add a cleaned copy of every transcript; the raw 'transcript' column is kept.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_malayalam_text)

# Map the string class labels to integer ids.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])
# Bare expression in the middle of the cell: evaluated but not displayed.
dataset_df

# Persist the fitted encoder so predicted ids can be mapped back to label strings later.
label_encoder_path = "mal_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to mal_label_encoder.pkl


In [None]:
dataset_df

Unnamed: 0,id,transcript,class_label,cleaned_transcript,encoded_label
0,MAL_HUAI_TR_001,ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...,HUMAN,ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...,1
1,MAL_HUAI_TR_002,ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം,HUMAN,ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം,1
2,MAL_HUAI_TR_003,അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...,HUMAN,അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...,1
3,MAL_HUAI_TR_004,ബ്രോ ഇതെല്ലം യൂസ് ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...,HUMAN,ബ്രോ ഇതെല്ലം യൂസ് ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറെ...,1
4,MAL_HUAI_TR_005,ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ,HUMAN,ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ,1
...,...,...,...,...,...
795,MAL_HUAI_TR_796,"ബിരിയാണി, പപ്പടം, അച്ചാർ - മറ്റെവിടെയും കിട്ടാ...",AI,"ബിരിയാണി , പപ്പടം , അച്ചാർ - മറ്റെവിടെയും കിട്...",0
796,MAL_HUAI_TR_797,"എങ്കിലും, തട്ടുകടയിലെ ഭക്ഷണത്തിന്റെ സുഖം മറ്റൊ...",AI,"എങ്കിലും , തട്ടുകടയിലെ ഭക്ഷണത്തിന്റെ സുഖം മറ്റ...",0
797,MAL_HUAI_TR_798,"പോറോട്ട, ബീഫ് കറി, സാലഡ് - ഈ കോമ്പിനേഷനിൽ നിന്...",AI,"പോറോട്ട , ബീഫ് കറി , സാലഡ് - ഈ കോമ്പിനേഷനിൽ നി...",0
798,MAL_HUAI_TR_799,"നല്ല ഉഴുന്നുവട്ടിയും, കിടിലൻ ചമ്മന്തിയും ചേർന്...",AI,"നല്ല ഉഴുന്നുവട്ടിയും , കിടിലൻ ചമ്മന്തിയും ചേർന...",0


In [None]:
# 80/20 split with a fixed seed; no `stratify` argument is passed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

def extract_embeddings(model_name, texts):
    """Embed `texts` with a pre-trained transformer, one mean-pooled vector per text.

    The tokenizer and model named by `model_name` are loaded on every call and
    run on CUDA when available, otherwise on CPU. Texts are encoded in batches
    of 16, truncated to 128 tokens and padded to the longest text in the
    batch; the last hidden state is averaged along dim 1 (padding included).

    Returns:
        A numpy array built from the per-text vectors.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    batch_size = 16
    pooled_rows = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded = tokenizer(chunk, padding=True, truncation=True, max_length=128, return_tensors="pt")
            on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
            hidden_states = model(**on_device).last_hidden_state
            pooled_rows.extend(hidden_states.mean(dim=1).cpu().numpy())
    return np.array(pooled_rows)

# Embed both splits with multilingual BERT. Each call loads the tokenizer and
# model again inside extract_embeddings.
X_train_embeddings = extract_embeddings("bert-base-multilingual-cased", X_train.tolist())
X_test_embeddings = extract_embeddings("bert-base-multilingual-cased", X_test.tolist())


In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels. num_classes is passed explicitly so both
# arrays always have one column per encoder class, even if a split happens to
# miss the highest class id.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Feed-forward classifier on top of the fixed sentence embeddings. The input
# size is declared with an Input layer; passing input_dim to the first Dense
# layer is what triggers the Keras warning about Sequential models.
model = Sequential([
    Input(shape=(X_train_embeddings.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# NOTE(review): the held-out test split doubles as validation data here, so the
# per-epoch val_* numbers and the final test accuracy describe the same rows.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=100, batch_size=32
)

loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")

# Convert the predicted class scores to class ids for the per-class report.
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.6741 - loss: 0.8844 - val_accuracy: 0.9250 - val_loss: 0.3991
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8918 - loss: 0.2624 - val_accuracy: 0.9438 - val_loss: 0.3021
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9336 - loss: 0.1865 - val_accuracy: 0.9563 - val_loss: 0.2358
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9368 - loss: 0.1457 - val_accuracy: 0.9438 - val_loss: 0.2133
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9398 - loss: 0.1572 - val_accuracy: 0.9438 - val_loss: 0.1829
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9346 - loss: 0.1697 - val_accuracy: 0.9375 - val_loss: 0.1689
Epoch 7/100
[1m20/20[0m [32m━

In [None]:
from tensorflow.keras.models import load_model

# Save the trained Malayalam classifier to an .h5 file in the working directory.
model.save("Mala_classification_model.h5")
print("Model saved as 'Mala_classification_model.h5'")



Model saved as 'Mala_classification_model.h5'


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Module-level device; the extract_embeddings defined later in this cell reads it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Unlabelled Tamil test set (Excel file); the columns are renamed to fixed names.
test_file_path = "/content/tam_test_data_hum_ai.xlsx"
test_df = pd.read_excel(test_file_path)

test_df.columns = ['id', 'transcript']
print("Test dataset loaded.")

# Same preprocessing function that was applied to the Tamil training transcripts.
test_df['cleaned_transcript'] = test_df['transcript'].apply(preprocess_tamil_text)

# Reload the pickled label encoder. pickle.load can execute code embedded in
# the file, so only load pickles produced by this notebook.
label_encoder_path = "/content/tamil_label_encoder.pkl"
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
print(f"Label encoder loaded from {label_encoder_path}")

def extract_embeddings(model_name, texts):
    """Embed `texts` with a pre-trained transformer, one mean-pooled vector per text.

    Runs on the module-level `device`. Texts are encoded in batches of 16,
    truncated to 128 tokens and padded to the longest text in the batch; the
    last hidden state is averaged along dim 1 (padding included).

    Returns:
        A numpy array built from the per-text vectors.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    batch_size = 16
    pooled_vectors = []
    with torch.no_grad():
        for offset in range(0, len(texts), batch_size):
            encoded = tokenizer(texts[offset:offset + batch_size], padding=True, truncation=True, max_length=128, return_tensors="pt")
            encoded = {name: value.to(device) for name, value in encoded.items()}
            last_hidden = model(**encoded).last_hidden_state
            pooled_vectors.extend(last_hidden.mean(dim=1).cpu().numpy())
    return np.array(pooled_vectors)

# Embed the cleaned test transcripts with the same base model used for training.
X_test_embeddings = extract_embeddings("bert-base-multilingual-cased", test_df['cleaned_transcript'].tolist())
print("Embeddings generated for test data.")

# Load the Tamil classifier saved earlier.
model_path = "/content/tamil_classification_model.h5"  
model = load_model(model_path)
print(f"Model loaded from {model_path}")

# Highest-scoring class index per row.
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)

# Map class ids back to the original label strings.
test_df['predicted_label'] = label_encoder.inverse_transform(y_pred_labels)

# Write only id and predicted label as a tab-separated file.
output_file_path_tsv = "./tam_test_predictions_simple.tsv"
output_df = test_df[['id', 'predicted_label']]
output_df.to_csv(output_file_path_tsv, sep='\t', index=False)

print(f"Predictions saved to {output_file_path_tsv}")

Using device: cpu
Test dataset loaded.
Label encoder loaded from /content/tamil_label_encoder.pkl




Embeddings generated for test data.
Model loaded from /content/tamil_classification_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Predictions saved to ./tam_test_predictions_simple.tsv


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Module-level device; the extract_embeddings defined later in this cell reads it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Unlabelled Malayalam test set (Excel file); the columns are renamed to fixed names.
test_file_path = "/content/mal_test_data_hum_ai.xlsx"
test_df = pd.read_excel(test_file_path)

test_df.columns = ['id', 'transcript']
print("Test dataset loaded.")

# Fixed: this previously called preprocess_tamil_text, which applies the Tamil
# normalizer, tokenizer language and stopwords to Malayalam text. The Malayalam
# classifier was trained on transcripts cleaned with preprocess_malayalam_text,
# so the test data must go through the same function.
test_df['cleaned_transcript'] = test_df['transcript'].apply(preprocess_malayalam_text)

# Reload the pickled label encoder. pickle.load can execute code embedded in
# the file, so only load pickles produced by this notebook.
label_encoder_path = "/content/mal_label_encoder.pkl"
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
print(f"Label encoder loaded from {label_encoder_path}")

def extract_embeddings(model_name, texts):
    """Return one mean-pooled transformer embedding per entry of `texts`.

    Runs on the module-level `device`. Inputs are processed 16 at a time,
    truncated to 128 tokens and padded to the longest text in the batch; the
    last hidden state is averaged along dim 1 (padding included).

    Returns:
        A numpy array built from the per-text vectors.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    batch_size = 16
    collected = []
    with torch.no_grad():
        batch_starts = range(0, len(texts), batch_size)
        for first in batch_starts:
            current_texts = texts[first:first + batch_size]
            tokenized = tokenizer(current_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            model_inputs = {}
            for field, tensor in tokenized.items():
                model_inputs[field] = tensor.to(device)
            result = model(**model_inputs)
            averaged = result.last_hidden_state.mean(dim=1)
            collected.extend(averaged.cpu().numpy())
    return np.array(collected)

# Embed the cleaned test transcripts with the same base model used for training.
X_test_embeddings = extract_embeddings("bert-base-multilingual-cased", test_df['cleaned_transcript'].tolist())
print("Embeddings generated for test data.")

# Load the Malayalam classifier saved earlier.
model_path = "/content/Mala_classification_model.h5"  
model = load_model(model_path)
print(f"Model loaded from {model_path}")

# Highest-scoring class index per row.
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)

# Map class ids back to the original label strings.
test_df['predicted_label'] = label_encoder.inverse_transform(y_pred_labels)

# Write only id and predicted label as a tab-separated file.
output_file_path_tsv = "./mal_test_predictions_simple.tsv"
output_df = test_df[['id', 'predicted_label']]
output_df.to_csv(output_file_path_tsv, sep='\t', index=False)

print(f"Predictions saved to {output_file_path_tsv}")

Using device: cpu
Test dataset loaded.
Label encoder loaded from /content/mal_label_encoder.pkl




Embeddings generated for test data.
Model loaded from /content/Mala_classification_model.h5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Predictions saved to ./mal_test_predictions_simple.tsv


In [None]:
import pandas as pd

# Read the Tamil predictions file back for a visual check.
# NOTE(review): the file was written to "./..." but is read from "/content/...";
# these match only if the working directory is /content — confirm.
tsv_file_path = "/content/tam_test_predictions_simple.tsv"

df_tsv = pd.read_csv(tsv_file_path, sep='\t')

print("TSV File Contents:")
print(df_tsv)

TSV File Contents:
                 id predicted_label
0   TAM_HUAI_TE_001           HUMAN
1   TAM_HUAI_TE_002           HUMAN
2   TAM_HUAI_TE_003           HUMAN
3   TAM_HUAI_TE_004           HUMAN
4   TAM_HUAI_TE_005           HUMAN
..              ...             ...
95  TAM_HUAI_TE_096           HUMAN
96  TAM_HUAI_TE_097           HUMAN
97  TAM_HUAI_TE_098           HUMAN
98  TAM_HUAI_TE_099           HUMAN
99  TAM_HUAI_TE_100           HUMAN

[100 rows x 2 columns]


In [None]:
import pandas as pd

# Read the Malayalam predictions file back for a visual check.
# NOTE(review): the file was written to "./..." but is read from "/content/...";
# these match only if the working directory is /content — confirm.
tsv_file_path = "/content/mal_test_predictions_simple.tsv"

df_tsv = pd.read_csv(tsv_file_path, sep='\t')

print("TSV File Contents:")
print(df_tsv)

TSV File Contents:
                  id predicted_label
0    MAL_HUAI_TE_001           HUMAN
1    MAL_HUAI_TE_002           HUMAN
2    MAL_HUAI_TE_003           HUMAN
3    MAL_HUAI_TE_004           HUMAN
4    MAL_HUAI_TE_005           HUMAN
..               ...             ...
195  MAL_HUAI_TE_196              AI
196  MAL_HUAI_TE_197              AI
197  MAL_HUAI_TE_198              AI
198  MAL_HUAI_TE_199           HUMAN
199  MAL_HUAI_TE_200              AI

[200 rows x 2 columns]


Tamil: CountVectorizer and TfidfVectorizer

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow.keras.utils import to_categorical

# Load the Tamil training data and give the columns the fixed names used below.
file_path = "/content/tam_training_data_hum_ai (1).csv"
dataset_df = pd.read_csv(file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']
print(dataset_df.head())

# Tamil stopwords from advertools. Kept as a frozenset because the only use is
# the per-token `in` test in preprocess_tamil_text, which a list scans linearly.
stopwords = frozenset(adv.stopwords['tamil'])

def preprocess_tamil_text(text):
    """Return `text` normalized for Tamil and re-joined without stopword tokens."""
    ta_normalizer = IndicNormalizerFactory().get_normalizer("ta")
    normalized = ta_normalizer.normalize(text)
    kept_tokens = []
    for tok in indic_tokenize.trivial_tokenize(normalized, lang="ta"):
        # `stopwords` is the module-level Tamil stopword collection.
        if tok not in stopwords:
            kept_tokens.append(tok)
    return ' '.join(kept_tokens)

dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)
print(dataset_df.head())

# Map the string class labels to integer ids and persist the fitted encoder.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])

label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

# The vectorizers' default token_pattern (r"(?u)\b\w\w+\b") is unsuitable for
# Indic scripts: vowel signs and the virama are combining marks, which Python's
# \w does not match, so words get cut into fragments. The text is already
# space-separated by the preprocessing step, so tokens are taken as runs of two
# or more non-whitespace characters instead (keeping the default's minimum
# token length of two).
indic_token_pattern = r"(?u)\S\S+"

# Both vectorizers are fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=indic_token_pattern)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

count_vectorizer = CountVectorizer(max_features=5000, token_pattern=indic_token_pattern)
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

with open("count_vectorizer.pkl", "wb") as f:
    pickle.dump(count_vectorizer, f)

print("TF-IDF and Count Vectorizers saved successfully.")

# num_classes is passed explicitly so both one-hot arrays always have one
# column per encoder class, even if a split misses the highest class id.
y_train_cat = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test_cat = to_categorical(y_test, num_classes=len(label_encoder.classes_))

def build_model(input_dim, output_dim):
    """Build and compile the feed-forward softmax classifier.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (width of the softmax output).

    Returns:
        A compiled Sequential model (Adam, categorical cross-entropy, accuracy).
    """
    # Local import keeps this definition self-contained. The input size is
    # declared with an Input layer because passing input_dim to the first Dense
    # layer triggers the Keras warning about Sequential models.
    from tensorflow.keras.layers import Input

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# --- TF-IDF features ---
print("Training with TF-IDF features...")
model_tfidf = build_model(X_train_tfidf.shape[1], len(label_encoder.classes_))
# NOTE(review): the test split is also used as validation data during fitting.
history_tfidf = model_tfidf.fit(
    X_train_tfidf, y_train_cat,
    validation_data=(X_test_tfidf, y_test_cat),
    epochs=100, batch_size=32, verbose=1
)

loss_tfidf, accuracy_tfidf = model_tfidf.evaluate(X_test_tfidf, y_test_cat, verbose=0)
print(f"TF-IDF Model Test Accuracy: {accuracy_tfidf:.4f}")

# Highest-scoring class index per row, compared against the integer labels.
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
y_pred_tfidf_labels = np.argmax(y_pred_tfidf, axis=1)
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf_labels, target_names=label_encoder.classes_))

# --- Raw count features: same architecture, trained from scratch ---
print("Training with Count Vectorizer features...")
model_count = build_model(X_train_count.shape[1], len(label_encoder.classes_))
history_count = model_count.fit(
    X_train_count, y_train_cat,
    validation_data=(X_test_count, y_test_cat),
    epochs=100, batch_size=32, verbose=1
)

loss_count, accuracy_count = model_count.evaluate(X_test_count, y_test_cat, verbose=0)
print(f"Count Vectorizer Model Test Accuracy: {accuracy_count:.4f}")

y_pred_count = model_count.predict(X_test_count)
y_pred_count_labels = np.argmax(y_pred_count, axis=1)
print("Count Vectorizer Classification Report:")
print(classification_report(y_test, y_pred_count_labels, target_names=label_encoder.classes_))

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting advertools
  Downloading advertools-0.16.4-py2.py3-none-any.whl.metadata (15 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting scrapy>=2.5.0 (from advertools)
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting twython>=3.8.0 (from advertools)
  Downloading twython-3.9.1-py3-none-any.whl.metadata (20 kB)
Collecting Twisted>=21.7.0 (from scrapy>=2.5.0->advertools)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy>=2.5.0->advertools)
  Downloading cssselect-1.2.0-py2.py3-no

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.5120 - loss: 1.2766 - val_accuracy: 0.4691 - val_loss: 0.6854
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6637 - loss: 0.7851 - val_accuracy: 0.4691 - val_loss: 0.6884
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7558 - loss: 0.5879 - val_accuracy: 0.4691 - val_loss: 0.6986
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7723 - loss: 0.4836 - val_accuracy: 0.4691 - val_loss: 0.7022
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8248 - loss: 0.4265 - val_accuracy: 0.4691 - val_loss: 0.6999
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.8326 - loss: 0.3831 - val_accuracy: 0.4691 - val_loss: 0.7066
Epoch 7/100
[1m21/21[0m [

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5984 - loss: 1.0336 - val_accuracy: 0.7284 - val_loss: 0.6287
Epoch 2/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7081 - loss: 0.7503 - val_accuracy: 0.6975 - val_loss: 0.6149
Epoch 3/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7289 - loss: 0.6784 - val_accuracy: 0.6728 - val_loss: 0.6059
Epoch 4/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8235 - loss: 0.4807 - val_accuracy: 0.6605 - val_loss: 0.5978
Epoch 5/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8066 - loss: 0.4302 - val_accuracy: 0.6296 - val_loss: 0.5861
Epoch 6/100
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8613 - loss: 0.3843 - val_accuracy: 0.6358 - val_loss: 0.5823
Epoch 7/100
[1m21/21[0m [32m━━━━━━━━━━━━━━

Malayalam: CountVectorizer and TfidfVectorizer

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle

# Load the Malayalam training data and give the columns fixed names.
file_path = "/content/mal_training_data_hum_ai.csv"
dataset_df = pd.read_csv(file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']
print(dataset_df.head())

stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]

def preprocess_malayalam_text(text):
    """Normalize Malayalam `text`, tokenize it, and re-join the tokens not listed in `stop`."""
    ml_normalizer = IndicNormalizerFactory().get_normalizer("ml")
    normalized_text = ml_normalizer.normalize(text)
    all_tokens = list(indic_tokenize.trivial_tokenize(normalized_text, lang="ml"))
    # `stop` is the module-level Malayalam stopword list.
    return ' '.join(filter(lambda tok: tok not in stop, all_tokens))

dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_malayalam_text)
print(dataset_df.head())

# Map the string class labels to integer ids and persist the fitted encoder.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])

label_encoder_path = "malayalam_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

# The vectorizers' default token_pattern (r"(?u)\b\w\w+\b") is unsuitable for
# Indic scripts: vowel signs and the virama are combining marks, which Python's
# \w does not match, so words get cut into fragments. The text is already
# space-separated by the preprocessing step, so tokens are taken as runs of two
# or more non-whitespace characters instead (keeping the default's minimum
# token length of two).
indic_token_pattern = r"(?u)\S\S+"

# Both vectorizers are fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=indic_token_pattern)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

count_vectorizer = CountVectorizer(max_features=5000, token_pattern=indic_token_pattern)
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()

# Language-prefixed file names: the Tamil cell writes "tfidf_vectorizer.pkl" and
# "count_vectorizer.pkl", and reusing those names here overwrote the Tamil
# vectorizers with the Malayalam ones.
with open("mal_tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

with open("mal_count_vectorizer.pkl", "wb") as f:
    pickle.dump(count_vectorizer, f)

print("TF-IDF and Count Vectorizers saved successfully.")

# num_classes is passed explicitly so both one-hot arrays always have one
# column per encoder class, even if a split misses the highest class id.
y_train_cat = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test_cat = to_categorical(y_test, num_classes=len(label_encoder.classes_))

def build_model(input_dim, output_dim):
    """Build and compile the feed-forward softmax classifier.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (width of the softmax output).

    Returns:
        A compiled Sequential model (Adam, categorical cross-entropy, accuracy).
    """
    # Local import keeps this definition self-contained. The input size is
    # declared with an Input layer because passing input_dim to the first Dense
    # layer triggers the Keras warning about Sequential models.
    from tensorflow.keras.layers import Input

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# --- TF-IDF features ---
print("Training with TF-IDF features...")
model_tfidf = build_model(X_train_tfidf.shape[1], len(label_encoder.classes_))
# NOTE(review): the test split is also used as validation data during fitting.
history_tfidf = model_tfidf.fit(
    X_train_tfidf, y_train_cat,
    validation_data=(X_test_tfidf, y_test_cat),
    epochs=100, batch_size=32, verbose=1
)

loss_tfidf, accuracy_tfidf = model_tfidf.evaluate(X_test_tfidf, y_test_cat, verbose=0)
print(f"TF-IDF Model Test Accuracy: {accuracy_tfidf:.4f}")

# Highest-scoring class index per row, compared against the integer labels.
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
y_pred_tfidf_labels = np.argmax(y_pred_tfidf, axis=1)
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf_labels, target_names=label_encoder.classes_))


# --- Raw count features: same architecture, trained from scratch ---
print("Training with Count Vectorizer features...")
model_count = build_model(X_train_count.shape[1], len(label_encoder.classes_))
history_count = model_count.fit(
    X_train_count, y_train_cat,
    validation_data=(X_test_count, y_test_cat),
    epochs=100, batch_size=32, verbose=1
)

loss_count, accuracy_count = model_count.evaluate(X_test_count, y_test_cat, verbose=0)
print(f"Count Vectorizer Model Test Accuracy: {accuracy_count:.4f}")

y_pred_count = model_count.predict(X_test_count)
y_pred_count_labels = np.argmax(y_pred_count, axis=1)
print("Count Vectorizer Classification Report:")
print(classification_report(y_test, y_pred_count_labels, target_names=label_encoder.classes_))

                id                                         transcript  \
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...   
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം   
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...   
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...   
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ   

  class_label  
0       HUMAN  
1       HUMAN  
2       HUMAN  
3       HUMAN  
4       HUMAN  
                id                                         transcript  \
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...   
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം   
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...   
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...   
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ   

  class_label             

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.5010 - loss: 1.3412 - val_accuracy: 0.5125 - val_loss: 0.6847
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7138 - loss: 0.6772 - val_accuracy: 0.5000 - val_loss: 0.6795
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7542 - loss: 0.5240 - val_accuracy: 0.5000 - val_loss: 0.6744
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8090 - loss: 0.4641 - val_accuracy: 0.5063 - val_loss: 0.6684
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8941 - loss: 0.2533 - val_accuracy: 0.5063 - val_loss: 0.6676
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9044 - loss: 0.2541 - val_accuracy: 0.5063 - val_loss: 0.6728
Epoch 7/100
[1m20/20[0m [

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.4840 - loss: 1.3845 - val_accuracy: 0.5000 - val_loss: 0.6868
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6774 - loss: 0.6765 - val_accuracy: 0.5000 - val_loss: 0.6996
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7882 - loss: 0.4988 - val_accuracy: 0.5000 - val_loss: 0.7131
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8071 - loss: 0.4594 - val_accuracy: 0.5000 - val_loss: 0.7446
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8392 - loss: 0.3740 - val_accuracy: 0.5000 - val_loss: 0.7645
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8678 - loss: 0.2737 - val_accuracy: 0.5063 - val_loss: 0.7688
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━

Malayalam: using *xlm-roberta-large*

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import pickle

# Read the Malayalam transcript CSV, give its three columns fixed names,
# and print the first five rows as a sanity check.
file_path = "/content/mal_training_data_hum_ai.csv"
dataset_df = pd.read_csv(filepath_or_buffer=file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']
print(dataset_df.head(n=5))

# Hand-written Malayalam stopword list; preprocess_malayalam_text drops any
# token that exactly matches one of these entries.
# NOTE(review): a few entries look identical on screen and may differ only by
# a trailing invisible character -- confirm before de-duplicating or editing.
stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]

def preprocess_malayalam_text(text):
    """Return `text` normalized, tokenized and stripped of stopwords.

    The text is passed through the indic-nlp normalizer for "ml", split with
    `indic_tokenize.trivial_tokenize`, and every token found in the
    module-level `stop` list is discarded. The surviving tokens are joined
    back together with single spaces.
    """
    ml_normalizer = IndicNormalizerFactory().get_normalizer("ml")
    normalized = ml_normalizer.normalize(text)

    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="ml"):
        if word in stop:
            continue
        kept.append(word)
    return ' '.join(kept)

# Clean every transcript and preview the result.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_malayalam_text)
print(dataset_df.head())

# Fit the label encoder on the class labels, then store the integer codes.
label_encoder = LabelEncoder().fit(dataset_df['class_label'])
dataset_df['encoded_label'] = label_encoder.transform(dataset_df['class_label'])

# Persist the fitted encoder so the integer codes can be mapped back later.
label_encoder_path = "malayalam_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'],
    dataset_df['encoded_label'],
    test_size=0.2,
    random_state=42,
)

# Pretrained encoder used only as a frozen feature extractor below.
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def tokenize_texts(texts, tokenizer, max_length=512):
    """Tokenize a collection of texts into padded, truncated PyTorch tensors.

    Args:
        texts: A pandas Series, NumPy array, list/tuple or any other iterable
            of strings. A single string is treated as a one-element batch.
        tokenizer: Callable tokenizer; it is invoked with the list of texts
            plus ``max_length``, ``padding=True``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Upper bound forwarded to the tokenizer for truncation.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        # pandas Series and NumPy arrays expose tolist().
        batch = texts.tolist()
    else:
        batch = list(texts)

    inputs = tokenizer(
        batch,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return inputs

# Tokenize the train split first, then the test split (default max_length).
X_train_tokens, X_test_tokens = (
    tokenize_texts(split, tokenizer) for split in (X_train, X_test)
)

def extract_embeddings(tokens, model, batch_size=32):
    """Turn tokenized texts into one mean-pooled vector per text.

    Compared with a plain ``last_hidden_state.mean(dim=1)`` this:
      * weights the mean by ``attention_mask`` (when present) so padding
        positions do not dilute the sentence vector;
      * runs the model in chunks of ``batch_size`` rows instead of pushing
        the whole split through a single forward pass;
      * moves the result to the CPU before converting to NumPy.

    Args:
        tokens: Mapping of name -> tensor with the batch on dim 0. Every
            tensor is sliced row-wise and the slices are passed to ``model``
            as keyword arguments.
        model: Callable returning an object whose ``last_hidden_state`` is a
            (batch, seq_len, hidden) tensor.
        batch_size: Rows per forward pass; must be >= 1.

    Returns:
        NumPy array of shape (n_rows, hidden).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = next(iter(tokens.values())).shape[0]
    pooled_batches = []
    with torch.no_grad():
        # max(n_rows, 1) keeps one forward pass for an empty input, so the
        # empty case still goes through the model as it did before.
        for start in range(0, max(n_rows, 1), batch_size):
            batch = {name: tensor[start:start + batch_size] for name, tensor in tokens.items()}
            hidden = model(**batch).last_hidden_state
            mask = batch.get("attention_mask")
            if mask is None:
                # No mask available: fall back to the unweighted mean.
                pooled = hidden.mean(dim=1)
            else:
                weights = mask.unsqueeze(-1).to(hidden.dtype)
                # clamp guards against division by zero for an all-padding row.
                pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
            pooled_batches.append(pooled.cpu())
    return torch.cat(pooled_batches, dim=0).numpy()

# Embed the train split first, then the test split, with the same encoder.
X_train_embeddings, X_test_embeddings = (
    extract_embeddings(split_tokens, model)
    for split_tokens in (X_train_tokens, X_test_tokens)
)

def build_model(input_dim, output_dim):
    """Build and compile the dense classifier that sits on top of the embeddings.

    Architecture: Dense(256, relu) -> BatchNorm -> Dropout(0.5) ->
    Dense(128, relu) -> BatchNorm -> Dropout(0.5) -> Dense(output_dim, softmax).

    Args:
        input_dim: Length of one embedding vector.
        output_dim: Number of classes (width of the softmax layer).

    Returns:
        A compiled Keras Sequential model using the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric.
    """
    # Imported here so this definition does not depend on the import cell.
    from tensorflow.keras import Input

    model = Sequential([
        # Explicit Input layer: passing `input_dim=` to the first Dense layer
        # is deprecated in Keras 3 and emits a UserWarning.
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

from tensorflow.keras.utils import to_categorical

# Pin the number of classes to what the label encoder saw. Without it,
# to_categorical infers the width from the largest label in each split, so a
# split missing the highest class would yield a narrower one-hot matrix.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

print("Training with XLM-Roberta embeddings...")
classification_model = build_model(X_train_embeddings.shape[1], num_classes)
# NOTE(review): the test split doubles as validation data here, and the
# imported EarlyStopping callback is not used -- confirm this is intended.
history = classification_model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=50, batch_size=32, verbose=1
)

loss, accuracy = classification_model.evaluate(X_test_embeddings, y_test_cat, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

y_pred = classification_model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
print("Classification Report:")
# Passing `labels` keeps the report aligned with `target_names` even when a
# class is absent from both y_test and the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
))

                id                                         transcript  \
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...   
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം   
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...   
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...   
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ   

  class_label  
0       HUMAN  
1       HUMAN  
2       HUMAN  
3       HUMAN  
4       HUMAN  
                id                                         transcript  \
0  MAL_HUAI_TR_001  ഞാൻ കുറച്ച് കാലമായി മുച്ചട്ച്ചിൻ്റെ ഫേസ് വാഷ് ...   
1  MAL_HUAI_TR_002           ഈ ഫേസ് വാഷ് തണുപ്പ് വെതറിലും ഉപയോഗിക്കാം   
2  MAL_HUAI_TR_003  അണ്ണാ എനിക്ക് 14 വയസ് ആയ തേയോളു എനിക്ക് സ്കിൻക...   
3  MAL_HUAI_TR_004  ബ്രോ ഇതെല്ലം യൂസ്  ആക്കീട്ട് നൈറ്റ് പിന്നെ വേറ...   
4  MAL_HUAI_TR_005    ഇത് ഫേസ് വാഷ് ഡെയിലി ചെയ്താ സ്കിൻകെയറിന് നല്ലതാ   

  class_label             

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Training with XLM-Roberta embeddings...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.6557 - loss: 0.9565 - val_accuracy: 0.5000 - val_loss: 0.6536
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8854 - loss: 0.2965 - val_accuracy: 0.5000 - val_loss: 0.6191
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9046 - loss: 0.2532 - val_accuracy: 0.6812 - val_loss: 0.5374
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9473 - loss: 0.1282 - val_accuracy: 0.7500 - val_loss: 0.4958
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8983 - loss: 0.2835 - val_accuracy: 0.5000 - val_loss: 0.5900
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9353 - loss: 0.1688 - val_accuracy: 0.8188 - val_loss: 0.4536
Epoch 7/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━

Tamil: xlm-roberta-large embeddings with a dense classifier

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import pickle

# Read the Tamil transcript CSV, give its three columns fixed names,
# and print the first five rows as a sanity check.
file_path = "/content/tam_training_data_hum_ai (1).csv"
dataset_df = pd.read_csv(filepath_or_buffer=file_path)
dataset_df.columns = ['id', 'transcript', 'class_label']
print(dataset_df.head(n=5))

# Tamil stopwords from advertools, as a sorted list.
stopwords = sorted(adv.stopwords['tamil'])

def preprocess_tamil_text(text):
    """Return `text` normalized, tokenized and stripped of stopwords.

    The text is passed through the indic-nlp normalizer for "ta", split with
    `indic_tokenize.trivial_tokenize`, and every token found in the
    module-level `stopwords` list is discarded. The surviving tokens are
    joined back together with single spaces.
    """
    ta_normalizer = IndicNormalizerFactory().get_normalizer("ta")
    normalized = ta_normalizer.normalize(text)

    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="ta"):
        if word in stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)

# Clean every transcript and preview the result.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)
print(dataset_df.head())

# Fit the label encoder on the class labels, then store the integer codes.
label_encoder = LabelEncoder().fit(dataset_df['class_label'])
dataset_df['encoded_label'] = label_encoder.transform(dataset_df['class_label'])

# Persist the fitted encoder so the integer codes can be mapped back later.
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'],
    dataset_df['encoded_label'],
    test_size=0.2,
    random_state=42,
)

# Pretrained encoder used only as a frozen feature extractor below.
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def tokenize_texts(texts, tokenizer, max_length=512):
    """Tokenize a collection of texts into padded, truncated PyTorch tensors.

    Args:
        texts: A pandas Series, NumPy array, list/tuple or any other iterable
            of strings. A single string is treated as a one-element batch.
        tokenizer: Callable tokenizer; it is invoked with the list of texts
            plus ``max_length``, ``padding=True``, ``truncation=True`` and
            ``return_tensors="pt"``.
        max_length: Upper bound forwarded to the tokenizer for truncation.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        # pandas Series and NumPy arrays expose tolist().
        batch = texts.tolist()
    else:
        batch = list(texts)

    inputs = tokenizer(
        batch,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return inputs

# Tokenize the train split first, then the test split (default max_length).
X_train_tokens, X_test_tokens = (
    tokenize_texts(split, tokenizer) for split in (X_train, X_test)
)

def extract_embeddings(tokens, model, batch_size=32):
    """Turn tokenized texts into one mean-pooled vector per text.

    Compared with a plain ``last_hidden_state.mean(dim=1)`` this:
      * weights the mean by ``attention_mask`` (when present) so padding
        positions do not dilute the sentence vector;
      * runs the model in chunks of ``batch_size`` rows instead of pushing
        the whole split through a single forward pass;
      * moves the result to the CPU before converting to NumPy.

    Args:
        tokens: Mapping of name -> tensor with the batch on dim 0. Every
            tensor is sliced row-wise and the slices are passed to ``model``
            as keyword arguments.
        model: Callable returning an object whose ``last_hidden_state`` is a
            (batch, seq_len, hidden) tensor.
        batch_size: Rows per forward pass; must be >= 1.

    Returns:
        NumPy array of shape (n_rows, hidden).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = next(iter(tokens.values())).shape[0]
    pooled_batches = []
    with torch.no_grad():
        # max(n_rows, 1) keeps one forward pass for an empty input, so the
        # empty case still goes through the model as it did before.
        for start in range(0, max(n_rows, 1), batch_size):
            batch = {name: tensor[start:start + batch_size] for name, tensor in tokens.items()}
            hidden = model(**batch).last_hidden_state
            mask = batch.get("attention_mask")
            if mask is None:
                # No mask available: fall back to the unweighted mean.
                pooled = hidden.mean(dim=1)
            else:
                weights = mask.unsqueeze(-1).to(hidden.dtype)
                # clamp guards against division by zero for an all-padding row.
                pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
            pooled_batches.append(pooled.cpu())
    return torch.cat(pooled_batches, dim=0).numpy()

# Embed the train split first, then the test split, with the same encoder.
X_train_embeddings, X_test_embeddings = (
    extract_embeddings(split_tokens, model)
    for split_tokens in (X_train_tokens, X_test_tokens)
)

def build_model(input_dim, output_dim):
    """Build and compile the dense classifier that sits on top of the embeddings.

    Architecture: Dense(256, relu) -> BatchNorm -> Dropout(0.5) ->
    Dense(128, relu) -> BatchNorm -> Dropout(0.5) -> Dense(output_dim, softmax).

    Args:
        input_dim: Length of one embedding vector.
        output_dim: Number of classes (width of the softmax layer).

    Returns:
        A compiled Keras Sequential model using the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric.
    """
    # Imported here so this definition does not depend on the import cell.
    from tensorflow.keras import Input

    model = Sequential([
        # Explicit Input layer: passing `input_dim=` to the first Dense layer
        # is deprecated in Keras 3 and emits a UserWarning.
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

from tensorflow.keras.utils import to_categorical

# Pin the number of classes to what the label encoder saw. Without it,
# to_categorical infers the width from the largest label in each split, so a
# split missing the highest class would yield a narrower one-hot matrix.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

print("Training with XLM-Roberta embeddings...")
classification_model = build_model(X_train_embeddings.shape[1], num_classes)
# NOTE(review): the test split doubles as validation data here, and the
# imported EarlyStopping callback is not used -- confirm this is intended.
history = classification_model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=50, batch_size=32, verbose=1
)

loss, accuracy = classification_model.evaluate(X_test_embeddings, y_test_cat, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

y_pred = classification_model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
print("Classification Report:")
# Passing `labels` keeps the report aligned with `target_names` even when a
# class is absent from both y_test and the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
))

                id                                         transcript  \
0  TAM_HUAI_TR_001  இந்த சோப்பின் மணம் மிகவும் புத்துணர்ச்சியூட்டு...   
1  TAM_HUAI_TR_002   தோலை நன்கு சுத்தம் செய்ய இது மிகவும் சிறப்பானது.   
2  TAM_HUAI_TR_003  இதைப் பயன்படுத்திய பிறகு, தோல் மிக மென்மையாக உ...   
3  TAM_HUAI_TR_004  இந்த சோப்பில் இயற்கையான மூலப்பொருட்கள் பயன்படு...   
4  TAM_HUAI_TR_005        சிறிது சோப்பு போதும், அதிக நுரை உருவாகிறது.   

  class_label  
0          AI  
1          AI  
2          AI  
3          AI  
4          AI  
                id                                         transcript  \
0  TAM_HUAI_TR_001  இந்த சோப்பின் மணம் மிகவும் புத்துணர்ச்சியூட்டு...   
1  TAM_HUAI_TR_002   தோலை நன்கு சுத்தம் செய்ய இது மிகவும் சிறப்பானது.   
2  TAM_HUAI_TR_003  இதைப் பயன்படுத்திய பிறகு, தோல் மிக மென்மையாக உ...   
3  TAM_HUAI_TR_004  இந்த சோப்பில் இயற்கையான மூலப்பொருட்கள் பயன்படு...   
4  TAM_HUAI_TR_005        சிறிது சோப்பு போதும், அதிக நுரை உருவாகிறது.   

  class_label             

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.7341 - loss: 0.5705 - val_accuracy: 0.4691 - val_loss: 0.6636
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8986 - loss: 0.3008 - val_accuracy: 0.4815 - val_loss: 0.6318
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9207 - loss: 0.1968 - val_accuracy: 0.5123 - val_loss: 0.5995
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9394 - loss: 0.1570 - val_accuracy: 0.6667 - val_loss: 0.5326
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9355 - loss: 0.1841 - val_accuracy: 0.9136 - val_loss: 0.4593
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9527 - loss: 0.1696 - val_accuracy: 0.9074 - val_loss: 0.4171
Epoch 7/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━