<a href="https://colab.research.google.com/github/rashed963/LanguageIdentificationNLP/blob/main/LanguageIdentification_multiple_approaches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def load_data_sample(filepath, sample_size=10, random_state=42):
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    random.seed(random_state)
    sampled_indices = random.sample(range(len(lines)), sample_size)
    sampled_lines = [lines[i].strip() for i in sampled_indices]
    return sampled_lines

def train_model(X_train, y_train):
    model = make_pipeline(CountVectorizer(analyzer='char', ngram_range=(3, 3)), LogisticRegression(max_iter=1000))
    model.fit(X_train, y_train)
    return model

def predict_language(text, model):
    return model.predict([text])[0]


In [18]:
filepath = '/content/drive/MyDrive/data/train/x_train.txt'
X_train = load_data_sample(filepath,sample_size=10000)
filepath = '/content/drive/MyDrive/data/train/y_train.txt'
y_train = load_data_sample(filepath,sample_size=10000)
filepath = '/content/drive/MyDrive/data/test/x_test.txt'
X_test = load_data_sample(filepath,sample_size=10000)
filepath = '/content/drive/MyDrive/data/test/y_test.txt'
y_test = load_data_sample(filepath,sample_size=10000)



**V1. Logistic Regression**

In [7]:

# Train the model
model = train_model(X_train, y_train)

# Evaluate the model
y_pred = [predict_language(text, model) for text in X_test]
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.17


**V2. search for the best hyberparams values**

Enhance the Pipeline: Your existing pipeline uses a count vectorizer and logistic regression. You can add hyperparameter options to both.
Setup GridSearchCV: Define the parameter grid that GridSearchCV will explore.
Cross-validation Setup: Choose a suitable cross-validation strategy, typically k-fold cross-validation.

In [21]:
def train_and_tune_model(X_train, y_train):
    pipeline = make_pipeline(
        CountVectorizer(analyzer='char', ngram_range=(3, 3)),
        LogisticRegression(max_iter=10)
    )
    parameters = {
        # 'countvectorizer__ngram_range': [(1,1), (1,2), (2,2), (2,3), (3,3)],
        # 'logisticregression__C': [0.01, 0.1, 1, 10, 100]
        'countvectorizer__ngram_range': [(1,1)],
        'logisticregression__C': [1]
    }
    grid_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

In [None]:
# Train the model with hyperparameter tuning
model = train_and_tune_model(X_train, y_train)
print(model.decision_function)

In [23]:
# Predict and evaluate
y_pred = [predict_language(text, model) for text in X_test]
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.235


**V2. Exploring different vectorization and feature extraction techniques for language identification**



In [29]:
import random
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec


def predict_language(text, model, vectorizer_type='tfidf'):
    if vectorizer_type == 'tfidf':
        # For TF-IDF, predict directly using the model
        return model.predict([text])[0]
    elif vectorizer_type == 'word2vec':
        # For Word2Vec, transform the text first
        # Tokenize the single text
        tokenized_text = word_tokenize(text)
        # Transform the text using Word2Vec
        transformed_text = word2vec_transform([tokenized_text])
        # Reshape the input to (1, -1), which is (1 sample, N features)
        transformed_text = transformed_text.reshape(1, -1)
        return model.predict(transformed_text)[0]

# Function to train Word2Vec and transform data
def word2vec_transform(sentences, vector_size=100, window=5, min_count=1, epochs=10):
    # Initialize and train a Word2Vec model
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    word_vectors = model.wv
    # Compute the average of word vectors for each sentence
    return np.array([
        np.mean([word_vectors[w] for w in words if w in word_vectors.key_to_index] or [np.zeros(vector_size)], axis=0)
        for words in sentences
    ])

# Define the model and hyperparameters
def train_and_tune_model(X_train, y_train, vectorizer_type='tfidf'):
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
        pipeline = make_pipeline(vectorizer, LogisticRegression(max_iter=1000))
    elif vectorizer_type == 'word2vec':
        X_train_transformed = word2vec_transform(X_train)
        pipeline = Pipeline([('classifier', LogisticRegression(max_iter=1000))])
        pipeline.fit(X_train_transformed, y_train)
        return pipeline

    parameters = {'logisticregression__C': [1]}  # Simplified for this example
    if vectorizer_type == 'tfidf':
        parameters['tfidfvectorizer__ngram_range'] = [(1,1)]  # Simplified for this example

    if vectorizer_type != 'word2vec':
        grid_search = GridSearchCV(pipeline, parameters, cv=3, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_


model_tfidf = train_and_tune_model(X_train, y_train, vectorizer_type='tfidf')
model_word2vec = train_and_tune_model(X_train, y_train, vectorizer_type='word2vec')

y_pred_tfidf = [predict_language(text, model_tfidf, 'tfidf') for text in X_test]
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))

y_pred_word2vec = [predict_language(text, model_word2vec, 'word2vec') for text in X_test]
print("Word2Vec Accuracy:", accuracy_score(y_test, y_pred_word2vec))

Fitting 3 folds for each of 1 candidates, totalling 3 fits




TF-IDF Accuracy: 0.035
Word2Vec Accuracy: 0.0


**V3. **

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer


In [19]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [20]:
def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

X_train_enc = encode_texts(X_train)
X_test_enc = encode_texts(X_test)

In [21]:
# Combine all labels from both train and test sets to ensure all are known before encoding
all_labels = sorted(set(y_train) | set(y_test))  # Union of y_train and y_test labels, sorted for consistency

# Create a mapping from language code to a unique index
lang2idx = {lang: idx for idx, lang in enumerate(all_labels)}

# Function to encode labels based on the mapping
def encode_labels(labels):
    return np.array([lang2idx[lang] for lang in labels])

# Encode both training and testing labels
y_train_enc = encode_labels(y_train)
y_test_enc = encode_labels(y_test)

In [22]:
from transformers import AutoModelForSequenceClassification

num_labels = 235
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from torch.utils.data import Dataset
import torch

class LanguageDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Assuming 'encode_texts' function from the previous example tokenizes and encodes texts
train_encodings = encode_texts(X_train)
test_encodings = encode_texts(X_test)

# Assuming 'encode_labels' converts labels to integer indices
train_labels = encode_labels(y_train)
test_labels = encode_labels(y_test)

# Create Dataset objects
train_dataset = LanguageDataset(train_encodings, train_labels)
eval_dataset = LanguageDataset(test_encodings, test_labels)


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./bert_base_model_results',  # directory to save model checkpoints
    num_train_epochs=3,            # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,              # weight decay for regularization
    logging_dir='./logs',           # directory for storing logs
    evaluation_strategy="epoch"     # evaluate the model at the end of each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=eval_dataset     # evaluation dataset
)

trainer.train()  # Start training


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
model.save_pretrained('./final_model')  # Save the fine-tuned model
tokenizer.save_pretrained('./final_model')  # Save the tokenizer used with the model


In [None]:
evaluation_results = trainer.evaluate()

evaluation_results

