In [None]:
import random
import unicodedata

from google.colab import files
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline


In [None]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive at /content/drive so the dataset can be read directly from Drive storage.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data preprocessing



In [None]:
import re

def preprocess_text(text):
    """
    Preprocess the input text for language identification.

    The text is lowercased, then every character that is not a letter, a
    combining mark, or whitespace is removed. Letters are recognised by their
    Unicode category rather than by an ASCII range, so accented Latin
    characters and non-Latin scripts are preserved; digits, punctuation and
    symbols are dropped. Plain ASCII input gives the same result as an
    a-z/whitespace filter.

    Args:
    text (str): The input text to preprocess.

    Returns:
    str: The preprocessed text.
    """
    # Convert text to lowercase
    text = text.lower()

    # Keep letters (categories L*), combining marks (M*) and whitespace.
    # An ASCII-only filter would erase exactly the characters that tell
    # languages apart (e.g. "é", "ñ", Cyrillic, Devanagari vowel signs).
    kept_chars = [
        ch for ch in text
        if ch.isspace() or unicodedata.category(ch)[0] in 'LM'
    ]

    return ''.join(kept_chars)


## Data Loading and Sampling

This section covers the loading and random sampling of the training and testing data. Sampling is particularly useful to reduce computation time while developing the model on Google Colab. We use a sample size of 25,000 for training and 10,000 for testing to ensure a representative subset of the full dataset.



In [None]:
def load_data_sample(filepath, sample_size=10, random_state=42):
    """
    Return a reproducible random sample of lines from a UTF-8 text file.

    The whole file is read first; `sample_size` distinct line positions are
    then drawn without replacement and the chosen lines are returned with
    surrounding whitespace stripped.

    Args:
    filepath (str): Path to the text file, one record per line.
    sample_size (int): Number of lines to draw. If the file has fewer lines,
        every line is returned (in sampled order).
    random_state (int): Seed for the sampling. The same seed, file length and
        sample size always select the same line positions.

    Returns:
    list of str: The sampled lines, stripped.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # A private generator yields the same indices as seeding the module-level
    # one, but leaves the global `random` state untouched for other code.
    rng = random.Random(random_state)

    # random.sample raises if asked for more items than exist, so cap the
    # request at the number of available lines.
    sample_size = min(sample_size, len(lines))

    sampled_indices = rng.sample(range(len(lines)), sample_size)
    sampled_lines = [lines[i].strip() for i in sampled_indices]
    return sampled_lines

def _load_aligned_sample(data_path, labels_path, sample_size, random_state=42):
    """Sample the same line positions from a data file and its labels file."""
    with open(data_path, 'r', encoding='utf-8') as data_file:
        data_lines = data_file.readlines()
    with open(labels_path, 'r', encoding='utf-8') as labels_file:
        label_lines = labels_file.readlines()

    # Line i of the labels file must describe line i of the data file. If the
    # counts differ, any sampled pairing would be silently wrong.
    if len(data_lines) != len(label_lines):
        raise ValueError(
            f"Line count mismatch: {data_path} has {len(data_lines)} lines "
            f"but {labels_path} has {len(label_lines)}"
        )

    # One set of indices is drawn and applied to both files, so every text
    # keeps its own label. Cap the request at the number of available lines.
    rng = random.Random(random_state)
    indices = rng.sample(range(len(data_lines)), min(sample_size, len(data_lines)))

    samples = [data_lines[i].strip() for i in indices]
    labels = [label_lines[i].strip() for i in indices]
    return samples, labels


def load_datasets(train_data_path, train_labels_path, test_data_path, test_labels_path, train_sample_size=25000, test_sample_size=10000):
    """
    Load sampled training and testing data with their matching labels.

    For each split, the data file and the labels file are read in full, a
    single reproducible set of line positions is drawn, and those positions
    are taken from both files so texts and labels stay paired.

    Args:
    train_data_path (str): Path to the training data file.
    train_labels_path (str): Path to the training labels file.
    test_data_path (str): Path to the testing data file.
    test_labels_path (str): Path to the testing labels file.
    train_sample_size (int): Number of samples to load from the training data.
    test_sample_size (int): Number of samples to load from the testing data.

    Returns:
    tuple: Tuple containing loaded training data, training labels, testing data, testing labels.

    Raises:
    ValueError: If a data file and its labels file have different line counts.
    """
    # Load training data and labels
    X_train, y_train = _load_aligned_sample(train_data_path, train_labels_path, train_sample_size)

    # Load testing data and labels
    X_test, y_test = _load_aligned_sample(test_data_path, test_labels_path, test_sample_size)

    return X_train, y_train, X_test, y_test


# Root folder of the dataset on the mounted Google Drive
base_path = '/content/drive/MyDrive/data'

# Data (x_*) and label (y_*) files for the training and testing splits, built from the base path
train_data_path = f'{base_path}/train/x_train.txt'
train_labels_path = f'{base_path}/train/y_train.txt'
test_data_path = f'{base_path}/test/x_test.txt'
test_labels_path = f'{base_path}/test/y_test.txt'

## Model Training & Model Evaluation

- We employ a character-level bi-gram model combined with a Logistic Regression classifier. This approach is often effective for language identification tasks, where character n-grams serve as strong features for distinguishing between languages (Reference: Cavnar & Trenkle, "N-Gram-Based Text Categorization", SDAIR 1994).

- After training, the model's performance is evaluated on the test set to ensure its effectiveness. Accuracy is the metric of choice for its interpretability and relevance in classification tasks.

In [None]:
def train_model(X_train, y_train, ngram_range=(1, 2), max_iter=50):
    """
    Train a logistic regression model on character-level n-gram count features.

    Args:
    X_train (iterable of str): Training texts.
    y_train (iterable): Label for each training text, in the same order.
    ngram_range (tuple): (min_n, max_n) sizes of the character n-grams to
        count. The default (1, 2) counts single characters and character
        bi-grams; (1, 1) would count single characters only.
    max_iter (int): Maximum number of solver iterations for the logistic
        regression.

    Returns:
    The fitted pipeline (count vectorizer followed by the classifier).
    """
    # Character analyzer: features are counts of character sequences, not words.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=ngram_range)
    classifier = LogisticRegression(max_iter=max_iter)

    model = make_pipeline(vectorizer, classifier)
    model.fit(X_train, y_train)
    return model

def predict_language(text, model):
    """Return the label that `model` predicts for a single piece of text."""
    # predict() is called with a batch, so wrap the text in a one-element
    # list and unwrap the single prediction that comes back.
    batch = [text]
    predictions = model.predict(batch)
    return predictions[0]

In [None]:
# Load the sampled training and testing splits from the data/label file paths
X_train, y_train, X_test, y_test = load_datasets(train_data_path, train_labels_path, test_data_path, test_labels_path)


# Earlier per-file loading, kept for reference only: it reads the same four
# files with the sample sizes that load_datasets() uses by default.
# filepath = '/content/drive/MyDrive/data/train/x_train.txt'
# X_train = load_data_sample(filepath,sample_size=25000)
# filepath = '/content/drive/MyDrive/data/train/y_train.txt'
# y_train = load_data_sample(filepath,sample_size=25000)
# filepath = '/content/drive/MyDrive/data/test/x_test.txt'
# X_test = load_data_sample(filepath,sample_size=10000)
# filepath = '/content/drive/MyDrive/data/test/y_test.txt'
# y_test = load_data_sample(filepath,sample_size=10000)

**V1. Logistic Regression**

In [None]:
# Train the model
model = train_model(X_train, y_train)

# Evaluate the model. The pipeline's predict() accepts the whole list of test
# texts, so score them in one batched call instead of invoking the pipeline
# once per text.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

KeyboardInterrupt: 

In [None]:
# Save the trained pipeline to disk first so the download has a file to send
dump(model, 'model_lr_bigram.joblib')
files.download('model_lr_bigram.joblib')

**Conclusion**

In this notebook, we developed a language identification model using a character-level bi-gram representation and Logistic Regression. This approach is effective for distinguishing between different languages due to the unique character patterns and sequences found in languages.

We utilized a relatively simple yet powerful machine learning pipeline consisting of a CountVectorizer for feature extraction and LogisticRegression for classification. This combination is not only computationally efficient but also performs well on textual data that varies in linguistic structure.

The model was trained and evaluated on a sampled subset of a larger dataset to expedite the computation process, particularly suitable for environments like Google Colab with limited computational resources. The evaluation cell prints the accuracy on the held-out test sample; no accuracy figure is reported here because the saved run was interrupted before evaluation completed.

This exercise demonstrates the utility of traditional machine learning approaches in natural language processing tasks like language identification. It highlights the importance of feature representation and the choice of classifier in achieving good performance.

For future work, exploring more complex models or incorporating additional features such as word-level n-grams or deep learning techniques could potentially improve accuracy, especially in distinguishing closely related languages. Moreover, expanding the dataset or employing cross-validation would enhance the model's robustness and generalizability.