<a href="https://colab.research.google.com/github/hassen8/Language_Detection/blob/main/Language_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Detection (Oromo, Somali, Swahili)


In [None]:
import pandas as pd
import string
import re
import nltk
nltk.download('punkt')
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Reading from the large datasets

Since the original datasets are very large, we read only about the first 20 MB of each file, which gives us enough data.

In [None]:
# Load the leading portion of each raw corpus.
# In text mode, f.read(n) returns at most n *characters* (not bytes), so each
# read below yields roughly the first 20 MB of the file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (assumes the corpora are UTF-8 -- TODO confirm).
with open("/content/drive/MyDrive/Language Detection/sw.txt", "r", encoding="utf-8") as f:
    # Swahili: read about the first 20 MB
    swahili = f.read(20 * 1024 * 1024)

with open("/content/drive/MyDrive/Language Detection/om.txt", "r", encoding="utf-8") as f:
    # Oromo: read about the first 20 MB
    oromo = f.read(20 * 1024 * 1024)

with open("/content/drive/MyDrive/Language Detection/so.txt", "r", encoding="utf-8") as f:
    # Somali: read about the first 20 MB
    somali = f.read(20 * 1024 * 1024)

# Text cleaning and tokenization

In [None]:
def tokenizeToSentences(text):
    """Split ``text`` into sentence fragments.

    The text is cut at every '?', '!' or '.' character; the delimiters are
    discarded and empty fragments (e.g. between consecutive delimiters) are
    dropped. Surrounding whitespace of each fragment is left untouched.

    Returns a list of non-empty strings.
    """
    return [piece for piece in re.split(r'\?|\!|\.', text) if piece]

In [None]:
def cleanText(text):
    """Normalise a sentence for language classification.

    Keeps only ASCII letters and whitespace, turns line breaks into spaces
    and lower-cases the result.

    Parameters
    ----------
    text : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Only keeps alphabet characters and whitespace, removes everything else
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Replace each run of line breaks with a single space. Deleting them
    # outright would glue the last word of one line to the first word of the
    # next ("hello\nworld" -> "helloworld") and create bogus character n-grams.
    text = re.sub(r'[\r\n]+', ' ', text)
    return text.lower()

Now we split each corpus into sentences, keep only the ones longer than 100 characters, clean them, and place them in DataFrames.

In [None]:
# Oromo: split into sentences and keep only those longer than 100 characters
# (the length is measured on the raw text, before cleaning).
om_sentences = [
    sentence
    for sentence in tokenizeToSentences(oromo)
    if len(sentence) > 100
]
# One row per sentence: cleaned text plus a constant language label.
df = pd.DataFrame(om_sentences, columns=['text']).assign(
    text=lambda frame: frame['text'].apply(cleanText),
    language='oromo',
)

In [None]:
# Swahili: split into sentences and keep only those longer than 100 characters
# (the length is measured on the raw text, before cleaning).
sw_sentences = [
    sentence
    for sentence in tokenizeToSentences(swahili)
    if len(sentence) > 100
]
# One row per sentence: cleaned text plus a constant language label.
df2 = pd.DataFrame(sw_sentences, columns=['text']).assign(
    text=lambda frame: frame['text'].apply(cleanText),
    language='swahili',
)

In [None]:
# Somali: split into sentences and keep only those longer than 100 characters
# (the length is measured on the raw text, before cleaning).
so_sentences = [
    sentence
    for sentence in tokenizeToSentences(somali)
    if len(sentence) > 100
]
# One row per sentence: cleaned text plus a constant language label.
df3 = pd.DataFrame(so_sentences, columns=['text']).assign(
    text=lambda frame: frame['text'].apply(cleanText),
    language='somali',
)

Concatenate all dataframes into one

In [None]:
# Stack the three per-language frames into one dataset with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dataset = pd.concat([df, df2, df3], ignore_index=True)

Data inspection and insight

In [None]:
# Print a structural summary of the combined DataFrame
dataset.info()

In [None]:
# Number of sentences per language label (shows how balanced the classes are)
dataset['language'].value_counts()

In [None]:
# Inspect 50 randomly drawn rows labelled as Oromo
dataset.loc[dataset['language'] == 'oromo'].sample(50)

# Training the model

In [None]:
# Model input: the sentence text column
x = dataset['text']

In [None]:
# Target: the language label of each sentence
y=dataset['language']

Split the data into training and test sets


In [None]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; random_state is fixed so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=100)

In [None]:
# Character-level TF-IDF features built from n-grams of 1 to 4 characters
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
)

In [None]:
# Chain feature extraction and classification so that raw text can be passed
# straight to fit() and predict().
model = pipeline.Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('clf', LogisticRegression()),
    ]
)

In [None]:
# Fit the vectorizer and the classifier on the training split only
model.fit(x_train,y_train)

Checking the accuracy of the model

In [None]:
# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(x_test)

# Confusion matrix of true vs. predicted labels (stored in `cm`, not displayed here).
cm = confusion_matrix(y_test, y_pred)

# Fraction of test sentences whose language was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :", accuracy)

# Trying out the model

In [None]:
# Print the text report that scikit-learn builds from the true and predicted test labels
print(classification_report(y_test,y_pred))

In [None]:
def predict(text):
    """Print the language the trained pipeline predicts for ``text``.

    The input is passed through ``cleanText`` first, because the training
    sentences were cleaned the same way; feeding raw text would present the
    model with punctuation and digits it never saw during training.

    Parameters
    ----------
    text : str
        A sentence or short passage to classify.

    Returns
    -------
    None
        The predicted label is printed, not returned.
    """
    # The pipeline expects an iterable of documents, hence the one-element list.
    lang = model.predict([cleanText(text)])
    print("Language is ", lang[0])

In [None]:
# Try the model on a single hand-written sentence
text = "Nilienda shule leo, na nilicheza mpira wa miguu"
predict(text)