In [68]:
import numpy as np
import string
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
dataset = pd.read_csv('dataset.csv')

In [None]:
languages = list(set(dataset['language']))
data_raw = dict()
a = dataset.groupby('language')
for l in languages:
    d = a.get_group(l)
    d.drop('language',axis=1,inplace=True)
    t = d.Text.str.cat(sep="\n")
    t = t.split('\n')
    data_raw[l] = t

In [71]:
def show_statistics(data):
    for language, sentences in data.items():
        
        word_list = ' '.join(sentences).split()
        number_of_sentences = len(sentences)
        number_of_words = len(word_list)
        number_of_unique_words = len(set(word_list))
        sample_extract = " ".join(sentences[0].split()[0:7])
        
        print(f'Language: {language}')
        print('-----------------------')
        print(f'Number of sentences\t:\t {number_of_sentences}')
        print(f'Number of words\t\t:\t {number_of_words}')
        print(f'Number of unique words\t:\t {number_of_unique_words}')
        print(f'Sample extract\t\t:\t {sample_extract}...\n')

In [72]:
show_statistics(data_raw)

Language: Tamil
-----------------------
Number of sentences	:	 1000
Number of words		:	 38615
Number of unique words	:	 17159
Sample extract		:	 விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திரிகை-விசாகப்பட்டின ஆசிரியர் சம்பத்துடன் இணைந்து...

Language: Persian
-----------------------
Number of sentences	:	 1000
Number of words		:	 77553
Number of unique words	:	 12616
Sample extract		:	 آهن ترکیباتی را ایجاد می‌کند که عمدتاً...

Language: Latin
-----------------------
Number of sentences	:	 1000
Number of words		:	 42191
Number of unique words	:	 14671
Sample extract		:	 müller mox figura centralis circulorum doctorum vindobonesium...

Language: Romanian
-----------------------
Number of sentences	:	 1000
Number of words		:	 52858
Number of unique words	:	 13293
Sample extract		:	 de-a lungul vieții watson a fost interesat...

Language: Turkish
-----------------------
Number of sentences	:	 1000
Number of words		:	 49914
Number of unique words	:	 18199
Sample extract		:	 tsutinalar i̇ng

In [73]:
def preprocess(text):   
    preprocessed_text = text.lower().replace('-', ' ')
    
    translation_table = str.maketrans('\n', ' ', string.punctuation+string.digits)
    
    preprocessed_text = preprocessed_text.translate(translation_table)
        
    return preprocessed_text

In [74]:
data_preprocessed = {k: [preprocess(sentence) for sentence in v] for k, v in data_raw.items()}

In [75]:
show_statistics(data_preprocessed)

Language: Tamil
-----------------------
Number of sentences	:	 1000
Number of words		:	 38627
Number of unique words	:	 17078
Sample extract		:	 விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திரிகை விசாகப்பட்டின ஆசிரியர் சம்பத்துடன்...

Language: Persian
-----------------------
Number of sentences	:	 1000
Number of words		:	 77548
Number of unique words	:	 12567
Sample extract		:	 آهن ترکیباتی را ایجاد می‌کند که عمدتاً...

Language: Latin
-----------------------
Number of sentences	:	 1000
Number of words		:	 42252
Number of unique words	:	 14538
Sample extract		:	 müller mox figura centralis circulorum doctorum vindobonesium...

Language: Romanian
-----------------------
Number of sentences	:	 1000
Number of words		:	 53662
Number of unique words	:	 13088
Sample extract		:	 de a lungul vieții watson a fost...

Language: Turkish
-----------------------
Number of sentences	:	 1000
Number of words		:	 49978
Number of unique words	:	 17933
Sample extract		:	 tsutinalar i̇ngilizce tsuutina ka

In [76]:
sentences_train, y_train = [], []

for k, v in data_preprocessed.items():
    for sentence in v:
        sentences_train.append(sentence)
        y_train.append(k)

In [77]:
vectorizer = CountVectorizer()

In [78]:
X_train = vectorizer.fit_transform(sentences_train)

In [79]:
naive_classifier = MultinomialNB(alpha=0.0001,fit_prior=False)
naive_classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=False)

In [80]:
text = 'मेरा नाम है श्रेयश देशपांडे '
text = preprocess(text)
text = [text]
text_v = vectorizer.transform(text)
print(naive_classifier.predict(text_v))

['Hindi']
