In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import pickle
import logging
# Send INFO-and-above records to a log file; filemode='w' truncates the file
# each time this cell runs, so the log only ever describes the latest run.
logging.basicConfig(
    level=logging.INFO,
    filename='lang_detect_notebook.log',
    filemode='w',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# This cell imports the libraries, so the message says "imported" (it used to say "exported").
logging.info('All libraries imported')

In [18]:
# Load the raw dataset, pull out the text feature (x) and the language label (y),
# and persist the raw labels to ../pickle_exports/y.pkl.
try:
    df = pd.read_csv('../dataset/Language Detection.csv')
    x = df.Text
    y = df.Language
    logging.info('Read CSV file, Dependent/independent features extracted')
    # A context manager guarantees the pickle file is flushed and closed.
    with open('../pickle_exports/y.pkl', 'wb') as y_file:
        pickle.dump(y, y_file)
except FileNotFoundError as err:
    # Either the CSV or the pickle export directory is missing; show which path.
    print(f'File not found: {err}')
    logging.exception('File not found')
except Exception:
    # Anything else (missing column, parse error, pickling error) is not a
    # "file not found" problem, so report it as such and keep the traceback.
    print('Loading the dataset failed')
    logging.exception('Loading the dataset failed')

In [3]:
# Encode the language labels as integers. The fitted encoder `le` is kept in the
# namespace because prediction() uses it to map class ids back to language names.
try:
    le = LabelEncoder()
    encoded_y = le.fit_transform(y)
    # No pickle file is written in this cell, so the message only reports the encoding.
    logging.info('Dependent variable encoded')
except Exception:
    # `except Exception` lets a kernel interrupt through; logging.exception
    # records the traceback so the cause is visible in the log file.
    print('Encoding failed')
    logging.exception('Encoding failed')

In [4]:
# Characters removed from every sample: punctuation, digits and newlines.
# The class previously also contained a bare `n`, which deleted the letter "n"
# from all training text; only the escaped newline `\n` is intended.
_NOISE_PATTERN = re.compile(r'[!@#$(),."%^*?:;~`0-9\n]')


def clean_text(text):
    """Return `text` lower-cased with punctuation, digits and newlines removed.

    Args:
        text: a single string sample.

    Returns:
        The cleaned string. Removed characters are dropped, not replaced by
        spaces, so whitespace already present in `text` is preserved.

    Raises:
        TypeError: if `text` is not a string (for example a missing value).
    """
    return _NOISE_PATTERN.sub('', text).lower()


try:
    # Start from an empty list so a failed run never leaves stale results behind.
    data = []
    data.extend(clean_text(text) for text in x)
    logging.info('Data cleaning done')
except Exception:
    # Surface the failure in the notebook as well as in the log, like the other cells.
    print('Data cleaning failed')
    logging.exception('Data cleaning aborted unexpectedly')

In [14]:
# Persist the cleaned corpus to ../pickle_exports/data.pkl.
try:
    # The context manager closes the file even if pickling fails part-way.
    with open('../pickle_exports/data.pkl', 'wb') as data_file:
        pickle.dump(data, data_file)
    logging.info('cleaned data exported successfully')
except FileNotFoundError:
    # Raised when the export directory does not exist.
    print('File not found')
    logging.exception('Cleaned data file not found')
except Exception:
    # Other failures (e.g. `data` missing, pickling error) are not path problems.
    print('Cleaned data export failed')
    logging.exception('Cleaned data export failed')

In [6]:
# Build bag-of-words count features from the cleaned text. The fitted
# vectorizer `cv` is reused by prediction() to transform new sentences.
try:
    cv = CountVectorizer()
    # .toarray() turns the vectorizer's output into a dense array; this can be
    # memory-hungry for a large vocabulary.
    x_vector = cv.fit_transform(data).toarray()
    logging.info('Data vectorized and converted to array')
except Exception:
    # `except Exception` lets a kernel interrupt stop this step instead of being
    # reported as a failure; logging.exception keeps the traceback.
    print('Vectorization failed')
    logging.exception('Vectorization failed')

In [7]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
try:
    xtrain, xtest, ytrain, ytest = train_test_split(
        x_vector, encoded_y, train_size=0.8, random_state=12
    )
    logging.info('Train/Test data split')
except Exception:
    # Keep the traceback in the log and let KeyboardInterrupt propagate.
    print('Dataset split failed')
    logging.exception('Dataset split failed')

In [8]:
# Fit a multinomial Naive Bayes classifier on the training counts.
try:
    naive_model = MultinomialNB()
    naive_model.fit(xtrain, ytrain)
    logging.info('Model trained successfully')
except Exception:
    # `except Exception` means interrupting the kernel during fitting is no
    # longer reported as a training failure; the traceback is logged.
    print('Model training failed')
    logging.exception('Model training failed')

In [9]:
# Score the model on the held-out split and print the headline metrics.
try:
    ypred = naive_model.predict(xtest)
    # average='weighted' averages per-class scores weighted by class frequency.
    # pos_label is only consulted for average='binary', so it is not passed.
    # beta=2 makes the reported F-score an F2 score (recall weighted above precision).
    # zero_division=1 scores a class with no predicted/true samples as 1 instead of warning.
    # `support` is kept for the unpacking; per scikit-learn it is None when `average` is set.
    precision, recall, f_beta, support = precision_recall_fscore_support(
        ytest, ypred, beta=2, average='weighted', zero_division=1
    )
    accuracy = accuracy_score(ytest, ypred)
    print(f"Accuracy is: {accuracy:.2f}")
    print(f"Precision is: {precision:.2f}")
    print(f"Recall is: {recall:.2f}")
    print(f"Fscore is: {f_beta:.2f}")
    logging.info('accuracy calculated')
except Exception:
    # Keep the traceback in the log and let KeyboardInterrupt propagate.
    print('Accuracy calculation failed')
    logging.exception('Accuracy calculation failed')

Accuracy is: 0.97
Precision is: 0.97
Recall is: 0.97
Fscore is: 0.97


In [11]:
def prediction(text):
    """Predict the language of `text`, print it, and return the label.

    Relies on objects created in earlier cells: the fitted vectorizer `cv`,
    the trained classifier `naive_model` and the label encoder `le`.

    Args:
        text: a single sentence to classify.

    Returns:
        The decoded language label for `text`.

    Raises:
        Exception: any error raised while vectorizing, predicting or decoding
            is logged with its traceback and then re-raised unchanged.
    """
    # The error handling lives inside the function: a try/except wrapped around
    # the `def` statement itself never fires, because failures happen at call time.
    try:
        features = cv.transform([text]).toarray()
        encoded_lang = naive_model.predict(features)
        lang = le.inverse_transform(encoded_lang)
    except Exception:
        print('Prediction module failed')
        logging.exception('Prediction module failed')
        raise
    print('Naive Bayes Prediction : ', lang[0])
    logging.info('Prediction function executed and result printed')
    return lang[0]

In [12]:
# Smoke-test the classifier on a handful of sentences written in different
# languages and scripts; each call prints the predicted language.
sample_sentences = (
    'أحتاج إلى شراء دراجة هوندا بعد حصولي على وظيفة',
    "это портал знаний на базе сообщества для профессионалов в области аналитики и данных.",
    "வணக்கம் ச மாப்ள",
    "എനിക്ക് ജോലി കിട്ടിയതിന് ശേഷം എനിക്ക് ഒരു ഹോണ്ട ബൈക്ക് വാങ്ങണം",
    "ನಾನು ಕೆಲಸ ಪಡೆದ ನಂತರ ನಾನು ಹೋಂಡಾ ಬೈಕ್ ಖರೀದಿಸಬೇಕಾಗಿದೆ",
    "ik moet een honda-fiets kopen nadat ik een baan heb gekregen",
    "necesito comprar una bicicleta honda después de conseguir un trabajo",
    "Jag måste köpa en hondacykel efter att jag fått jobb",
    "Ich muss mir ein Honda-Motorrad kaufen, nachdem ich einen Job bekommen habe",
    "Πρέπει να αγοράσω ένα ποδήλατο honda αφού βρω δουλειά",
    "eu preciso comprar uma moto honda depois que eu conseguir um emprego",
)
for sentence in sample_sentences:
    prediction(sentence)

logging.info('Prediction function called for sample data')

Naive Bayes Prediction :  Arabic
Naive Bayes Prediction :  Russian
Naive Bayes Prediction :  Tamil
Naive Bayes Prediction :  Malayalam
Naive Bayes Prediction :  Kannada
Naive Bayes Prediction :  Dutch
Naive Bayes Prediction :  Spanish
Naive Bayes Prediction :  Sweedish
Naive Bayes Prediction :  German
Naive Bayes Prediction :  Greek
Naive Bayes Prediction :  Portugeese


In [13]:
# Persist the trained classifier to ../pickle_exports/naive_lang_detect_model1.pkl.
# `pickle` is already imported in the notebook's first cell, so it is not re-imported here.
try:
    # The context manager closes the file even if pickling fails part-way.
    with open('../pickle_exports/naive_lang_detect_model1.pkl', 'wb') as model_file:
        pickle.dump(naive_model, model_file)
    logging.info('Model pickle file exported')
except Exception:
    # This cell exports the model, so the failure message says "export" (it used to say "import").
    print('Model pickle file export failed')
    logging.exception('Model pickle file export failed')