# Important Modules

In [None]:
import re
import string
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Data Reading

In [None]:
# Kaggle input locations: the main language-detection CSV and the
# supplementary hindi.csv.
corpus_path = '/kaggle/input/language-detection/Language Detection.csv'
hindi_path = '/kaggle/input/hindicsv/hindi.csv'

df1 = pd.read_csv(corpus_path)
df2 = pd.read_csv(hindi_path)

In [None]:
df1

In [None]:
df2

In [None]:
# Stack both sources into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
df

# Initial Insights of Data

In [None]:
df.info()

In [None]:
df.Language.value_counts()

In [None]:
# Spot-check two rows for each language of interest in a single table
# instead of one copy-pasted cell per language; the fixed seed keeps the
# preview reproducible across Restart & Run All.
preview_languages = ['Russian', 'Malayalam', 'Arabic', 'Tamil', 'Kannada', 'Hindi']
pd.concat(
    [
        df[df.Language == language].sample(2, random_state=42)
        for language in preview_languages
    ]
)

# Text Preprocessing

In [None]:
def removeSymbolsAndNumbers(text):
    """Strip ASCII punctuation and digits from ``text`` and lowercase it.

    Parameters
    ----------
    text : str
        Raw sentence to normalise.

    Returns
    -------
    str
        ``text`` without any ``string.punctuation`` character and without
        digit runs, converted to lower case.
    """
    # The punctuation set must be escaped before it is placed inside a
    # character class: unescaped, its ``\]`` pair is read as an escaped
    # bracket, so a literal backslash would never be matched. '@' is already
    # part of string.punctuation, so no separate pass is needed for it.
    text = re.sub('[{}]'.format(re.escape(string.punctuation)), '', text)
    # ``\d`` is Unicode-aware for str patterns, so non-ASCII digits go too.
    text = re.sub(r'\d+', '', text)
    return text.lower()

In [None]:
def removeEnglishLetters(text):
    """Delete every run of ASCII letters from ``text`` and lowercase the rest."""
    ascii_runs = re.compile(r'[a-zA-Z]+')
    stripped = ascii_runs.sub('', text)
    return stripped.lower()

In [None]:
# For rows labelled with a non-Latin-script language, strip ASCII letters
# (presumably stray English words embedded in the text); other rows are
# passed through unchanged.
# Fix: the label was misspelled 'Malyalam', so Malayalam rows were never
# cleaned.
non_latin_languages = {'Russian', 'Malayalam', 'Hindi', 'Kannada', 'Tamil', 'Arabic'}
X0 = df.apply(
    lambda row: removeEnglishLetters(row.Text)
    if row.Language in non_latin_languages
    else row.Text,
    axis=1,
)
X0

In [None]:
# Second cleaning pass: run removeSymbolsAndNumbers over every text.
X1 = X0.map(removeSymbolsAndNumbers)
X1

In [None]:
y = df['Language']

# Building the Model

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X1,y, random_state=42)

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), analyzer='char')

In [None]:
# Chain the TF-IDF vectorizer into a logistic-regression classifier so that
# raw strings can be handed directly to fit/predict.
model = pipeline.Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('clf', LogisticRegression()),
    ]
)

In [None]:
model.fit(x_train,y_train)

# Checking for Accuracy 

In [None]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)

In [None]:
print("Accuracy is :",accuracy)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Confusion-matrix heatmap. `plt` comes from the notebook's import cell.
# Rows/columns follow the sorted union of true and predicted labels, which is
# the order confusion_matrix uses when no explicit `labels` are given.
class_labels = sorted(set(y_test) | set(y_pred))
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # show raw counts instead of the default '.2g' (e.g. 1.4e+02)
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set(xlabel='Predicted language', ylabel='True language', title='Confusion matrix')
plt.show()

# Testing Model 

In [None]:
def predict(text):
    """Print the language the fitted ``model`` pipeline predicts for ``text``.

    The text is first passed through ``removeSymbolsAndNumbers`` — the same
    cleaning applied to the training data — so punctuation and digits in the
    input do not produce character n-grams the model never saw in training.

    Parameters
    ----------
    text : str
        Sentence to classify.
    """
    cleaned = removeSymbolsAndNumbers(text)
    lang = model.predict([cleaned])
    print('The Language is in', lang[0])


In [None]:
# English
predict("LANGUAGE DETECTION MODEL CHECK")
# French
predict("VÉRIFICATION DU MODÈLE DE DÉTECTION DE LA LANGUE")
# Arabic
predict("توففحص نموذج الكشف عن اللغة")
# Spanish
predict("VERIFICACIÓN DEL MODELO DE DETECCIÓN DE IDIOMAS")
# Malayalam
predict("ലാംഗ്വേജ് ഡിറ്റക്ഷൻ മോഡൽ ചെക്ക്")
# Russian
predict("ПРОВЕРКА МОДЕЛИ ОПРЕДЕЛЕНИЯ ЯЗЫКА")
# Hindi
predict('भाषा का पता लगाने वाले मॉडल की जांच')
# Hindi mixed with Latin-script noise and digits
predict(' boyit9h एनालिटिक्स alhgserog 90980879809 bguytfivb ahgseporiga प्रदान करता है')
