In [1]:
import pandas as pd

# Load your dataset
df = pd.read_csv('dataset.csv')

# Display basic info
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit an extra "None".
df.info()
print(df.head())

# Check class distribution (languages)
print(df['language'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      22000 non-null  object
 1   language  22000 non-null  object
dtypes: object(2)
memory usage: 343.9+ KB
None
                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch
language
Estonian      1000
Swedish       1000
Thai          1000
Tamil         1000
Dutch         1000
Japanese      1000
Turkish       1000
Latin         1000
Urdu          1000
Indonesian    1000
Portugese     1000
French        1000
Chinese       1000
Korean        1000
Hindi         1000
Spanish       1000
Pu

In [4]:
from functools import lru_cache

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
# Read the text/language table from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [6]:
# Separate model inputs from targets. Column names are case-sensitive:
# 'Text' is capitalised, 'language' is not.
X, y = df['Text'], df['language']

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on the labels keeps
# each language's share the same in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Create features
# TF-IDF over character n-grams of length 1 to 3, built inside word
# boundaries ('char_wb').
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 3),
    min_df=5,            # drop n-grams that occur in fewer than 5 documents
    max_features=15000,  # upper bound on the vocabulary size
)
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [11]:
# Train model
# Logistic regression with the liblinear solver, an iteration cap of 1000,
# and class weights set to 'balanced'.
# NOTE(review): newer scikit-learn releases may warn about using liblinear
# on multiclass targets — confirm against the installed version.
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
)
model.fit(X_train_vec, y_train)

In [12]:
# Evaluate
# Score the held-out split: the per-language report first, then overall accuracy.
y_pred = model.predict(X_test_vec)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       200
     Chinese       0.99      0.99      0.99       200
       Dutch       0.97      0.98      0.98       200
     English       0.80      0.99      0.89       200
    Estonian       1.00      0.97      0.98       200
      French       0.97      0.99      0.98       200
       Hindi       1.00      0.96      0.98       200
  Indonesian       0.99      0.97      0.98       200
    Japanese       1.00      0.99      0.99       200
      Korean       1.00      0.99      0.99       200
       Latin       0.96      0.94      0.95       200
     Persian       0.99      0.99      0.99       200
   Portugese       0.99      0.96      0.98       200
      Pushto       1.00      0.95      0.97       200
    Romanian       1.00      0.98      0.99       200
     Russian       0.99      0.99      0.99       200
     Spanish       0.99      0.98      0.99       200
    

In [13]:
# Save model
# Persist both fitted objects: predictions on new text need the classifier
# and the vectorizer that produced its input features.
joblib.dump(value=model, filename='language_detection_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [14]:
# Create a prediction function
@lru_cache(maxsize=1)
def _load_language_artifacts():
    """Load the saved classifier and vectorizer from disk, once per session.

    The result is memoised, so the files are read on the first call only.
    After re-saving either file, call
    ``_load_language_artifacts.cache_clear()`` (or re-run this cell) so the
    next prediction picks up the new version.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    # NOTE: joblib files are pickle-based; only load artifacts you created or trust.
    model = joblib.load('language_detection_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    return model, vectorizer


def detect_language(text):
    """Predict the language label for a single piece of text.

    Args:
        text: The string to classify.

    Returns:
        The label the saved model predicts for ``text``.
    """
    model, vectorizer = _load_language_artifacts()
    # The vectorizer takes a collection of documents, so wrap the single
    # input in a list and unwrap the single prediction.
    text_vec = vectorizer.transform([text])
    return model.predict(text_vec)[0]
    

In [15]:
# Test the function
# Spot-check the detector on one short sentence per language.
test_samples = [
    "This is an English sentence",
    "Ceci est une phrase en français",
    "これは日本語の文章です",
    "Esta es una oración en español"
]

for sample in test_samples:
    # Show at most 30 characters, and add the ellipsis only when the
    # sentence was actually cut short.
    preview = sample if len(sample) <= 30 else sample[:30] + "..."
    print(f"'{preview}' is in: {detect_language(sample)}")
    

'This is an English sentence...' is in: English
'Ceci est une phrase en françai...' is in: French
'これは日本語の文章です...' is in: Japanese
'Esta es una oración en español...' is in: Spanish
