In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df = pd.read_csv('languageD.csv') 

In [3]:
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [4]:
X = dataset['Text']
y = dataset['Language']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
svm_classifier = SVC(kernel='linear') 


In [7]:
svm_classifier.fit(X_train_tfidf, y_train)

SVC(kernel='linear')

In [25]:
import joblib

joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

joblib.dump(svm_classifier, 'svm_model.pkl')


['svm_model.pkl']

In [9]:
y_pred = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9637330754352031

Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      0.95      0.98       106
      Danish       0.96      0.90      0.93        73
       Dutch       0.98      0.95      0.96       111
     English       0.98      0.99      0.98       291
      French       1.00      0.97      0.98       219
      German       1.00      0.95      0.97        93
       Greek       1.00      0.94      0.97        68
       Hindi       1.00      0.80      0.89        10
     Italian       1.00      0.94      0.97       145
     Kannada       1.00      0.98      0.99        66
   Malayalam       1.00      0.97      0.98       121
  Portugeese       1.00      0.95      0.98       144
     Russian       0.74      1.00      0.85       136
     Spanish       0.93      0.97      0.95       160
    Sweedish       0.98      0.98      0.98       133
       Tamil       1.00      0.97      0.98        87
     Turkish       1.00     

In [10]:
new_text = ["This is a test text.", "Ceci est un texte de test.", "Esto es un texto de prueba."]
new_text_tfidf = tfidf_vectorizer.transform(new_text)
predicted_languages = svm_classifier.predict(new_text_tfidf)
print("\nPredicted languages for new text inputs:", predicted_languages)


Predicted languages for new text inputs: ['English' 'French' 'Spanish']


In [11]:
svm_classifier_poly = SVC(kernel='poly', degree=5)  
svm_classifier_poly.fit(X_train_tfidf, y_train)

SVC(degree=5, kernel='poly')

In [12]:
svm_classifier_poly.fit(X_train_tfidf, y_train)

SVC(degree=5, kernel='poly')

In [13]:
y_pred_poly = svm_classifier_poly.predict(X_test_tfidf)
accuracy_poly = accuracy_score(y_test, y_pred_poly)
print("Accuracy:", accuracy_poly)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_poly))

Accuracy: 0.17988394584139264

Classification Report:
              precision    recall  f1-score   support

      Arabic       1.00      0.01      0.02       106
      Danish       0.75      0.04      0.08        73
       Dutch       1.00      0.04      0.07       111
     English       0.15      1.00      0.26       291
      French       1.00      0.05      0.10       219
      German       1.00      0.03      0.06        93
       Greek       1.00      0.06      0.11        68
       Hindi       1.00      0.20      0.33        10
     Italian       1.00      0.02      0.04       145
     Kannada       1.00      0.08      0.14        66
   Malayalam       1.00      0.11      0.19       121
  Portugeese       1.00      0.03      0.05       144
     Russian       1.00      0.03      0.06       136
     Spanish       1.00      0.01      0.02       160
    Sweedish       1.00      0.02      0.03       133
       Tamil       1.00      0.21      0.34        87
     Turkish       1.00    

In [14]:
svm_classifier_rbf = SVC(kernel='rbf', gamma='auto')  # 'gamma' parameter controls the influence of a single training example
svm_classifier_rbf.fit(X_train_tfidf, y_train)


SVC(gamma='auto')

In [15]:
svm_classifier_rbf.fit(X_train_tfidf, y_train)

SVC(gamma='auto')

In [16]:
y_pred_rbf = svm_classifier_rbf.predict(X_test_tfidf)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
print("Accuracy:", accuracy_rbf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rbf))

Accuracy: 0.140715667311412

Classification Report:
              precision    recall  f1-score   support

      Arabic       0.00      0.00      0.00       106
      Danish       0.00      0.00      0.00        73
       Dutch       0.00      0.00      0.00       111
     English       0.14      1.00      0.25       291
      French       0.00      0.00      0.00       219
      German       0.00      0.00      0.00        93
       Greek       0.00      0.00      0.00        68
       Hindi       0.00      0.00      0.00        10
     Italian       0.00      0.00      0.00       145
     Kannada       0.00      0.00      0.00        66
   Malayalam       0.00      0.00      0.00       121
  Portugeese       0.00      0.00      0.00       144
     Russian       0.00      0.00      0.00       136
     Spanish       0.00      0.00      0.00       160
    Sweedish       0.00      0.00      0.00       133
       Tamil       0.00      0.00      0.00        87
     Turkish       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
svm_classifier_sigmoid = SVC(kernel='sigmoid', gamma='auto')  # 'gamma' parameter controls the influence of a single training example
svm_classifier_sigmoid.fit(X_train_tfidf, y_train)


SVC(gamma='auto', kernel='sigmoid')

In [18]:
svm_classifier_sigmoid.fit(X_train_tfidf, y_train)

SVC(gamma='auto', kernel='sigmoid')

In [19]:
y_pred_sigmoid = svm_classifier_sigmoid.predict(X_test_tfidf)
accuracy_sigmoid = accuracy_score(y_test, y_pred_sigmoid)
print("Accuracy:", accuracy_sigmoid)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_sigmoid))

Accuracy: 0.140715667311412

Classification Report:
              precision    recall  f1-score   support

      Arabic       0.00      0.00      0.00       106
      Danish       0.00      0.00      0.00        73
       Dutch       0.00      0.00      0.00       111
     English       0.14      1.00      0.25       291
      French       0.00      0.00      0.00       219
      German       0.00      0.00      0.00        93
       Greek       0.00      0.00      0.00        68
       Hindi       0.00      0.00      0.00        10
     Italian       0.00      0.00      0.00       145
     Kannada       0.00      0.00      0.00        66
   Malayalam       0.00      0.00      0.00       121
  Portugeese       0.00      0.00      0.00       144
     Russian       0.00      0.00      0.00       136
     Spanish       0.00      0.00      0.00       160
    Sweedish       0.00      0.00      0.00       133
       Tamil       0.00      0.00      0.00        87
     Turkish       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
