In [None]:
# Attach Google Drive to the Colab runtime; the CSV files used by this
# notebook are read from and written to paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost
!pip install xgboost
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5570 sha256=a17dcefc65d0ac2056b593070a0df6db86c355dc48af6cf45dfbe796d5b9b7f8
  Stored in directory: /root/.cache/pip/wheels/84/ff/26/d3cfbd971e96c5aa3737ecfced81628830d7359b55fbb8ca3b
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
import inflect
from word2number import w2n

In [None]:
# Read the labelled dataset from the mounted Drive folder.
# The cells below rely on its 'WORD' and 'LABEL' columns.
raw_csv_path = '/content/drive/MyDrive/seq2seq/Final_mal_data.csv'
df_data = pd.read_csv(raw_csv_path)


In [None]:
# Normalise rows labelled NUMBER: parse the token to a numeric value and spell
# that value out in words. All other rows keep their original WORD.
p = inflect.engine()


def number_to_words(number_text):
    """Return ``number_text`` rewritten as number words.

    The text is parsed with ``w2n.word_to_num`` and the parsed value is then
    rendered with ``p.number_to_words``. If either step raises ``ValueError``
    the input is returned unchanged.
    """
    try:
        return p.number_to_words(w2n.word_to_num(number_text))
    except ValueError:
        return number_text


def _normalise_row(row):
    """Return the WORD of ``row``, converted only when its LABEL is 'NUMBER'."""
    if row['LABEL'] == 'NUMBER':
        return number_to_words(row['WORD'])
    return row['WORD']


df_data['WORD'] = df_data.apply(_normalise_row, axis=1)


In [None]:
# Write the current frame to Drive. This runs before the null filter below,
# so rows with missing values are still part of the saved CSV.
preprocessed_csv_path = '/content/drive/MyDrive/seq2seq/preprocess_num_word.csv'
df_data.to_csv(preprocessed_csv_path, index=False)

# Keep only the rows in which every column has a value.
df_data = df_data.dropna(axis=0, how='any')

In [None]:

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
word_column = df_data['WORD']
label_column = df_data['LABEL']
X_train, X_test, y_train, y_test = train_test_split(
    word_column,
    label_column,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Character-level TF-IDF features over n-grams of 1 to 4 characters.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
)

# The vectorizer is fitted on the training words only; the test words are
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:

# Turn the string labels into integer ids. The encoder is fitted on the
# training labels and then applied to both label sets.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)


In [None]:
# Candidate classifiers, keyed by the display name used in the printed reports.
# Dict order is the order in which the training loop fits them.
n_classes = len(label_encoder.classes_)

models = {
    'SVM': SVC(),
    # NOTE(review): `multi_class` may be deprecated in newer scikit-learn
    # releases -- confirm against the installed version.
    'Logistic Regression': LogisticRegression(
        multi_class='ovr',
        solver='liblinear',
    ),
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'CatBoost': CatBoostClassifier(
        loss_function='MultiClass',
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        verbose=100,
    ),
    'XGBoost': xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=n_classes,
        random_state=42,
    ),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
}


In [None]:
# Train and evaluate models
#
# CatBoost is fitted with early stopping, which needs a validation set to
# monitor. That set is carved out of the training data: monitoring the test
# split would let the test data choose the stopping iteration and bias the
# scores reported below.
X_fit_tfidf, X_val_tfidf, y_fit_encoded, y_val_encoded = train_test_split(
    X_train_tfidf,
    y_train_encoded,
    test_size=0.15,
    random_state=42,
    stratify=y_train_encoded,  # preserve the class proportions in both parts
)

for name, model in models.items():
    if name == 'CatBoost':
        model.fit(
            X_fit_tfidf,
            y_fit_encoded,
            eval_set=(X_val_tfidf, y_val_encoded),
            early_stopping_rounds=50,
        )
    else:
        model.fit(X_train_tfidf, y_train_encoded)

    # Every model is scored on the same test split, which is never used for fitting.
    y_pred = model.predict(X_test_tfidf)

    print(f"\n{name} Classification Report:\n")
    # zero_division=0 reports 0 for a class that is never predicted, without
    # emitting the undefined-metric warnings.
    print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_, zero_division=0))
    print("Accuracy:", accuracy_score(y_test_encoded, y_pred))


SVM Classification Report:

              precision    recall  f1-score   support

     ENGLISH       0.93      0.93      0.93      3131
   MALAYALAM       0.91      0.98      0.94      6459
       MIXED       0.78      0.42      0.54       427
        NAME       0.86      0.82      0.84      1015
      NUMBER       0.97      0.97      0.97       315
       OTHER       0.79      0.61      0.69      1189
       PLACE       0.82      0.39      0.53        59
         SYM       1.00      1.00      1.00      1580

    accuracy                           0.91     14175
   macro avg       0.88      0.76      0.80     14175
weighted avg       0.91      0.91      0.90     14175

Accuracy: 0.9099823633156966

Logistic Regression Classification Report:

              precision    recall  f1-score   support

     ENGLISH       0.89      0.92      0.91      3131
   MALAYALAM       0.89      0.98      0.93      6459
       MIXED       0.83      0.24      0.38       427
        NAME       0.83      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Decision Tree Classification Report:

              precision    recall  f1-score   support

     ENGLISH       0.91      0.91      0.91      3131
   MALAYALAM       0.92      0.94      0.93      6459
       MIXED       0.55      0.47      0.51       427
        NAME       0.81      0.81      0.81      1015
      NUMBER       0.94      0.91      0.93       315
       OTHER       0.67      0.63      0.65      1189
       PLACE       0.71      0.54      0.62        59
         SYM       1.00      1.00      1.00      1580

    accuracy                           0.89     14175
   macro avg       0.81      0.78      0.79     14175
weighted avg       0.89      0.89      0.89     14175

Accuracy: 0.8883950617283951

KNN Classification Report:

              precision    recall  f1-score   support

     ENGLISH       0.90      0.92      0.91      3131
   MALAYALAM       0.92      0.96      0.94      6459
       MIXED       0.63      0.33      0.43       427
        NAME       0.78      0.82  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Hard-voting ensemble built from three of the classifiers defined in `models`.
ensemble_members = ['Logistic Regression', 'SVM', 'Decision Tree']
voting = VotingClassifier(
    estimators=[(member, models[member]) for member in ensemble_members],
    voting='hard',
)


In [None]:
# Fit the ensemble on the training features and predict the test split.
voting_pred = voting.fit(X_train_tfidf, y_train_encoded).predict(X_test_tfidf)

print("\nVoting Classifier Classification Report:\n")
voting_report = classification_report(
    y_test_encoded,
    voting_pred,
    target_names=label_encoder.classes_,
)
print(voting_report)
print("Accuracy:", accuracy_score(y_test_encoded, voting_pred))


Voting Classifier Classification Report:

              precision    recall  f1-score   support

     ENGLISH       0.92      0.93      0.92      3131
   MALAYALAM       0.91      0.98      0.94      6459
       MIXED       0.79      0.38      0.52       427
        NAME       0.86      0.82      0.84      1015
      NUMBER       0.96      0.96      0.96       315
       OTHER       0.80      0.60      0.68      1189
       PLACE       0.85      0.39      0.53        59
         SYM       1.00      1.00      1.00      1580

    accuracy                           0.91     14175
   macro avg       0.89      0.76      0.80     14175
weighted avg       0.90      0.91      0.90     14175

Accuracy: 0.908289241622575


In [None]:
# Second hard-voting ensemble: XGBoost, SVM and KNN taken from `models`.
# This rebinds the name `voting`.
ensemble_members = ['XGBoost', 'SVM', 'KNN']
voting = VotingClassifier(
    estimators=[(member, models[member]) for member in ensemble_members],
    voting='hard',
)


In [None]:
# Fit the current `voting` ensemble on the training features and predict the test split.
voting_pred = voting.fit(X_train_tfidf, y_train_encoded).predict(X_test_tfidf)

print("\nVoting Classifier Classification Report:\n")
voting_report = classification_report(
    y_test_encoded,
    voting_pred,
    target_names=label_encoder.classes_,
)
print(voting_report)
print("Accuracy:", accuracy_score(y_test_encoded, voting_pred))