In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load your dataset
dataset = pd.read_excel('/content/ML dataset.xlsx')  # Update the file path accordingly

if 'Telugu' in dataset.columns and 'Hindi' in dataset.columns:
    X = dataset['Telugu']  # Telugu sentences
    y = dataset['Hindi']   # Hindi translations
else:
    # Fallback: Use the first column as Telugu and second column as Hindi
    X = dataset.iloc[:, 0]  # First column as Telugu
    y = dataset.iloc[:, 1]  # Second column as Hindi

X = X.fillna('')
y = y.fillna('')

# Check the distribution of classes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode Hindi translations

class_counts = pd.Series(y_encoded).value_counts()
print("Class Distribution:\n", class_counts)

# Text preprocessing using TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_transformed = tfidf.fit_transform(X).toarray()  # Transform Telugu text to feature vectors

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_encoded, test_size=0.2, random_state=42)

# Define hyperparameter grids for Perceptron and MLP
param_perceptron = {
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1.0]
}

param_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [1e-4, 1e-3, 1e-2],
    'learning_rate': ['constant', 'adaptive']
}

# Reduce number of splits (cv) to 2 for handling smaller class sizes
cv_splits = 2

# RandomizedSearchCV for Perceptron
rs_perceptron = RandomizedSearchCV(Perceptron(), param_distributions=param_perceptron, n_iter=10, cv=cv_splits, random_state=42)
rs_perceptron.fit(X_train, y_train)
y_pred_perceptron = rs_perceptron.predict(X_test)
accuracy_perceptron = accuracy_score(y_test, y_pred_perceptron)
print("Best Perceptron Parameters:", rs_perceptron.best_params_)
print("Perceptron Test Accuracy:", accuracy_perceptron)

# RandomizedSearchCV for MLP
rs_mlp = RandomizedSearchCV(MLPClassifier(), param_distributions=param_mlp, n_iter=10, cv=cv_splits, random_state=42)
rs_mlp.fit(X_train, y_train)
y_pred_mlp = rs_mlp.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print("Best MLP Parameters:", rs_mlp.best_params_)
print("MLP Test Accuracy:", accuracy_mlp)


Class Distribution:
 819     2
0       2
370     2
538     2
827     2
       ..
406     1
791     1
222     1
1171    1
1112    1
Name: count, Length: 1193, dtype: int64




Best Perceptron Parameters: {'penalty': 'l2', 'alpha': 0.0001}
Perceptron Test Accuracy: 0.008333333333333333




Best MLP Parameters: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (100,), 'alpha': 0.01, 'activation': 'relu'}
MLP Test Accuracy: 0.0125




In [14]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier  # Comment this out if not installed
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Assuming X_train, X_test, y_train, y_test are already defined from the previous part

# Encode labels for training data
le_train = LabelEncoder()
y_train_encoded = le_train.fit_transform(y_train)

# Encode labels for test data, handle unseen labels in y_test
le_test = LabelEncoder()
le_test.classes_ = le_train.classes_  # Use same classes as training

try:
    y_test_encoded = le_test.transform(y_test)
except ValueError as e:
    print("Warning:", e)
    # Optional: Handle unseen labels in y_test by filtering them out
    valid_idx = [i for i, label in enumerate(y_test) if label in le_train.classes_]
    X_test = X_test[valid_idx]
    y_test = [y_test[i] for i in valid_idx]
    y_test_encoded = le_test.transform(y_test)

# Define classifiers to be evaluated
classifiers = {
    'SVC': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(),
    # 'CatBoost': CatBoostClassifier(verbose=0),  # Comment this out if not installed
    'NaiveBayes': MultinomialNB()
}

# Dictionary to store the results
results = {}

# Evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train_encoded)  # Train classifier with encoded labels
    y_pred = clf.predict(X_test)  # Predict on test data

    # Calculate performance metrics
    accuracy = accuracy_score(y_test_encoded, y_pred)
    precision = precision_score(y_test_encoded, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test_encoded, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test_encoded, y_pred, average='weighted', zero_division=1)

    # Store the results
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Convert the results dictionary into a DataFrame for better tabulation
results_df = pd.DataFrame(results).transpose()

# Print the results table
print(results_df)

# Optional: Save results to a CSV file
# results_df.to_csv('classifier_results.csv')






              Accuracy  Precision  Recall  F1 Score
SVC               0.50        1.0    0.50      0.50
DecisionTree      0.75        1.0    0.75      0.75
RandomForest      0.75        1.0    0.75      0.75
AdaBoost          0.00        1.0    0.00      0.00
XGBoost           0.00        1.0    0.00      0.00
NaiveBayes        0.00        1.0    0.00      0.00
