In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the preprocessed Telugu training set.
file_path = '/content/drive/MyDrive/processed_train_data_telugu1.csv'

# Read the whole CSV into a DataFrame; later cells use its 'Text' and 'Label' columns.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Peek at the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,Text,Label
0,చీ నీయమ్మ జీవితం ఎవడో వేడిబుర్రపైన డొమెక్స్ ఇచ...,C
1,లైక్ కొట్టమని అడిగానా నచ్చితే లైక్ కొట్టండి లే...,C
2,చాలా ఇంపార్టెంట్ పాయింట్ ఏంటి అంటే మీరు లోపల ఎ...,G
3,గుద్ద ముయ్యి నువ్వు లంజ అంత చేసిదెంగి మళ్ళా మా...,C
4,భగవంతుడు పైన కాదు ఎక్కడో ఇక్కడ ఉన్నాడు అక్కడ ఉ...,N


In [None]:
# Model input is the text column; the target is the class label column.
X, y = data['Text'], data['Label']

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Word tokens for Telugu text. scikit-learn's default token_pattern
# (r"(?u)\b\w\w+\b") relies on \w, which in Python's `re` matches letters and
# digits but NOT combining marks such as Telugu vowel signs and the virama.
# With the default, words are therefore cut apart at every dependent sign and
# many fragments are discarded. Accept the whole Telugu block (U+0C00-U+0C7F)
# and the zero-width (non-)joiners alongside \w, and keep the default's
# "at least two characters" rule.
telugu_token_pattern = r"(?u)[\w\u0C00-\u0C7F\u200C\u200D]{2,}"

# Up to 10,000 TF-IDF features over word uni-, bi- and tri-grams.
# The previous stop_words='english' was dropped: an English stop list does
# not apply to Telugu-script text.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    token_pattern=telugu_token_pattern,
)

# Learn vocabulary and IDF weights from the training split only, then apply
# the same fitted transform to the held-out split.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Search over the inverse regularisation strength C with an L2 penalty.
logistic_params = dict(C=[0.1, 1, 10], penalty=['l2'])

# 5-fold cross-validated grid search, selecting on macro-averaged F1.
# class_weight='balanced' reweights classes inversely to their frequency.
logistic_model = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', class_weight='balanced'),
    param_grid=logistic_params,
    scoring='f1_macro',
    cv=5,
)
logistic_model.fit(X_train_vectorized, y_train)

In [None]:
# Grid over tree depth and the minimum node size required to split.
tree_params = {'max_depth': [20, 30, None], 'min_samples_split': [2, 5, 10]}

# 5-fold cross-validated grid search, selecting on macro-averaged F1.
# random_state is fixed (same seed as the train/test split) because tree
# induction is randomised; without it the selected model and its scores
# can change from one run to the next.
tree_model = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced', random_state=42),
    tree_params,
    cv=5,
    scoring='f1_macro',
)
tree_model.fit(X_train_vectorized, y_train)

In [None]:
# Grid over ensemble size, tree depth and the minimum node size required to split.
forest_params = {'n_estimators': [300, 400], 'max_depth': [30, 40, None], 'min_samples_split': [3, 6]}

# 5-fold cross-validated grid search, selecting on macro-averaged F1.
# random_state is fixed (same seed as the train/test split) because a random
# forest bootstraps rows and subsamples features; unseeded, every run gives
# different scores and possibly a different "best" configuration.
# n_jobs=-1 runs the 12 x 5 candidate fits in parallel; it does not change
# the results.
forest_model = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    forest_params,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
forest_model.fit(X_train_vectorized, y_train)

In [None]:
# Best refit estimator from each grid search, keyed by display name.
# Insertion order is the order the models are evaluated and reported in.
models = {}
models["Logistic Regression"] = logistic_model.best_estimator_
models["Decision Tree"] = tree_model.best_estimator_
models["Random Forest"] = forest_model.best_estimator_

# Collects one [name, accuracy, macro-F1] row per evaluated model.
results = list()

In [None]:
# Start from an empty list every time this cell runs, so re-executing it
# does not append duplicate rows to `results`.
results = []

# Fix the class order once and use it for both the confusion matrix and its
# tick labels, so the rows/columns always line up with the axis text.
class_labels = np.unique(y)

for model_name, model in models.items():
    # Predictions on the held-out split
    y_pred = model.predict(X_test_vectorized)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    # Store results
    results.append([model_name, accuracy, macro_f1])

    # Per-class precision / recall / F1
    print(f"\nClassification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f"Confusion Matrix for {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
# Tabulate the per-model scores gathered by the evaluation loop.
summary_columns = ['Model', 'Accuracy', 'Macro F1 Score']
results_df = pd.DataFrame(data=results, columns=summary_columns)

# Heading and table are emitted in one call; the text is the same as printing them separately.
print("\nModel Performance Comparison:\n", results_df, sep="\n")

In [None]:
# File to score; its text is read from the 'Transcript' column.
test_file_path = '/content/drive/MyDrive/processed_test_data_telug.csv'
test_data = pd.read_csv(test_file_path)

# Reuse the already-fitted vectorizer (transform only, no refit).
X_test_final = vectorizer.transform(test_data['Transcript'])

# Add one prediction column per tuned model, named after the model.
for model_name in ("Logistic Regression", "Decision Tree", "Random Forest"):
    test_data[model_name] = models[model_name].predict(X_test_final)

# Write the input rows plus the three prediction columns, without the index.
output_csv_path = '/content/drive/MyDrive/base_tfidf_predictions.csv'
test_data.to_csv(output_csv_path, index=False, encoding='utf-8')

print(f"Predictions saved to {output_csv_path}")