In [1]:
import json
import os
import re
import time
import unicodedata

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_auc_score, roc_curve, auc)

# ==========================
# 1. Create output folder
# ==========================
# Directory that receives the files this script writes; created if missing.
output_dir = "model_outputs"
os.makedirs(output_dir, exist_ok=True)

# ==========================
# 2. Load Dataset
# ==========================
# Read the CSV and, in the same expression, discard rows that are missing
# either the 'text' or the 'language' column value.
df = pd.read_csv(
    "/kaggle/input/bangla-english-banglish-for-language-indentify/language_detection_dataset_cleaned.csv"
).dropna(subset=['text', 'language'])

# ==========================
# 3. Preprocess Text
# ==========================
# Matches every character that is neither a regex "word" character nor
# whitespace. Compiled once because the substitution runs for every row.
_NON_WORD_CHAR = re.compile(r'[^\w\s]')


def _keep_combining_mark(match):
    """Return the matched character if it is a combining mark, else ''."""
    char = match.group()
    # Unicode general categories Mn/Mc/Me all start with 'M'.
    return char if unicodedata.category(char).startswith('M') else ''


def preprocess_text(text):
    """Normalise one text sample: trim, lowercase and strip punctuation.

    The value is converted with ``str()``, surrounding whitespace is removed
    and the result is lowercased. Every character that is not a word
    character (``\\w``) or whitespace (``\\s``) is then dropped, except
    Unicode combining marks.

    Combining marks need the explicit exception because ``\\w`` follows
    ``str.isalnum()``, which is False for them. Without it, dependent vowel
    signs and the hasanta of Bangla (and other Indic scripts) are deleted
    and the words are corrupted, e.g. "বাংলা" would become "বল".

    Args:
        text: The raw sample; any object accepted by ``str()``.

    Returns:
        The cleaned string.
    """
    text = str(text).strip().lower()
    return _NON_WORD_CHAR.sub(_keep_combining_mark, text)

# Clean every entry of the text column with preprocess_text.
df['text'] = df['text'].map(preprocess_text)

# ==========================
# 4. Encode Labels
# ==========================
# Turn the language names into integer ids; `classes` keeps the original
# names so plots can show readable labels.
label_encoder = LabelEncoder()
language_column = df['language']
df['encoded_label'] = label_encoder.fit_transform(language_column)
classes = label_encoder.classes_

# ==========================
# 5. Train/Test Split (on raw text)
# ==========================
# Split BEFORE vectorizing so the TF-IDF vocabulary and IDF weights are
# learned from the training rows only; fitting the vectorizer on the full
# corpus would leak test-set statistics into the features. The split
# arguments (test_size, stratify, random_state) are unchanged.
y = df['encoded_label']
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, stratify=y, random_state=42
)

# ==========================
# 6. Vectorize Text
# ==========================
# Unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full-corpus matrix kept under its original name; it is produced with the
# train-fitted vectorizer.
X = vectorizer.transform(df['text'])

# ==========================
# 7. Train Random Forest
# ==========================
# 100 trees with a fixed seed; class_weight='balanced' is passed so class
# frequencies are compensated for by the estimator.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_train = time.perf_counter()
rf.fit(X_train, y_train)
end_train = time.perf_counter()
train_time = end_train - start_train  # seconds spent in fit()

# ==========================
# 8. Predictions
# ==========================
# The timed window covers both the hard predictions and the class
# probabilities. perf_counter() is used because it is monotonic and meant
# for interval timing, unlike time.time().
start_infer = time.perf_counter()
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)
end_infer = time.perf_counter()
inference_time_total = end_infer - start_infer
# Average seconds per test row over the window above.
inference_time_per_sample = inference_time_total / X_test.shape[0]

# ==========================
# 9. Metrics
# ==========================
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are evaluated in that order, first macro-averaged
# and then weighted.
precision_macro, recall_macro, f1_macro = (
    score(y_test, y_pred, average='macro')
    for score in (precision_score, recall_score, f1_score)
)
precision_weighted, recall_weighted, f1_weighted = (
    score(y_test, y_pred, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)

# Quality scores plus the timing figures, collected for the metrics report.
report_dict = dict(
    accuracy=accuracy,
    precision_macro=precision_macro,
    recall_macro=recall_macro,
    f1_macro=f1_macro,
    precision_weighted=precision_weighted,
    recall_weighted=recall_weighted,
    f1_weighted=f1_weighted,
    train_time_sec=train_time,
    total_inference_time_sec=inference_time_total,
    inference_time_per_sample_sec=inference_time_per_sample,
)

# Save metrics
# Write report_dict as indented JSON into the output folder. `json` comes
# from the top-of-file imports rather than a mid-script import, and the
# encoding is explicit so the file does not depend on the platform default.
metrics_file = os.path.join(output_dir, "metrics.json")
with open(metrics_file, "w", encoding="utf-8") as f:
    json.dump(report_dict, f, indent=4)
print(f"üíæ Metrics saved to {metrics_file}")

# ==========================
# 10. Save model, vectorizer & encoder
# ==========================
# Persist the three fitted objects, each under its own file name, in the
# same order: model, vectorizer, label encoder.
for artifact, filename in (
    (rf, "random_forest_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
    (label_encoder, "label_encoder.pkl"),
):
    joblib.dump(artifact, os.path.join(output_dir, filename))
print(f"üíæ Model, Vectorizer & Label Encoder saved to {output_dir}")

# ==========================
# 11. Save confusion matrix plot
# ==========================
# Pin the label order to every encoded class id (0 .. len(classes) - 1, as
# produced by the label encoder) so the matrix always has one row/column per
# entry of `classes`. Without `labels`, a class absent from both y_test and
# y_pred would be dropped and the tick labels would no longer line up.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(classes)))
plt.figure(figsize=(6, 5))
# Integer counts in each cell; axes are labelled with the class names.
sns.heatmap(cm, annot=True, fmt='d', xticklabels=classes, yticklabels=classes, cmap="Blues")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
plt.close()  # release the figure once it is on disk
print(f"üíæ Confusion matrix plot saved to {output_dir}")

# ==========================
# 12. Save feature importance plot
# ==========================
importances = rf.feature_importances_
# argsort is ascending, so the last 20 positions are the 20 largest values.
indices = np.argsort(importances)[-20:]
features = np.array(vectorizer.get_feature_names_out())[indices]

# Horizontal bar chart of the selected features and their importances.
# NOTE(review): the captured notebook output attributes repeated warnings to
# the savefig call below - presumably glyphs missing from the plotting font
# for non-Latin feature names; confirm and configure a suitable font.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, importances[indices])
ax.set_title("Top 20 TF-IDF Features")
fig.savefig(os.path.join(output_dir, "feature_importance.png"))
plt.close(fig)
print(f"üíæ Feature importance plot saved to {output_dir}")


üíæ Metrics saved to model_outputs/metrics.json
üíæ Model, Vectorizer & Label Encoder saved to model_outputs
üíæ Confusion matrix plot saved to model_outputs
üíæ Feature importance plot saved to model_outputs


  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
  plt.savefig(os.path.join(output_dir, "feature_importance.png"))
