In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Read the training and validation splits from their Excel workbooks
train_data, val_data = (
    pd.read_excel(workbook_path)
    for workbook_path in ("Dataset/training_dataset.xlsx", "Dataset/validation_dataset.xlsx")
)

# Filler and speaker-tag words to drop in addition to the standard stop words
non_content = [
    'um', 'uh', 'mhm', 'mm', 'oh',
    'customer', 'agent', 'says',
]

# Function for removing non-content words (ncw) and redaction masks from text
def remove_ncw_trim(raw_text, additional_noncontent_words):
    """Return ``raw_text`` with stop words, filler words and masks removed.

    Processing steps, in order:

    1. Periods are replaced by spaces so words joined by '.' are split apart.
    2. Every character that is not an ASCII letter or whitespace is deleted
       (digits, punctuation and '*' masks all disappear here).
    3. The text is split on whitespace and a token is dropped when its
       lower-cased form is a stop word, or when it consists solely of the
       letter 'x' (what remains of a mask such as "xxxx" or "xxx-1234").

    Parameters
    ----------
    raw_text : str
        Text to clean. Any non-string value (for example None or a float
        NaN standing in for a missing value) is treated as empty text.
    additional_noncontent_words : iterable of str
        Extra words to drop on top of NLTK's English stop words. Tokens are
        lower-cased before comparison, so entries should be lower case.

    Returns
    -------
    str
        The surviving words, in their original casing, joined by single
        spaces. Empty string when nothing survives.
    """
    # Missing values carry no text; returning '' keeps downstream
    # vectorizers working instead of raising AttributeError on .replace().
    if not isinstance(raw_text, str):
        return ''

    # English stop words (from NLTK) extended with the custom non-content words
    stop_words = set(stopwords.words('english'))
    stop_words.update(additional_noncontent_words)

    # Replace periods with spaces so "end.Start" does not fuse into one token
    raw_text = raw_text.replace('.', ' ')

    # Keep only ASCII letters and whitespace
    raw_text = re.sub(r'[^A-Za-z\s]', '', raw_text)

    # Drop stop words and mask tokens. Masks are matched as WHOLE tokens so
    # that ordinary words containing an 'x' ("next", "tax") are left intact.
    filtered_words = [
        word
        for word in raw_text.split()
        if word.lower() not in stop_words and not re.fullmatch(r'x+', word)
    ]

    # Join words back to string
    return ' '.join(filtered_words)


# Clean the raw text of both splits with the same non-content word list
train_data['cleaned_text'] = train_data['text'].apply(remove_ncw_trim, args=(non_content,))
val_data['cleaned_text'] = val_data['text'].apply(remove_ncw_trim, args=(non_content,))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Learn the TF-IDF vocabulary and weights on the training text only,
# then project the validation text into that same feature space
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['cleaned_text'])
X_validation_tfidf = tfidf_vectorizer.transform(val_data['cleaned_text'])

# Fit a Logistic Regression classifier on the original labels
classifier = LogisticRegression().fit(X_train_tfidf, train_data['label'])

# Label predictions for every validation row
val_predictions = classifier.predict(X_validation_tfidf)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Calculate Precision, Recall, and F1 Score for the Logistic Regression predictions
precision = precision_score(val_data['label'], val_predictions, average='weighted')
recall = recall_score(val_data['label'], val_predictions, average='weighted')
f1_weighted = f1_score(val_data['label'], val_predictions, average='weighted')
f1_micro = f1_score(val_data['label'], val_predictions, average='micro')

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Weighted F1 Score: {f1_weighted:.4f}, Micro F1 Score: {f1_micro:.4f}')

# Confusion Matrix
# Pin the row/column order to the classes the model was fitted on and show
# those class names on the axes; otherwise the plot is labelled 0..n-1 and
# the order depends on which labels happen to occur in the validation data.
cm = confusion_matrix(val_data['label'], val_predictions, labels=classifier.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=90)  # keep long class names from overlapping
plt.show()

In [None]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label <-> integer mapping from the training labels
le = LabelEncoder().fit(train_data['label'])

# Store the integer-encoded labels next to the original ones in both splits
train_data['numeric_label'] = le.transform(train_data['label'])
val_data['numeric_label'] = le.transform(val_data['label'])

# Fit XGBoost on the numeric labels and predict the validation set
xgb_classifier = XGBClassifier().fit(X_train_tfidf, train_data['numeric_label'])
xgb_predictions = xgb_classifier.predict(X_validation_tfidf)

# Map the predicted integers back to the original string labels
xgb_predictions_labels = le.inverse_transform(xgb_predictions)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predictions from the XGBoost classifier
xgb_predictions = xgb_classifier.predict(X_validation_tfidf)

# Convert the predictions back to the original label format
xgb_predictions_labels = le.inverse_transform(xgb_predictions)

# Calculate various performance metrics
# `accuracy` must be computed here: it is printed below and no other cell defines it.
accuracy = accuracy_score(val_data['label'], xgb_predictions_labels)
precision = precision_score(val_data['label'], xgb_predictions_labels, average='weighted')
recall = recall_score(val_data['label'], xgb_predictions_labels, average='weighted')
f1_weighted = f1_score(val_data['label'], xgb_predictions_labels, average='weighted')
f1_micro = f1_score(val_data['label'], xgb_predictions_labels, average='micro')

# Print the metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Weighted F1 Score: {f1_weighted:.4f}')
print(f'Micro F1 Score: {f1_micro:.4f}')

# Confusion Matrix
# Build the matrix over exactly the encoder's classes so that its rows and
# columns line up with `display_labels` even when a class is neither present
# in the validation labels nor predicted.
cm = confusion_matrix(val_data['label'], xgb_predictions_labels, labels=le.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Fit a second Logistic Regression model, this time on the integer-encoded labels
lr_classifier_numeric = LogisticRegression().fit(X_train_tfidf, train_data['numeric_label'])

# Integer predictions for the validation set
val_predictions_numeric = lr_classifier_numeric.predict(X_validation_tfidf)

# Decode the integers back into the original string labels
val_predictions_from_numeric = le.inverse_transform(val_predictions_numeric)

In [None]:
# The comparison needs SVM predictions, but no cell above produces
# `svm_predictions_labels`; fit the SVM here (same numeric-label scheme as the
# XGBoost cell) so this cell runs on a fresh kernel.
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, train_data['numeric_label'])
svm_predictions = svm_classifier.predict(X_validation_tfidf)
svm_predictions_labels = le.inverse_transform(svm_predictions)

# Pair each classifier name with its validation predictions (string labels)
# so the name column and the metric columns cannot get out of step.
predictions_by_classifier = {
    'Logistic Regression': val_predictions,
    'SVM': svm_predictions_labels,
    'XGBoost': xgb_predictions_labels,
}

# List of predictions from each classifier (kept for any later use)
predictions_list = list(predictions_by_classifier.values())

# Initialize metrics dictionary: one list entry per classifier in each column
metrics = {
    'Classifier': list(predictions_by_classifier.keys()),
    'Precision': [],
    'Recall': [],
    'F1 Weighted': [],
    'F1 Micro': []
}

# Loop over each set of predictions and score it against the validation labels
for preds in predictions_list:
    metrics['Precision'].append(precision_score(val_data['label'], preds, average='weighted'))
    metrics['Recall'].append(recall_score(val_data['label'], preds, average='weighted'))
    metrics['F1 Weighted'].append(f1_score(val_data['label'], preds, average='weighted'))
    metrics['F1 Micro'].append(f1_score(val_data['label'], preds, average='micro'))

# Convert metrics to a DataFrame
metrics_df = pd.DataFrame(metrics)

# Display the DataFrame
print(metrics_df)
