<a href="https://colab.research.google.com/github/samservo09/thesis-svm-tele-triage/blob/main/%5B500_dataset%5D_SVM_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

# Download necessary NLTK packages (ensure this only runs once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Downloading the dataset
!wget https://raw.githubusercontent.com/samservo09/thesis-svm-tele-triage/refs/heads/main/data/500_Reddit_users_posts_labels.csv

# Load the data
Corpus = pd.read_csv("500_Reddit_users_posts_labels.csv", encoding='latin-1')
Corpus.columns = Corpus.columns.str.lower()  # rename columns to lowercase

# Splitting features (post) and labels (label)
X = Corpus['post']
y = Corpus['label']

# Label Encoding
label_mapping = {'Behavior': 0, 'Supportive': 1, 'Indicator': 2, 'Attempt': 3, 'Ideation': 4}
y = y.map(label_mapping)

# Cleaning the Text Data
cleanedData = []
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()
swords = stopwords.words("english")

for text in X:
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub("[^a-zA-Z0-9]", " ", text)  # Remove non-alphanumeric characters
    text = nltk.word_tokenize(text.lower())  # Tokenize and lowercase
    text = [lemma.lemmatize(word) for word in text]  # Lemmatize words
    text = [stemmer.stem(word) for word in text]  # Stem words
    text = [word for word in text if word not in swords]  # Remove stopwords
    cleanedData.append(" ".join(text))

# Vectorizing the Text Data using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
BOW = vectorizer.fit_transform(cleanedData)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(BOW, y, test_size=0.2, random_state=42)

# Handling Class Imbalance using Class Weights
model = SVC(class_weight='balanced')

# Hyperparameter Tuning using Grid Search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}
grid = GridSearchCV(model, param_grid, refit=True, verbose=2, cv=3)

# Train the SVM Model
start_time = time.time()
grid.fit(x_train, y_train)
end_time = time.time()

process_time = round(end_time - start_time, 2)
print("Fitting GridSearchCV took {} seconds".format(process_time))

# Best Estimator
best_model = grid.best_estimator_
print("Best Estimator Parameters:", grid.best_params_)

# Predict the labels for the test set
predictions = best_model.predict(x_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Improved Accuracy of the model is {}%".format(accuracy * 100))

# Example usage of the optimized model
sample_text = ["This is a test post about feeling stressed and needing support."]
sample_text_cleaned = []

for text in sample_text:
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub("[^a-zA-Z0-9]", " ", text)  # Remove non-alphanumeric characters
    text = nltk.word_tokenize(text.lower())  # Tokenize and lowercase
    text = [lemma.lemmatize(word) for word in text]  # Lemmatize words
    text = [stemmer.stem(word) for word in text]  # Stem words
    text = [word for word in text if word not in swords]  # Remove stopwords
    sample_text_cleaned.append(" ".join(text))

sample_text_vectorized = vectorizer.transform(sample_text_cleaned)
sample_prediction = best_model.predict(sample_text_vectorized)
label_mapping_reverse = {v: k for k, v in label_mapping.items()}
predicted_label = label_mapping_reverse[sample_prediction[0]]
print(f"Predicted label for sample text: {predicted_label}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


--2024-11-20 14:23:33--  https://raw.githubusercontent.com/samservo09/thesis-svm-tele-triage/refs/heads/main/data/500_Reddit_users_posts_labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3622335 (3.5M) [text/plain]
Saving to: ‘500_Reddit_users_posts_labels.csv’


2024-11-20 14:23:33 (74.2 MB/s) - ‘500_Reddit_users_posts_labels.csv’ saved [3622335/3622335]

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.5s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.6s
[CV] END .....

In [2]:
# classification report
from sklearn.metrics import classification_report

# Assuming 'predictions' and 'y_test' are defined from your previous code
report = classification_report(y_test, predictions, target_names=label_mapping.keys())
print(report)

              precision    recall  f1-score   support

    Behavior       0.25      0.13      0.17        15
  Supportive       0.55      0.43      0.48        28
   Indicator       0.35      0.32      0.33        19
     Attempt       0.00      0.00      0.00        13
    Ideation       0.26      0.56      0.36        25

    accuracy                           0.34       100
   macro avg       0.28      0.29      0.27       100
weighted avg       0.32      0.34      0.31       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
