In [None]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from textblob import TextBlob
import csv

In [None]:
# Load the working DataFrame. Later cells read its 'text', 'length' and
# 'supportive' columns, so the CSV is expected to provide them.
df=pd.read_csv("Bookf.csv") # Bookf is the mixed dataset resulting from YouTube and Reddit datasets

In [None]:
def count_words(text):
    """Return the number of word tokens in `text`.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace act as separators (e.g. "it's" counts as two tokens).
    """
    return sum(1 for _ in re.finditer(r'\w+', text))
# Per-row token count of the raw 'text' column, stored as 'word_count'.
# Assumes every 'text' value is a string (count_words passes it to re).
df['word_count'] = df['text'].apply(count_words)

In [None]:
# Characters treated as punctuation: the ASCII set from the string module.
punctuation = string.punctuation
# For each raw text, count how many of its characters belong to that set.
df['punctuation_count'] = df['text'].apply(
    lambda x: sum(1 for char in x if char in punctuation)
)

In [None]:
# Remove the 'length' column from the working frame.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns='length', errors='ignore')

In [None]:
def get_text_lengths(text_column):
    """Return a Series holding `len(value)` for every entry of `text_column`.

    The result keeps the index of `text_column`.
    """
    lengths = text_column.apply(len)
    return lengths

# Apply the function to the raw text column: 'text_length' holds len() of
# each 'text' value (characters, for string entries).
df['text_length'] = get_text_lengths(df['text'])

In [None]:
import nltk
from nltk.corpus import stopwords
import textblob
from textblob import Word
# Fetch the NLTK resources requested by this notebook, in the original order.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Stop-word sets keyed by language, filled lazily by _stop_words_for().
_STOP_WORD_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word list for `language` as a frozenset, loading it once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is split on whitespace; a token is removed when its lowercase
    form is in NLTK's English stop-word list. Remaining tokens keep their
    original casing and are re-joined with single spaces.

    The stop-word set is cached after the first call instead of being
    re-read from the corpus for every row the function is applied to.
    """
    stop_words = _stop_words_for('english')
    return " ".join(word for word in text.split() if word.lower() not in stop_words)
def without_leading_trailing_whitespace(text):
  """Return `text` with whitespace removed from both ends."""
  trimmed = text.lstrip().rstrip()
  return trimmed
def lowercase(text):
  """Return a lowercased copy of `text`."""
  lowered = text.lower()
  return lowered
def remove_special_characters(text):
  """Replace every character that is not an ASCII letter or digit with a space.

  Each offending character becomes exactly one space, so the length of the
  returned string equals the length of `text`.
  """
  # The upper bound must be 'Z'. The previous range 'A-z' also spans the ASCII
  # characters between 'Z' and 'a' ([ \ ] ^ _ `), which were therefore kept.
  pat = r'[^a-zA-Z0-9]'
  return re.sub(pat, ' ', text)
def special_text(text):
  """Remove sentences that begin with '>' from `text`.

  The text is split wherever a period is followed by a whitespace character;
  fragments starting with '>' are discarded and the rest are re-joined
  with ". ".
  """
  kept = []
  for fragment in re.split(r'\.\s', text):
    if fragment.startswith(">"):
      continue
    kept.append(fragment)
  return ". ".join(kept)
def no_http_links(text):
  """Replace each 'http' plus the non-whitespace characters after it with one space."""
  return re.compile(r'http\S+').sub(" ", text)
def no_multi_punctuation(text):
  """Collapse each run of '!', of '?' or of '.' into a single such mark.

  The three marks are handled one after another, in that order; mixed runs
  such as "!?" are left untouched.
  """
  collapse_rules = (
    (r"\!+", "!"),
    (r"\?+", "?"),
    (r"\.+", "."),
  )
  for run_regex, single_mark in collapse_rules:
    text = re.sub(run_regex, single_mark, text)
  return text
def no_hash(text):
  """Replace every '#' character in `text` with a single space.

  NOTE(review): the pattern is a character class, so each '+' is replaced
  as well, and runs such as '##' become one space per character. Confirm
  whether removing '+' is intended.
  """
  hash_or_plus = re.compile(r'[\#+]')
  return hash_or_plus.sub(" ", text)
def no_number(text):
  """Return `str(text)` with every run of ASCII digits removed."""
  as_string = str(text)
  return re.sub('([0-9]+)', '', as_string)
def lem(text):
  """Lemmatize `text` token by token.

  The text is split on whitespace, each token is passed through
  `Word(token).lemmatize()` with its default arguments, and the results are
  re-joined with single spaces.
  """
  return " ".join(Word(token).lemmatize() for token in text.split())




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess(text):
  """Run `text` through the full cleaning pipeline and return the result.

  Steps are applied in the order listed below; each receives the output of
  the previous one.
  """
  pipeline = (
    no_hash,
    no_http_links,
    without_leading_trailing_whitespace,
    lowercase,
    no_multi_punctuation,
    remove_special_characters,
    no_number,
    remove_stopwords,
    lem,
  )
  for step in pipeline:
    text = step(text)
  return text

In [None]:
# Run the cleaning pipeline on every raw text; the result is kept in a separate
# 'cleaned_text' column so the original 'text' column stays untouched.
df['cleaned_text']=df['text'].apply(preprocess)

In [None]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from transformers import RobertaTokenizer, RobertaModel


# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Split data into features (X) and target (y)
X = df['cleaned_text']
y = df['supportive']

# Split data into training (70%), validation (15%), and testing (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Every text is padded/truncated to exactly this many tokens.
MAX_LENGTH = 128
# Number of rows sent through RoBERTa per forward pass.
EMBED_BATCH_SIZE = 32


def _encode_texts(texts):
    """Tokenize a Series of strings into PyTorch tensors of width MAX_LENGTH.

    padding='max_length' (rather than padding=True, which pads only to the
    longest sequence of the given call) guarantees the same sequence length for
    the train, validation and test splits. The flattened embeddings below must
    have the same number of features in every split, otherwise the SVM cannot
    predict on validation/test data.
    """
    return tokenizer(
        texts.tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        max_length=MAX_LENGTH,
    )


def _embed_in_batches(encoded, batch_size=EMBED_BATCH_SIZE):
    """Return `model`'s last_hidden_state for `encoded`, computed `batch_size` rows at a time.

    Processing slices instead of the whole split in one forward pass bounds the
    peak memory needed for the model's intermediate activations.
    """
    n_rows = encoded['input_ids'].size(0)
    hidden_states = []
    with torch.no_grad():  # inference only, no gradients needed
        for start in range(0, n_rows, batch_size):
            batch = {name: tensor[start:start + batch_size] for name, tensor in encoded.items()}
            hidden_states.append(model(**batch).last_hidden_state)
    return torch.cat(hidden_states, dim=0)


# Tokenize and encode text data using RoBERTa tokenizer
X_train_encoded = _encode_texts(X_train)
X_val_encoded = _encode_texts(X_val)
X_test_encoded = _encode_texts(X_test)

# Pass the encoded data through RoBERTa model to obtain embeddings
X_train_embeddings = _embed_in_batches(X_train_encoded)
X_val_embeddings = _embed_in_batches(X_val_encoded)
X_test_embeddings = _embed_in_batches(X_test_encoded)

# Flatten the embeddings: one row per text, all token vectors concatenated
# (padding positions included)
X_train_flattened = X_train_embeddings.view(X_train_embeddings.size(0), -1).numpy()
X_val_flattened = X_val_embeddings.view(X_val_embeddings.size(0), -1).numpy()
X_test_flattened = X_test_embeddings.view(X_test_embeddings.size(0), -1).numpy()

# Initialize SVM classifier with a linear kernel
svm_classifier = SVC(kernel='linear', C=1.0)

# Fit the model on the training set
svm_classifier.fit(X_train_flattened, y_train)

# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val_flattened)

# Evaluate the model on the validation set
val_report = classification_report(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

print("Validation Set - Classification Report:")
print(val_report)

print("\nValidation Set - Confusion Matrix:")
print(val_conf_matrix)

# Make predictions on the test set
y_test_pred = svm_classifier.predict(X_test_flattened)

# Evaluate the model on the test set
test_report = classification_report(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

print("\nTest Set - Classification Report:")
print(test_report)

print("\nTest Set - Confusion Matrix:")
print(test_conf_matrix)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Set - Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       670
           1       0.38      0.44      0.41       196

    accuracy                           0.71       866
   macro avg       0.60      0.61      0.61       866
weighted avg       0.73      0.71      0.72       866


Validation Set - Confusion Matrix:
[[530 140]
 [110  86]]

Test Set - Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       671
           1       0.40      0.41      0.41       196

    accuracy                           0.73       867
   macro avg       0.61      0.62      0.61       867
weighted avg       0.73      0.73      0.73       867


Test Set - Confusion Matrix:
[[549 122]
 [115  81]]
