In [None]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from textblob import TextBlob
import csv

In [None]:
# Load the combined dataset: Bookf.csv is the mix of the YouTube and Reddit datasets.
df=pd.read_csv("Bookf.csv")

In [None]:
def count_words(text):
    """Return how many word tokens ``text`` contains.

    A word token is a maximal run of ``\\w`` characters (letters, digits and
    underscore), so punctuation and whitespace act as separators.

    Args:
        text: String to inspect.

    Returns:
        The number of ``\\w+`` matches found in ``text``.
    """
    return sum(1 for _match in re.finditer(r'\w+', text))
# Add a column holding the word-token count of each raw text entry.
df['word_count'] = df['text'].apply(count_words)

In [None]:
# Characters treated as punctuation (ASCII punctuation from the string module).
punctuation = string.punctuation
# For every text, count the characters that belong to the punctuation set.
df['punctuation_count'] = df['text'].apply(
    lambda entry: sum(1 for char in entry if char in punctuation)
)

In [None]:
# Remove the 'length' column from the frame. errors='ignore' makes the cell
# safe to re-run: without it a second execution raises KeyError because the
# column has already been dropped.
df = df.drop(columns='length', errors='ignore')

In [None]:
def get_text_lengths(text_column):
    """Compute ``len()`` of every entry in a pandas Series.

    Args:
        text_column: Series whose entries support ``len()``.

    Returns:
        A Series, aligned with ``text_column``, holding each entry's length.
    """
    lengths = text_column.apply(len)
    return lengths

# Store the len() of each raw text entry in a new column.
df['text_length'] = get_text_lengths(df['text'])

In [None]:
import nltk
from nltk.corpus import stopwords
import textblob
from textblob import Word
# Fetch the NLTK resources requested by this notebook, in the same order.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace and each token is compared
    case-insensitively against NLTK's English stop-word list. Surviving
    tokens keep their original casing and are re-joined with single spaces.

    Args:
        text: String to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the stop-word set once and reuse it on later calls; rebuilding it
    # for every row is wasted work when this function is used with .apply().
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return " ".join(word for word in text.split() if word.lower() not in stop_words)
def without_leading_trailing_whitespace(text):
    """Return ``text`` with whitespace removed from both ends."""
    trimmed = text.strip()
    return trimmed
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def remove_special_characters(text):
    """Replace every non-alphanumeric ASCII character with a space.

    Args:
        text: String to clean.

    Returns:
        ``text`` where each character outside ``a-z``, ``A-Z`` and ``0-9``
        has been replaced by a single space (the length is unchanged).
    """
    # The upper-case range must end at 'Z'. A range written as 'A-z' also
    # spans the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), which
    # would then be kept instead of being replaced.
    pat = r'[^a-zA-Z0-9]'
    return re.sub(pat, ' ', text)
def special_text(text):
    """Remove sentences that begin with ``>`` from ``text``.

    The text is split wherever a period is followed by a whitespace
    character; pieces starting with ``>`` are discarded and the rest are
    joined back together with ``". "``.

    Args:
        text: String to filter.

    Returns:
        The text rebuilt from the pieces that were kept.
    """
    kept = []
    for piece in re.split(r'\.\s', text):
        if piece.startswith(">"):
            continue
        kept.append(piece)
    return ". ".join(kept)
def no_http_links(text):
    """Replace every ``http...`` link in ``text`` with a single space.

    A link is the literal ``http`` followed by one or more non-whitespace
    characters.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each link replaced by one space.
    """
    return re.sub(r'http\S+', " ", text)
def no_multi_punctuation(text):
    """Collapse runs of ``!``, ``?`` and ``.`` into a single character.

    The three substitutions are applied in that order.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each run of the same mark reduced to one occurrence.
    """
    collapse_rules = (
        (r"\!+", "!"),
        (r"\?+", "?"),
        (r"\.+", "."),
    )
    for run_pattern, single_mark in collapse_rules:
        text = re.sub(run_pattern, single_mark, text)
    return text
def no_hash(text):
    """Replace every ``#`` character in ``text`` with a space.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each hash sign replaced by one space.
    """
    # Match the hash sign only. Inside a character class a '+' is a literal
    # plus sign rather than a quantifier, so a class such as [#+] would also
    # blank out every '+' in the text.
    return re.sub(r'#', " ", text)
def no_number(text):
    """Delete all runs of ASCII digits from ``text``.

    The argument is first converted with ``str()``, so non-string values
    are accepted.

    Args:
        text: Value to clean.

    Returns:
        The string form of ``text`` with every digit run removed.
    """
    as_string = str(text)
    return re.sub('([0-9]+)', '', as_string)
def lem(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is wrapped in a textblob ``Word`` and replaced by the result
    of its ``lemmatize()`` call.

    Args:
        text: String to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(Word(token).lemmatize() for token in text.split())




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess(text):
    """Run the text-cleaning pipeline on a single string.

    Steps, in order: remove links, replace hash signs, trim surrounding
    whitespace, lower-case, collapse repeated punctuation, replace special
    characters, remove digits, drop stop words, lemmatize.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text produced by the final lemmatization step.
    """
    # Links are removed before hash signs are replaced. Replacing '#' first
    # inserts a space inside URLs such as "http://site/#section", so the link
    # pattern (which stops at whitespace) would leave "section" behind.
    text = no_http_links(text)
    text = no_hash(text)
    text = without_leading_trailing_whitespace(text)
    text = lowercase(text)
    text = no_multi_punctuation(text)
    text = remove_special_characters(text)
    text = no_number(text)
    text = remove_stopwords(text)
    text = lem(text)
    return text

In [None]:
# Apply the cleaning pipeline to every raw text; the result goes in a new column so 'text' stays untouched.
df['cleaned_text']=df['text'].apply(preprocess)

In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Features (X) are the cleaned texts; the target (y) is the 'supportive' column.
X = df['cleaned_text']
y = df['supportive']

# Two-stage split: 70% training, then the remaining 30% is halved into
# validation (15%) and testing (15%). random_state is fixed for repeatability.
# NOTE(review): the splits are not stratified, and the recorded reports show
# about 3.4x more class-0 than class-1 samples; consider stratify=y.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert text to TF-IDF features (at most 1000 of them). The vectorizer is
# fitted on the training texts only; validation and test texts are merely
# transformed with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize a linear-kernel SVM classifier with regularization parameter C=1.0
svm_classifier = SVC(kernel='linear', C=1.0)

# Fit the model on the training set
svm_classifier.fit(X_train_tfidf, y_train)


# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val_tfidf)

# Evaluate on the validation set: per-class precision/recall/F1 and the confusion matrix
val_report = classification_report(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

print("Validation Set - Classification Report:")
print(val_report)

print("\nValidation Set - Confusion Matrix:")
print(val_conf_matrix)

# Make predictions on the held-out test set
y_test_pred = svm_classifier.predict(X_test_tfidf)

# Evaluate on the test set with the same metrics as for validation
test_report = classification_report(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

print("\nTest Set - Classification Report:")
print(test_report)

print("\nTest Set - Confusion Matrix:")
print(test_conf_matrix)


Validation Set - Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.94      0.88       670
           1       0.63      0.34      0.44       196

    accuracy                           0.81       866
   macro avg       0.73      0.64      0.66       866
weighted avg       0.79      0.81      0.78       866


Validation Set - Confusion Matrix:
[[631  39]
 [129  67]]

Test Set - Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.94      0.88       671
           1       0.59      0.30      0.39       196

    accuracy                           0.79       867
   macro avg       0.71      0.62      0.64       867
weighted avg       0.77      0.79      0.77       867


Test Set - Confusion Matrix:
[[631  40]
 [138  58]]
