In [None]:
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from textblob import TextBlob
import csv

In [None]:
# Load the dataset; later cells read its 'text' and 'supportive' columns.
df = pd.read_csv("Bookf.csv")

In [None]:
def count_words(text):
    """Return the number of word-character runs (regex word class) in ``text``.

    Each maximal run of letters, digits or underscores counts as one word,
    so punctuation and whitespace act as separators.
    """
    return sum(1 for _match in re.finditer(r'\w+', text))
# Word count of every raw text entry
df['word_count'] = df['text'].map(count_words)

In [None]:
# ASCII punctuation characters from the standard library
punctuation = string.punctuation
# Per row, tally the characters of the text that are punctuation
df['punctuation_count'] = df['text'].apply(
    lambda x: sum(1 for ch in x if ch in punctuation)
)

In [None]:
# Drop the 'length' column. errors='ignore' keeps this cell idempotent:
# re-running it once the column is already gone is a no-op, not a KeyError.
df = df.drop(columns='length', errors='ignore')

In [None]:
def get_text_lengths(text_column):
    """Apply ``len`` to every entry of ``text_column`` and return the result.

    The notebook calls this with ``df['text']``, giving the character count
    of each text.
    """
    lengths = text_column.apply(len)
    return lengths

# Character length of every raw text entry
df['text_length'] = get_text_lengths(text_column=df['text'])

In [None]:
import nltk
from nltk.corpus import stopwords
import textblob
from textblob import Word
# Fetch the NLTK data packages this notebook requests, in the original order
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

_ENGLISH_STOP_WORDS = None  # filled on first use by remove_stopwords()


def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split on whitespace; a token is discarded when its lowercase
    form appears in ``stopwords.words('english')``. Surviving tokens keep
    their original casing and are re-joined with single spaces.

    The stopword set is built on the first call and reused afterwards: this
    function is applied once per row, and reloading the word list every time
    is wasted work.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    kept = [word for word in text.split() if word.lower() not in _ENGLISH_STOP_WORDS]
    return " ".join(kept)
def without_leading_trailing_whitespace(text):
    """Return ``text`` with whitespace trimmed from both ends."""
    trimmed = text.strip()
    return trimmed
def lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter or digit with a space.

    Each offending character becomes exactly one space, so the length of the
    string is preserved.
    """
    # The upper-case range must end at 'Z'. The earlier 'A-z' range also
    # spanned the characters between 'Z' and 'a' ([ \ ] ^ _ `), which were
    # therefore wrongly kept.
    pat = r'[^a-zA-Z0-9]'
    return re.sub(pat, ' ', text)
def special_text(text):
    """Remove sentences that begin with ``>`` from ``text``.

    The text is split wherever a period is followed by one whitespace
    character; pieces starting with ``>`` are dropped and the rest are
    joined back together with ``". "``.
    """
    kept = []
    for sentence in re.split(r'\.\s', text):
        if sentence.startswith(">"):
            continue
        kept.append(sentence)
    return ". ".join(kept)
def no_http_links(text):
    """Replace every ``http...`` token (up to the next whitespace) with a space."""
    return re.sub(r'http\S+', " ", text)
def no_multi_punctuation(text):
    """Collapse runs of ``!``, ``?`` and ``.`` down to a single character each."""
    # (pattern, replacement) pairs, applied in this order
    collapse_rules = (
        (r"\!+", "!"),
        (r"\?+", "?"),
        (r"\.+", "."),
    )
    for pattern, single in collapse_rules:
        text = re.sub(pattern, single, text)
    return text
def no_hash(text):
    """Replace each run of ``#`` characters in ``text`` with a single space.

    All other characters, including ``+``, are left untouched.
    """
    # The earlier pattern was the character class [\#+]; inside a class '+'
    # is a literal, not a quantifier, so plus signs were blanked out as well.
    return re.sub(r'#+', " ", text)
def no_number(text):
    """Return ``str(text)`` with every run of ASCII digits removed."""
    as_string = str(text)
    return re.sub('([0-9]+)', '', as_string)
def lem(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is wrapped in ``Word`` and its ``lemmatize()`` result is
    used; the results are joined with single spaces.
    """
    return " ".join(Word(token).lemmatize() for token in text.split())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocess(text):
    """Run the full cleaning pipeline on one text and return the result.

    The steps run in a fixed order: strip hashes, strip http links, trim,
    lower-case, collapse repeated punctuation, blank out special characters,
    remove digits, remove stopwords, lemmatize.
    """
    pipeline = (
        no_hash,
        no_http_links,
        without_leading_trailing_whitespace,
        lowercase,
        no_multi_punctuation,
        remove_special_characters,
        no_number,
        remove_stopwords,
        lem,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Clean every raw text with the preprocessing pipeline
df['cleaned_text'] = df['text'].map(preprocess)

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.8 MB/s[0m eta [36m0:00:0

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm


# Seed PyTorch's global RNG before anything random happens: the classifier
# head is newly initialized (see the load warning below) and the training
# loader shuffles. 42 matches the random_state used for the data split.
torch.manual_seed(42)

# Initialize the RoBERTa model and tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Prepare the data: cleaned texts are the inputs, the 'supportive' column the labels
X = df['cleaned_text'].to_list()
y = df['supportive'].to_list()

# Tokenize all texts in one call (padding and truncation on, PyTorch tensors out)
encoded_data = tokenizer(
    X,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
attention_masks = encoded_data['attention_mask']

# Split the data into train, validation, and test sets.
# First call: 70% train / 30% held out. Second call: the held-out part is
# halved into validation and test (15% / 15% of the total).
# Input ids, labels and attention masks are passed together so each split keeps
# them row-aligned; the outputs come back as (train, test) pairs in input order.
# NOTE(review): no `stratify=` is given, so class proportions may differ
# between splits — confirm whether that is intended.
X_train, X_temp, y_train, y_temp, train_mask, temp_mask = train_test_split(
    encoded_data['input_ids'], y, attention_masks, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test, val_mask, test_mask = train_test_split(
    X_temp, y_temp, temp_mask, test_size=0.5, random_state=42)

# Create datasets and data loaders for train, validation, and test sets
BATCH_SIZE = 32  # shared by all three loaders

# Each dataset item is (input ids, attention mask, label)
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)

train_dataset = torch.utils.data.TensorDataset(X_train, train_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(X_val, val_mask, val_labels)
test_dataset = torch.utils.data.TensorDataset(X_test, test_mask, test_labels)

# Only the training loader shuffles
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define optimizer and loss criterion.
# torch.optim.AdamW replaces the deprecated AdamW class from transformers.
# eps and weight_decay are set explicitly to that class's defaults (1e-6, 0.0)
# so the update rule is unchanged; PyTorch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()
epochs = 5

# Training loop: `epochs` passes over train_loader, one optimizer step per batch
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(
        train_loader,
        desc=f'Epoch {epoch + 1}/{epochs}',
        leave=False,
    )
    for inputs, mask, labels in progress_bar:
        optimizer.zero_grad()
        # No labels are passed to the model; the loss is computed explicitly below
        outputs = model(input_ids=inputs, attention_mask=mask).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Show the current batch loss on the progress bar
        progress_bar.set_postfix(Loss=loss.item())






Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Evaluation on the validation set
model.eval()

# Collect the predicted class of every validation example, without gradients
all_preds = []
with torch.no_grad():
    for inputs, mask, labels in val_loader:
        outputs = model(inputs, attention_mask=mask).logits
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.tolist())

# val_loader does not shuffle, so predictions line up with y_val
val_accuracy = accuracy_score(y_val, all_preds)
val_precision = precision_score(y_val, all_preds)
val_recall = recall_score(y_val, all_preds)
val_f1 = f1_score(y_val, all_preds)
val_confusion = confusion_matrix(y_val, all_preds)

print('Validation Metrics:')
print(f'Validation Accuracy: {val_accuracy:.2f}')
print(f'Validation Precision: {val_precision:.2f}')
print(f'Validation Recall: {val_recall:.2f}')
print(f'Validation F1 Score: {val_f1:.2f}')
print('Confusion Matrix (Validation):')
print(val_confusion)

Validation Metrics:
Validation Accuracy: 0.82
Validation Precision: 0.61
Validation Recall: 0.52
Validation F1 Score: 0.56
Confusion Matrix (Validation):
[[606  64]
 [ 95 101]]


In [None]:
# Evaluation on the test set
model.eval()

# Collect the predicted class of every test example, without gradients
all_preds = []
with torch.no_grad():
    for inputs, mask, labels in test_loader:
        outputs = model(inputs, attention_mask=mask).logits
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.tolist())

# test_loader does not shuffle, so predictions line up with y_test
test_accuracy = accuracy_score(y_test, all_preds)
test_precision = precision_score(y_test, all_preds)
test_recall = recall_score(y_test, all_preds)
test_f1 = f1_score(y_test, all_preds)
test_confusion = confusion_matrix(y_test, all_preds)

print('Test Metrics:')
print(f'Test Accuracy: {test_accuracy:.2f}')
print(f'Test Precision: {test_precision:.2f}')
print(f'Test Recall: {test_recall:.2f}')
print(f'Test F1 Score: {test_f1:.2f}')
print('Confusion Matrix (Test):')
print(test_confusion)

Test Metrics:
Test Accuracy: 0.81
Test Precision: 0.60
Test Recall: 0.45
Test F1 Score: 0.51
Confusion Matrix (Test):
[[612  59]
 [108  88]]
