In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
import pickle

# Load your data
dat = pd.read_csv("../input/balance-5000/balanced_5000_reviews.csv")

# Preprocess your DataFrame
dat = dat.drop(columns=['book_id', 'ratings_count', 'review_likes', 'like_share'])
dat["rating_diff"] = dat["user_rating"] - dat["avg_rating"]
dat = dat.drop(columns=['avg_rating'])
dat["quote"] = dat["review_text"].str.contains("\"")
dat["review_length"] = dat["review_text"].str.len()
# Drop rows with missing 'review_text'
dat = dat.dropna(subset=['review_text']).reset_index(drop=True)


# Initialize the tokenizer and model from the pre-trained BERT base uncased model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')  # Use BertModel
model.eval()  # Set the model to evaluation mode

# Function to get BERT embeddings using BertModel
def get_bert_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state as embeddings (alternative to pooler_output)
    embeddings = outputs.last_hidden_state[:, 0, :].squeeze()  # Get the embeddings of the [CLS] token
    return embeddings.numpy()  # Convert the tensor to a NumPy array

# Apply the function to the 'review_text' column to get embeddings
dat['bert_embeddings'] = dat['review_text'].apply(get_bert_embeddings)

# Tokenize, encode, and pad the reviews
max_sequence_length = 256  # Maximum sequence length
tokenized_reviews = [tokenizer.encode(review, add_special_tokens=True, max_length=max_sequence_length, truncation=True, padding='max_length') for review in dat['review_text']]

# Convert the tokenized reviews into tensors
input_ids = torch.tensor(tokenized_reviews)
attention_masks = torch.tensor([[float(i > 0) for i in seq] for seq in input_ids])

# Create a DataLoader
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

# Define a function to get sentiment predictions
def get_sentiment_predictions(model, dataloader):
    model.eval()  # Make sure the model is in evaluation mode
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks = batch
            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs[0]
            probabilities = torch.softmax(logits, dim=1)
            predictions.extend(probabilities[:, 1].tolist())  # Assuming index 1 corresponds to positive sentiment
    return predictions

# Get sentiment predictions
sentiment_predictions = get_sentiment_predictions(model, dataloader)

# Add the predictions to the DataFrame
dat['sentiment_probabilities'] = sentiment_predictions

# Print the first few rows to verify
# print(dat.head())

# dat.to_csv("filtered_csv_with_sentiment.csv", index=False)

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Ensure you've downloaded the necessary NLTK data
nltk.download('punkt')

# Function to lemmatize text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(lemmatized_words)


# Apply lemmatization to the review_text column
# dat['lemmatized_text'] = dat['review_text'].apply(lemmatize_text)

# Function to get BERT embeddings for lemmatized text
def get_bert_embeddings(lemmatized_text):
    inputs = tokenizer(lemmatized_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()  # Taking mean of all token embeddings
    return embeddings.numpy()  # Convert the tensor to a NumPy array

# Apply the function to the 'lemmatized_text' column to get embeddings
dat['bert_embeddings'] = dat['review_text'].apply(get_bert_embeddings)

# Convert the list of embeddings into a DataFrame where each column represents one dimension of the embeddings
embeddings_df = pd.DataFrame(dat['bert_embeddings'].tolist())

# Prepare the feature matrix with other features
X = dat[['user_reviews', 'user_rating', 'days_since_review', 'rating_diff', 'quote', 'review_length']].copy()
X = X.apply(pd.to_numeric, errors='coerce')  # Ensure all data is numeric
X.fillna(0, inplace=True)

# Concatenate the embeddings DataFrame with the other features
X = pd.concat([X, embeddings_df], axis=1)

# Assuming 'popular' is your target variable
y = dat['popular']

# Concatenate features and target into a single DataFrame
data_with_target = pd.concat([X, y], axis=1)

# Write the DataFrame to a CSV file
data_with_target.to_csv('embed_5000_not_l.csv', index=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_d

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
data_read = data_with_target

# Separate X and y
X_read = data_read.drop(columns=['popular']).values
y_read = data_read['popular'].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_read, y_read, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train[:, None], dtype=torch.float32)  # Reshape y to [n_samples, 1]
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test[:, None], dtype=torch.float32)

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the neural network architecture
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize the model and move it to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ',device)
model = SimpleNN(input_size=X_train_tensor.shape[1]).to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor.to(device)).cpu().numpy()
    y_pred = np.round(y_pred).flatten()

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC_AUC: {roc}')

device:  cpu
Epoch [1/50], Loss: 0.5646
Epoch [2/50], Loss: 0.5750
Epoch [3/50], Loss: 0.5727
Epoch [4/50], Loss: 0.5269
Epoch [5/50], Loss: 0.4666
Epoch [6/50], Loss: 0.2953
Epoch [7/50], Loss: 0.4204
Epoch [8/50], Loss: 0.2606
Epoch [9/50], Loss: 0.3823
Epoch [10/50], Loss: 0.2159
Epoch [11/50], Loss: 0.2173
Epoch [12/50], Loss: 0.1591
Epoch [13/50], Loss: 0.0912
Epoch [14/50], Loss: 0.1216
Epoch [15/50], Loss: 0.1520
Epoch [16/50], Loss: 0.0139
Epoch [17/50], Loss: 0.0152
Epoch [18/50], Loss: 0.0477
Epoch [19/50], Loss: 0.0942
Epoch [20/50], Loss: 0.0210
Epoch [21/50], Loss: 0.0068
Epoch [22/50], Loss: 0.0026
Epoch [23/50], Loss: 0.0163
Epoch [24/50], Loss: 0.0242
Epoch [25/50], Loss: 0.0018
Epoch [26/50], Loss: 0.0978
Epoch [27/50], Loss: 0.1331
Epoch [28/50], Loss: 0.0839
Epoch [29/50], Loss: 0.0470
Epoch [30/50], Loss: 0.0097
Epoch [31/50], Loss: 0.0032
Epoch [32/50], Loss: 0.0024
Epoch [33/50], Loss: 0.0018
Epoch [34/50], Loss: 0.0091
Epoch [35/50], Loss: 0.0014
Epoch [36/50], L