# Step 1: Identify and Anonymize PII

We'll use spaCy's pre-trained `en_core_web_sm` pipeline, which includes a named-entity recognition (NER) component, to identify PII. For simplicity, we treat only person names as PII.

In [2]:
import spacy
from transformers import pipeline

In [3]:
# One-time setup (run in a shell): python -m spacy download en_core_web_sm
# Load the pretrained spaCy pipeline; its entity labels (e.g. PERSON) drive the anonymization below
nlp = spacy.load("en_core_web_sm")

In [4]:
def anonymize_text(text, labels=("PERSON",), placeholder="[ANONYMIZED]", model=None):
    """Replace detected named entities in ``text`` with a placeholder.

    Every whole-word occurrence of a detected entity string is replaced, so a
    name that the model tags once is also redacted where it is repeated.
    Matching is restricted to word boundaries, so a detected "Sam" does not
    corrupt an unrelated word such as "Samsung".

    Args:
        text: The raw string to anonymize.
        labels: Entity labels treated as PII. Defaults to person names only.
        placeholder: Replacement text inserted for each redacted entity.
        model: Callable returning an object with an ``ents`` iterable whose
            items expose ``text`` and ``label_``. Defaults to the
            notebook-level spaCy pipeline ``nlp``.

    Returns:
        The text with all matching entities replaced by ``placeholder``.
    """
    import re  # local import keeps this cell self-contained

    if model is None:
        model = nlp
    doc = model(text)

    names = {
        ent.text
        for ent in doc.ents
        if ent.label_ in labels and ent.text.strip()
    }
    if not names:
        return text

    # Longest names first so "John Doe" wins over its prefix "John".
    alternatives = "|".join(
        re.escape(name) for name in sorted(names, key=len, reverse=True)
    )
    pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
    # A callable replacement avoids interpreting backslashes in the placeholder.
    return pattern.sub(lambda _match: placeholder, text)
            

In [5]:
# Example usage: the person name in the sentence should come back as the placeholder
text = "John Doe gave this movie a 5 star rating."
anonymized_text = anonymize_text(text)
print(anonymized_text)

[ANONYMIZED] gave this movie a 5 star rating.


# Step 2: Prepare and Stage Data

Assume we have a dataset with movie reviews. We'll split this dataset into training and testing sets.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# Toy in-memory dataset: 30 short reviews, each paired by position with a 1-5 star rating
data = {
    'review': [
        "John loved the movie", 
        "Jane hated the movie", 
        "Alice thought it was okay", 
        "Bob enjoyed the film but thought it was too long", 
        "Charlie was thrilled with the movie's plot",
        "Dave disliked the acting in the movie",
        "Eve said the movie was fantastic",
        "Frank found the movie boring",
        "Grace thought the movie was well-directed",
        "Hank was disappointed by the ending",
        "Ivy loved the cinematography in the movie",
        "Jack thought the movie was just average",
        "Karen enjoyed the soundtrack of the movie",
        "Leo thought the movie was a masterpiece",
        "Mary said the movie was terrible",
        "Nina found the movie thrilling",
        "Oscar thought the movie was too predictable",
        "Pam loved the character development in the movie",
        "Quincy disliked the movie's pacing",
        "Rita said the movie was a visual spectacle",
        "Sam enjoyed the movie but thought it was a bit slow",
        "Tina was impressed by the movie's special effects",
        "Uma thought the movie was poorly written",
        "Victor loved the movie's action sequences",
        "Wendy hated the movie's dialogue",
        "Xander thought the movie was great fun",
        "Yara found the movie confusing",
        "Zane thought the movie was overrated",
        "Amy said the movie was a must-watch",
        "Bill found the movie to be a waste of time"
    ],
    # Ratings line up with the reviews above by position
    'rating': [
        5, 1, 3, 4, 5, 
        2, 5, 1, 4, 2, 
        5, 3, 4, 5, 1, 
        5, 2, 5, 2, 5, 
        3, 4, 1, 5, 1, 
        4, 2, 2, 5, 1
    ]
}


In [8]:
# Build the working DataFrame from exactly the two columns used downstream
df = pd.DataFrame({column: data[column] for column in ('review', 'rating')})


In [9]:
# Preview the first rows before the anonymization step is applied
df.head()

Unnamed: 0,review,rating
0,John loved the movie,5
1,Jane hated the movie,1
2,Alice thought it was okay,3
3,Bob enjoyed the film but thought it was too long,4
4,Charlie was thrilled with the movie's plot,5


In [10]:
# Redact person names in every review (overwrites the 'review' column).
# NOTE(review): NER recall is imperfect - the captured `test_iter` output later
# in this notebook still shows names such as "Zane" and "Nina", so spot-check
# the result before sharing the data.
df['review'] = df['review'].map(anonymize_text)

In [11]:
# Preview again to check that the leading names were replaced by the placeholder
df.head()

Unnamed: 0,review,rating
0,[ANONYMIZED] loved the movie,5
1,[ANONYMIZED] hated the movie,1
2,[ANONYMIZED] thought it was okay,3
3,[ANONYMIZED] enjoyed the film but thought it w...,4
4,[ANONYMIZED] was thrilled with the movie's plot,5


In [12]:
# Split data: hold out 20% for testing; the fixed random_state keeps the split repeatable
train_df, test_df = train_test_split(df, test_size=.2, random_state=42)
# Save the splits as CSV files (relative paths, row index not written)
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

# Step 3: Train a Sentiment Analysis Model

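We'll use a simple LSTM-based model in PyTorch for sentiment analysis. The model is trained as a regressor: it predicts the 1–5 star rating from the review text and is optimized with mean-squared-error loss.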

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torchtext.datasets import AG_NEWS
from torch.utils.data import DataLoader

In [14]:
# Define a tokenizer: torchtext's spaCy-backed tokenizer using the `en_core_web_sm` model
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [21]:
# Token stream used to build the vocabulary
def yield_tokens(data_iter):
    """Lazily yield the token list for the text of each (label, text) pair."""
    yield from (tokenizer(review_text) for _label, review_text in data_iter)

In [18]:
# Build (rating, review) pairs for each split - the shape that yield_tokens
# and collate_batch unpack
train_iter = list(zip(train_df['rating'], train_df['review']))
test_iter = list(zip(test_df['rating'], test_df['review']))

In [20]:
# Peek at the first few (rating, review) training pairs
train_iter[:5]

[(5, '[ANONYMIZED] said the movie was a must-watch'),
 (1, "[ANONYMIZED] hated the movie's dialogue"),
 (4, '[ANONYMIZED] enjoyed the soundtrack of the movie'),
 (5, '[ANONYMIZED] loved the movie'),
 (5, "[ANONYMIZED] was thrilled with the movie's plot")]

In [22]:
# Build Vocabulary from the training tokens; "<unk>" is also the fallback
# index returned for tokens that are not in the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


# Text pipeline: raw string -> list of vocabulary indices
def text_pipeline(x):
    return vocab(tokenizer(x))


# Label pipeline: rating -> int
def label_pipeline(x):
    return int(x)


In [23]:
# Collate function for the DataLoaders
def collate_batch(batch):
    """Turn a list of (label, text) pairs into tensors for the model.

    Returns a 3-tuple: float32 labels, the int64 token-index sequences padded
    to a common length (batch first), and the int64 unpadded lengths.
    """
    encoded = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        for _raw_label, raw_text in batch
    ]
    labels = torch.tensor(
        [label_pipeline(raw_label) for raw_label, _raw_text in batch],
        dtype=torch.float32,
    )
    seq_lengths = torch.tensor([seq.size(0) for seq in encoded], dtype=torch.int64)
    padded = torch.nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    return labels, padded, seq_lengths

In [24]:
# Create DataLoaders
# Only the training batches are shuffled; the evaluation order stays fixed so
# that test-set runs are repeatable.
train_dataloader = DataLoader(to_map_style_dataset(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(to_map_style_dataset(test_iter), batch_size=8, shuffle=False, collate_fn=collate_batch)

In [26]:
class LSTMModel(nn.Module):
    """Embedding -> (optionally bidirectional, stacked) LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Size of the final linear output.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed with a single layer, so disable it there.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
            batch_first=True,
        )
        # A bidirectional LSTM contributes one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the linear head's output for a padded batch.

        Args:
            text: Batch-first tensor of token indices (padded).
            text_lengths: Unpadded length of every sequence in ``text``.

        Returns:
            Tensor with one row per sequence and ``output_dim`` columns.
        """
        embedded = self.dropout(self.embedding(text))
        # Packing makes the LSTM skip the padded positions; lengths must be on CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # Last layer's forward (-2) and backward (-1) final states, concatenated.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

In [33]:
# Hyperparameters
SEED = 42
INPUT_DIM = len(vocab)  # vocabulary size = number of embedding rows
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single predicted rating per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Seed torch's global RNG before any weights are created so that weight
# initialisation and dropout are repeatable across fresh runs.
torch.manual_seed(SEED)

model = LSTMModel(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [34]:
# Training setup: device, loss and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model first so the optimizer is constructed over the parameters on
# their final device.
model = model.to(device)

# Mean-squared error: the rating is treated as a regression target
criterion = nn.MSELoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters())

In [36]:
def train(model, dataloader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(texts, lengths)``; its output is
            squeezed on dim 1 before being compared with the labels.
        dataloader: Sized iterable yielding ``(labels, texts, lengths)`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        The sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    # Take the device from the model itself rather than from a notebook global,
    # so the function works no matter which cells were run beforehand.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    for labels, texts, lengths in dataloader:
        optimizer.zero_grad()
        labels = labels.to(target_device)
        texts = texts.to(target_device)
        lengths = lengths.to(target_device)
        predictions = model(texts, lengths).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

N_EPOCHS = 10  # number of full passes over train_dataloader

# Train and report the mean per-batch loss after every epoch (epochs printed 1-based)
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.3f}')

Epoch 1, Train Loss: 7.600
Epoch 2, Train Loss: 3.364
Epoch 3, Train Loss: 4.087
Epoch 4, Train Loss: 2.361
Epoch 5, Train Loss: 2.623
Epoch 6, Train Loss: 2.660
Epoch 7, Train Loss: 2.080
Epoch 8, Train Loss: 1.935
Epoch 9, Train Loss: 1.930
Epoch 10, Train Loss: 1.902


In [64]:
# Inspect the held-out (rating, review) pairs.
# NOTE(review): the captured output shows several reviews that still contain
# names - the NER-based anonymization missed them.
test_iter

[(2, 'Zane thought the movie was overrated'),
 (5, 'Nina found the movie thrilling'),
 (5, "Victor loved the movie's action sequences"),
 (5, 'Pam loved the character development in the movie'),
 (4, 'Grace thought the movie was well-directed'),
 (2, '[ANONYMIZED] was disappointed by the ending')]

In [65]:
# Convert raw text into vocabulary indices for inference
def preprocess(text, tokenizer, vocab):
    """Tokenize ``text`` and look each token up in ``vocab``.

    Returns the list of looked-up values, in token order.
    """
    indices = []
    for piece in tokenizer(text):
        indices.append(vocab[piece])
    return indices

In [67]:
# A name never seen in training falls back to the "<unk>" default index (0 in the captured output)
preprocess("Farooq said okay", tokenizer, vocab)

[0, 10, 49]

In [68]:
# Function to make predictions
def predict(text, model, tokenizer, vocab, device):
    """Predict a rating for a single review string.

    Args:
        text: Raw review text.
        model: Module called as ``model(text_tensor, text_length)`` that
            returns a single-element tensor.
        tokenizer: Callable splitting ``text`` into tokens.
        vocab: Mapping from token to integer index.
        device: Device on which the input tensors are created.

    Returns:
        The model output as a Python float.

    Raises:
        ValueError: If ``text`` produces no tokens (e.g. it is empty).
    """
    model.eval()
    token_indices = preprocess(text, tokenizer, vocab)
    if not token_indices:
        # A zero-length sequence cannot be packed by the LSTM model's forward
        # pass; fail here with a message that names the real problem.
        raise ValueError("Cannot predict on text that produces no tokens")

    # Batch of one: shape (1, n_tokens) plus its length
    text_tensor = torch.tensor([token_indices], dtype=torch.int64, device=device)
    text_length = torch.tensor([len(token_indices)], dtype=torch.int64, device=device)

    with torch.no_grad():
        output = model(text_tensor, text_length)
    return output.item()

In [77]:
# Example new data: unseen reviews to score with the trained model
new_reviews = [
    "I loved the movie, it was fantastic!",
    "The film was terrible, I hated it.",
    "worst movie"
]

# Make predictions: print each review followed by its predicted rating (2 decimals)
for review in new_reviews:
    prediction = predict(review, model, tokenizer, vocab, device)
    print(f"Review: {review}")
    print(f"Predicted Rating: {prediction:.2f}")

Review: I loved the movie, it was fantastic!
Predicted Rating: 3.66
Review: The film was terrible, I hated it.
Predicted Rating: 2.43
Review: worst movie
Predicted Rating: 1.24
