<a href="https://colab.research.google.com/github/robertjprior/CausaLM/blob/master/Marketing_Email_Personalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
config = {'train': False,
          'generate_baseline_scores': False,
          'read_original_data_text_files': False,
          'recreate_word_impact_tracker': False,
          'special': "####"}


In [None]:
!pip install transformers
!pip install datasets



In [None]:
from tqdm import tqdm

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset, DatasetDict, ClassLabel
from pathlib import Path

In [None]:
import os, sys
from google.colab import drive

nb_path = '/content/text_attribution_references'
try:
  drive.mount('/content/drive')
  #mounting: https://stackoverflow.com/questions/55253498/how-do-i-install-a-library-permanently-in-colab

  os.symlink('/content/drive/MyDrive/Colab Notebooks/WordAttributionReferences', nb_path)

  sys.path.insert(0,nb_path)
except FileExistsError:
  pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the dataset
We'll use pandas to read the dataset and load it into a dataframe.

In [None]:
def folder_to_dataframe(folder_path):
  data = []
  for file_path in folder_path.glob('*.txt'):
    # Extract the id from the filename
    id = int(file_path.stem.split('_')[0])

    # Read the contents of the file
    with file_path.open('r') as file:
        file_content = file.read()

    # Append the data to the list
    data.append([file_content, id])

  # Create a DataFrame
  df = pd.DataFrame(data, columns=['text', 'mapping_loc'])
  return df
if config['read_original_data_text_files']:
  df_pos = folder_to_dataframe(Path(nb_path, 'pos'))
  df_pos['label'] = 1
  df_pos.to_csv(Path(nb_path, 'df_pos.csv'))

In [None]:
name_mapping = pd.read_csv(Path(nb_path, 'movies_metadata.csv'))

In [None]:

if config['read_original_data_text_files']:
  df_neg = folder_to_dataframe(Path(nb_path, 'neg'))
  df_neg['label'] = 0
  df_neg.to_csv(Path(nb_path, 'df_neg.csv'))
else:
  df_pos = pd.read_csv(Path(nb_path, 'df_pos.csv'))
  df_neg = pd.read_csv(Path(nb_path, 'df_neg.csv'))



In [None]:
pos_url = pd.read_csv(Path(nb_path, 'urls_pos.txt'), sep=" ", header=None)
pos_url.reset_index(inplace=True)
pos_url = dict(pos_url.values)
neg_url = pd.read_csv(Path(nb_path, 'urls_neg.txt'), sep=" ", header=None)
neg_url.reset_index(inplace=True)
neg_url = dict(neg_url.values)

In [None]:
def add_url(df, url_mapping):
  df_url_column = []
  for index, row in df.iterrows():
    url = url_mapping[row['mapping_loc']]
    movie_id = url.replace('http://www.imdb.com/title/' , '').replace('/usercomments', '')
    df_url_column.append(movie_id)
  df["imdb_id"] = df_url_column
  return df

In [None]:
df_pos = add_url(df_pos, pos_url)
df_neg = add_url(df_neg, neg_url)

In [None]:
df = pd.concat([df_pos, df_neg], axis=0)

In [None]:
#old smaller dataset
# df = pd.read_csv(Path(nb_path, 'ratings_small.csv'))
# id_mapping = pd.read_csv(Path(nb_path, 'links_small.csv'))

# #df.index = df.movieId
# #id_mapping.index = id_mapping.movieId
# df = pd.merge(df, id_mapping, left_on='movieId', right_on='movieId', how='inner')

In [None]:
df = pd.merge(df, name_mapping, left_on='imdb_id', right_on='imdb_id', how='inner')

In [None]:
df = df.loc[:, ['text', 'label', 'title', 'imdb_id']]


In [None]:
df.text = df.text.str.slice(0,250)
df.head()
df.loc[0, 'text']

"Powers Boothe turns in a stellar performance as 1970's cult figure Jim Jones of the Peoples Temple. Jones physical likeness to Jones is uncanny and the story is acted out chillingly. The movie keeps you riveted and is a must see for anyone. check it "

In [None]:
df.shape

(16839, 4)

In [None]:
#df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
#df.columns = ['text', 'label']

For performance reasons, we'll only use 2,000 sentences from the dataset

## Loading the Pre-trained BERT model & Defining Architecture
Let's now load a pre-trained BERT model.

In [None]:
TEST_SIZE = 0.1
batch_size = 20
DROPOUT = 0.5
NUM_LABELS = 2
learning_rate = 1e-3
num_train_epochs = 25

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights, return_dict=True)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch
from torch import nn

class DistilBertClassifier(nn.Module):
    def __init__(self, pretrained_model, num_labels=NUM_LABELS, dropout=DROPOUT, averaging = "last four", ):
        super(DistilBertClassifier, self).__init__()
        self.num_labels = num_labels
        self.averaging = averaging

        self.dropout = nn.Dropout(dropout)
        self.bert = pretrained_model #RobertaModel.from_pretrained("roberta-base", return_dict=True)
        self.hidden_size = self.bert.config.hidden_size

        self.dense = nn.Linear(self.hidden_size, self.hidden_size) #https://github.com/google-research/bert/issues/43
        #https://discuss.huggingface.co/t/what-is-the-purpose-of-the-additional-dense-layer-in-classification-heads/526
        self.linear = nn.Linear(self.hidden_size, num_labels)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        if self.averaging == "last four":
            self.hidden_size = self.hidden_size *4
            self.dense = nn.Linear(self.hidden_size, self.hidden_size) #https://github.com/google-research/bert/issues/43
            self.linear = nn.Linear(self.hidden_size, self.num_labels)


    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True, return_dict=True)
        if self.averaging=="last":
            #average across this middle
            sentence_representation = torch.mean(outputs['last_hidden_state'], 1)
            #TODO: should ultimately try to avoid padding tokens https://stackoverflow.com/questions/71434804/how-to-fed-last-4-concatenated-hidden-layers-of-bert-to-fc-layers
        elif self.averaging == "last four":
            feature_layers = outputs['hidden_states'][-4:]
            sentence_representation = torch.cat(feature_layers, -1) #concatenate them (here over the last dimension) to a single tensor of shape (batch_size, seq_len, 4 * hidden_size)
            #sentence_representation = torch.mean(sentence_representation, 1)

            #alternative that avoids taking the mean of paddings in there
            sentence_representation = sentence_representation[:,0,:]
            #sentence_representation.size() #torch.Size([20, 3072])
            #sentence_representation = sentence_representation.to(self.device)



        else: #if none
            sentence_representation = outputs['last_hidden_state'][:, 0, :] #cls token
        x = self.dropout(sentence_representation)
        x = self.dense(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear(x)
        #x = self.softmax(x)
        return x

In [None]:
model = DistilBertClassifier(pretrained_model = model, num_labels = NUM_LABELS, dropout = DROPOUT, averaging = "last four")

In [None]:
model = model.to(device)

In [None]:
#validate everything is on the device
#print(device)
#for param in model.parameters():
#    print(type(param), param.size(), param.device)


## Data -> Dataloader #1: Preparing the Dataset
Before we can hand our sentences to BERT, we need to do some minimal processing to put them in the format it requires. Includes 1) splitting data into train validate test sets, 2) loading them into a Dataset format so they can be saved to disk until their batch is called, 3) tokenization, 4) add the label in correct format to the dataset, 5) create a data collator that will do padding only when dataset is called to save RAM, 6) dataloader is setup that will run the collator function too when the batch is called



In [None]:
def create_tokenized_datasets(tokenizer, datafile_name, label_col_name, text_col_name):
    """returns a huggingface DatasetDict object with train, validate, and test columns. \
    There should also be two columns in each dataset we are interested in "text" and "label" \
    Trainer object will automatically move things to a tensor as needed for us. """


    #load dataset class object
    df, labels = pytorch_dataset(datafile_name, label_col_name)
    #transform dataset label

    def tokenize_function(example):
        #old handling: tokenized_outputs = tokenizer(text, return_tensors="pt")
        tokens = tokenizer(example[text_col_name], truncation=True, padding=False)
        tokens['labels'] = labels.str2int(example[label_col_name])
        return tokens

    #tokenize dataset (doing it this way so the results get pushed back as new columns in Datasets format stored on Disk instead of returning dictionary stored in RAM)
    tokenized_datasets = df.map(tokenize_function, batched=True)
    return tokenized_datasets



def pytorch_dataset(filename, label_col_name):
    from datasets import Dataset, DatasetDict, ClassLabel
    train, validate, test, labels_set = optimization_read_split_data(
        df = filename,
        test_size=TEST_SIZE,
        label_col_name=label_col_name,
    )
    train = Dataset.from_pandas(train)
    validate = Dataset.from_pandas(validate)
    test = Dataset.from_pandas(test)
    dataset = DatasetDict({
        "train": train,
        "validate": validate,
        "test": test})
    labels = ClassLabel(names = list(labels_set))
    return dataset, labels

def optimization_read_split_data(df, test_size, label_col_name):
    #df = pd.read_csv(path)
    #y = df[label_col_name].astype(str)
    #df = df.drop(columns=[label_col_name])

    #X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=test_size, stratify=y)
    #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=test_size, stratify=y_train)
    #return X_train.values.tolist(), X_val.values.tolist(), X_test.values.tolist(), y_train, y_val, y_test
    train, test = train_test_split(df, test_size=test_size, stratify=df[label_col_name])
    train, validate = train_test_split(train, test_size=test_size, stratify=train[label_col_name])
    labels = set(df[label_col_name])
    return train, validate, test, labels
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [None]:
#ignore
def create_single_tokenized_dataset(tokenizer, datafile_name, label_col_name, text_col_name):
    """returns a huggingface DatasetDict object with train, validate, and test columns. \
    There should also be two columns in each dataset we are interested in "text" and "label" \
    Trainer object will automatically move things to a tensor as needed for us. """


    #load dataset class object
    df, labels = pytorch_single_dataset(datafile_name, label_col_name)
    #transform dataset label

    def tokenize_function(example):
        #old handling: tokenized_outputs = tokenizer(text, return_tensors="pt")
        tokens = tokenizer(example[text_col_name], truncation=True, padding=False, return_tensors="pt")
        tokens['labels'] = labels.str2int(example[label_col_name])
        return tokens

    #tokenize dataset (doing it this way so the results get pushed back as new columns in Datasets format stored on Disk instead of returning dictionary stored in RAM)
    tokenized_datasets = df.map(tokenize_function, batched=True)
    return tokenized_datasets



def pytorch_single_dataset(filename, label_col_name):
    from datasets import Dataset, DatasetDict, ClassLabel
    train, labels_set = optimization_read_split_data_single(
        df = filename,
        test_size=TEST_SIZE,
        label_col_name=label_col_name,
    )
    train = Dataset.from_pandas(train)

    dataset = DatasetDict({
        "full": train})
    labels = ClassLabel(names = list(labels_set))
    return dataset, labels

def optimization_read_split_data_single(df, test_size, label_col_name):


    labels = set(df[label_col_name])
    return df, labels
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [None]:
tokenized_df = create_tokenized_datasets(tokenizer, df, 'label', 'text')

tokenized_df = tokenized_df.remove_columns(["text", "label", "__index_level_0__", "title", "imdb_id"])


Map:   0%|          | 0/13639 [00:00<?, ? examples/s]

Map:   0%|          | 0/1516 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

In [None]:
tokenized_df

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13639
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1516
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1684
    })
})

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = True)

train_loader = DataLoader(tokenized_df['train'], collate_fn=data_collator, batch_size=batch_size, shuffle=True)
validate_loader = DataLoader(tokenized_df['validate'], collate_fn=data_collator, batch_size=batch_size)
test_loader = DataLoader(tokenized_df['test'], collate_fn=data_collator, batch_size=batch_size)

#show collator working - this example wont work, it pads to the batch size, since this is 1 its just 1
#example = tokenizer(df['text'][1], truncation=True, padding = False)
#example2 = data_collator(example)
#example2['input_ids'].shape
#print(len(example['input_ids']))
#print(example2['input_ids'].shape)

In [None]:
#to view a batch and validate padding
#next(iter(train_loader))

#tokenizer.decode([0])

Example walkthough of the base bert model

In [None]:
# batch = next(iter(train_loader))
# with torch.no_grad():
#     output = model(batch['input_ids'], batch['attention_mask'], output_hidden_states=True)

# output.last_hidden_state.size() #(batch_size, sequence_length, hidden_size)
# print(len(output.hidden_states))
# print(output.hidden_states[0].size())

# feature_layers = output['hidden_states'][-4:]
# sentence_representation = torch.cat(feature_layers, -1) #concatenate them (here over the last dimension) to a single tensor of shape (batch_size, seq_len, 4 * hidden_size)
# #sentence_representation = torch.mean(sentence_representation, 1)

# #alternative that avoids taking the mean of paddings in there
# sentence_representation = sentence_representation[:,0,:]
# #sentence_representation.size() #torch.Size([20, 3072])

In [None]:
# #and the finetuning bert model
# batch = next(iter(train_loader))
# with torch.no_grad():
#     output = model.forward(batch['input_ids'], batch['attention_mask'])
# output


In [None]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params = model.parameters(), lr=learning_rate)

#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

trial = None

In [None]:
#Need to enable my custom function to go to cuda

In [None]:
from pathlib import Path
if config['train'] == True:
  val_historical_accuracy = []
  train_historical_accuracy = []
  for epoch in range(num_train_epochs):
      #print(f"Epoch #: {epoch}")
      tr_loss = 0
      n_correct = 0
      nb_tr_steps = 0
      nb_tr_examples = 0
      model.train()
      for batch_idx, data in enumerate(tqdm(train_loader)):
          optimizer.zero_grad()

          ids = data['input_ids'].to(device, dtype = torch.long)
          mask = data['attention_mask'].to(device, dtype = torch.long)
          #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
          targets = data['labels'].to(device, dtype = torch.long)
          #print(model.is_cuda)

          outputs = model(ids, mask)
          loss = loss_function(outputs, targets)
          tr_loss += loss.item()
          big_val, big_idx = torch.max(outputs.data, dim=1)
          n_correct += calcuate_accuracy(big_idx, targets)

          nb_tr_steps += 1
          nb_tr_examples+=targets.size(0)

          if (batch_idx%1000==0) and (batch_idx != 0):
              print(batch_idx)
              loss_step = tr_loss/nb_tr_steps
              accu_step = (n_correct*100)/nb_tr_examples
              print(f"Training Loss per 5000 steps: {loss_step}")
              print(f"Training Accuracy per 5000 steps: {accu_step}")


          loss.backward()
          optimizer.step()

      # Validation of the model.
      model.eval()
      correct = 0
      eval_nb_tr_examples = 0
      val_loss = 0

      with torch.no_grad():
          for batch_idx, data in enumerate(tqdm(validate_loader)):
              ids = data['input_ids'].to(device, dtype = torch.long)
              mask = data['attention_mask'].to(device, dtype = torch.long)
              #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
              targets = data['labels'].to(device, dtype = torch.long)
              outputs = model(ids, mask)
              loss = loss_function(outputs, targets)
              val_loss += loss.item()
              # Get the index of the max log-probability.
              pred = outputs.argmax(dim=1, keepdim=True)
              correct += pred.eq(targets.view_as(pred)).sum().item()
              eval_nb_tr_examples+=targets.size(0)
      #if not epoch%10:
      accuracy = correct / eval_nb_tr_examples

      val_historical_accuracy.append(accuracy)
      train_historical_accuracy.append((n_correct*100)/nb_tr_examples)

      print(
          f"Epoch: {epoch:02d} | "
          f"train_loss: {tr_loss:.5f}, "
          f"train_accuracy: {(n_correct*100)/nb_tr_examples:.5f}, "
          f"val_loss: {val_loss:.5f}, "
          f"val accuracy: {accuracy:.5f}")

  torch.save(model, Path(nb_path, "finetuned_model"))
  pd.DataFrame(val_historical_accuracy).to_csv(Path(nb_path, 'val_historical_accuracy'))
  pd.DataFrame(train_historical_accuracy).to_csv(Path(nb_path, 'train_historical_accuracy'))
else:
  model = torch.load(Path(nb_path, "finetuned_model"), map_location=torch.device(device)) #map location helps us load onto CPU or GPU, even if trained only on gpu



# Setup Work Importance Attribution Model

In [None]:
#NOW we are going to do tfidf to generate a matrix
!pip install sklearn


Collecting sklearn
  Downloading sklearn-0.0.post5.tar.gz (3.7 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
df['label'] = df['label'].astype(str)
softmax = torch.nn.Softmax(dim=0)
labels_set = set(df['label'])

labels = ClassLabel(names = list(labels_set))

In [None]:
def tokenize_single_function(example, label_col_name, text_col_name, label_converter):
    #old handling: tokenized_outputs = tokenizer(text, return_tensors="pt")
    tokens = tokenizer(example[text_col_name], truncation=True, padding=False, return_tensors="pt")
    tokens['labels'] = label_converter.str2int(example[label_col_name])
    return tokens

def update_matrix_word_count(row, word, word_mapping):
    return None


def calculate_attribution_score(review, model, word_set):
  #TODO: start here...
  #calculate baseline score
  return None

def get_prediction(tokenized_data, model):
  model.eval()
  ids = tokenized_data['input_ids'].to(device, dtype = torch.long)
  mask = tokenized_data['attention_mask'].to(device, dtype = torch.long)
  output = model(ids, mask)
  return output

def predict(model, example, label_col_name, text_col_name, label_converter):
  tokenized_data = tokenize_single_function(example, label_col_name, text_col_name, label_converter)
  output = get_prediction(tokenized_data, model)
  return output

import numpy as np
def softmax2(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_normalize(model, example, label_col_name, text_col_name, label_converter, softmax):
  output = predict(model, example, label_col_name, text_col_name, label_converter)
  return softmax(torch.flatten(output))


In [None]:
labels.names

['0', '1']

In [None]:
# #EXAMPLE PREDICTION
# labels_set = set(df['label'])

# labels = ClassLabel(names = list(labels_set))
# #tokenize_single_function(df.loc[0], 'label', 'text', labels)
# tokenizer(df.loc[0]['text'], truncation=True, padding=False)
# labels.str2int(str(df.loc[0]['label']))



# for index, row in df.iloc[0:3, :].iterrows():
#   print(row)
#   output = predict(model, row, 'label', 'text', labels)
#   print(softmax(torch.flatten(output)))



In [None]:
!pip install --user -U nltk
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1,1), max_df = 0.05, min_df=0.001, stop_words='english')
#vectorizer = TfidfVectorizer(ngram_range = (1,2), max_features=2000)


In [None]:
X = vectorizer.fit_transform(df['text'].replace(',', '').replace('.', '').replace('"', "").str.lower())
tfidf_word_set = vectorizer.get_feature_names_out()

In [None]:
len(tfidf_word_set)

3085

In [None]:
vocab = []
for w in tfidf_word_set:
  tmp = wn.synsets(w, pos=wn.VERB)
  tmp.extend(wn.synsets(w, pos=wn.ADJ))
  if len(tmp) > 0:
    vocab.append(w)
    #pairing.append(str(w) + f": {tmp[0].pos()}")
len(vocab)

1894

In [None]:
#only use the verb and adj words
vectorizer = TfidfVectorizer(ngram_range = (1,1), stop_words='english', vocabulary=vocab)

In [None]:
X = vectorizer.fit_transform(df['text'].replace(',', '').replace('.', '').replace('"', "").str.lower())
tfidf_word_set = vectorizer.get_feature_names_out()
X.shape

(16839, 1894)

In [None]:
tfidf_word_set_encoding = dict(zip(tfidf_word_set, range(tfidf_word_set.shape[0])))

In [None]:
if config['generate_baseline_scores'] is True:
  #baseline scores
  baseline_scores = []
  for review_line_i, review_line in df.iterrows():
    output = predict(model, review_line, 'label', 'text', labels)
    baseline_scores.append(softmax(torch.flatten(output)).tolist())
  baseline_scores = pd.DataFrame(baseline_scores, columns=labels.names)
  baseline_scores.to_parquet(Path(nb_path, 'baseline_scores'))
else:
  baseline_scores = pd.read_parquet(Path(nb_path, 'baseline_scores'))
baseline_scores[0:10]

Unnamed: 0,0,1
0,0.017245,0.982755
1,0.821106,0.178894
2,0.106697,0.893303
3,0.024011,0.975989
4,0.042736,0.957264
5,0.074504,0.925496
6,0.01606,0.98394
7,0.471298,0.528702
8,0.021723,0.978277
9,0.014535,0.985465


# Estimate Work Importance

In [None]:
#generate the empty array to hold the change in score for each word change
#train, test = train_test_split(df, test_size=0.2, stratify=df['label'])
train = df.copy()
#train.reset_index(inplace=True, drop=True)
word_impact_tracker = np.zeros((train.shape[0], len(tfidf_word_set)))


In [None]:
train.shape


(16839, 4)

In [None]:
tfidf_word_set

array(['10', '100', '11', ..., 'young', 'younger', 'zero'], dtype=object)

In [None]:
#trial of this


import json
if config['recreate_word_impact_tracker'] is True:
  for new_review_line_i, (original_review_line_i, review_line) in enumerate(tqdm(train.iterrows(), total=train.shape[0])):
    output = vectorizer.transform([review_line['text'].lower()])
    keywords = vectorizer.inverse_transform(output)[0]
    for word in keywords:
      edited_review = review_line['text'].replace(word, '')
      #print(word)
      #print(review_line)
      #print(edited_review)

      output = predict_normalize(model, review_line, 'label', 'text', labels, softmax).tolist()
      baseline_output = baseline_scores.loc[original_review_line_i]

      delta = (output[0] - baseline_output[0])/baseline_output[0]
      column_id = tfidf_word_set_encoding[word]
      word_impact_tracker[new_review_line_i, column_id] += delta

  #save the results
  word_impact_tracker = pd.DataFrame(word_impact_tracker, columns=tfidf_word_set)
  word_impact_tracker.to_csv(Path(nb_path, 'word_impact_tracker'))

  with open(Path(nb_path, 'word_positional_encoder.json'), 'w') as fp:
      json.dump(tfidf_word_set_encoding, fp)

else:
  word_impact_tracker = pd.read_csv(Path(nb_path, 'word_impact_tracker'))
  with open(Path(nb_path, 'word_positional_encoder.json'), 'r') as fp:
      tfidf_word_set_encoding = json.load(fp)
  #now we process the review

In [None]:
word_impact_tracker.sum().sum()

141806779.8875972

### Average out Impacts

In [None]:
#average out impact of each word
word_impact_tracker.values[word_impact_tracker == 0.0] = np.nan


In [None]:
word_impact_tracker['title'] = df['title']

In [None]:
nonempty_tfidf_word_set = list(word_impact_tracker.columns)[:-1] #leave out "title"

In [None]:
avg_impact_bytitle = word_impact_tracker.groupby('title')[nonempty_tfidf_word_set].mean()

In [None]:
avg_impact_bytitle.to_csv(Path(nb_path, 'avg_impact_bytitle.csv'))

In [None]:
avg_impact_bytitle.head()

Unnamed: 0_level_0,Unnamed: 0,10,100,11,12,13,14,15,16,17,...,worthless,worthy,wow,write,writing,written,wrong,wrote,young,younger
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Items or Less,6583.0,-0.092528,0.0,0.0,0.000852,0.0,0.0,0.0,0.0,0.0,...,-0.030725,0.0,0.0,0.0,0.0,-0.074295,0.0,0.0,0.0,0.0
101 Dalmatians II: Patch's London Adventure,11698.5,0.013705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102 Dalmatians,4824.0,0.001268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000521,0.0,0.0,0.0
11'09''01 - September 11,2816.0,0.0,0.0,0.653062,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13 Going on 30,14907.5,0.0,0.0,0.0,0.0,0.096509,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#mean just overall
overall_means = np.nanmean(word_impact_tracker.drop('title', axis=1), axis=0)
overall_means = pd.DataFrame(overall_means, index=word_impact_tracker.drop('title', axis=1).columns)


In [None]:
overall_means

Unnamed: 0,0
Unnamed: 0,8419.000000
10,0.003366
100,-0.000325
11,0.000593
12,-0.000536
...,...
wrong,0.003161
wrote,0.012675
young,0.010261
younger,-0.000086


In [None]:
overall_means.dropna(inplace=True)
top10_positive = overall_means.sort_values([0], ascending=True).iloc[0:20, :]
top10_negative = overall_means.sort_values([0], ascending=False).iloc[0:20, :]

In [None]:
top10_negative

Unnamed: 0,0
Unnamed: 0,8419.0
new,0.04234
interesting,0.020173
make,0.018175
read,0.017567
early,0.017148
watching,0.016934
did,0.016857
believe,0.015699
old,0.015634


In [None]:
top10_positive

Unnamed: 0,0
wonderful,-0.009198
funny,-0.006941
loved,-0.006747
amazing,-0.005798
hilarious,-0.004676
enjoyed,-0.00456
superb,-0.004144
greatest,-0.003888
true,-0.003781
entertaining,-0.00375


# Run Sample through ChatGPT

In [None]:
titles_set = list(set(df['title']))

In [None]:
def pull_movie_keywords(title, avg_impact_bytitle):
  movie_keywords = avg_impact_bytitle.loc[[title]].T
  top20_negative = list(movie_keywords.sort_values(title, ascending=True).iloc[0:10, :].index)
  top20_positive = list(movie_keywords.sort_values(title, ascending=False).iloc[0:10, :].index)
  return top20_positive, top20_negative


In [None]:
#EXAMPLE
print(titles_set[0])
output = pull_movie_keywords(titles_set[0], avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
list(output['Positive'])

Czech Dream


['Unnamed: 0',
 'word',
 'dream',
 'saying',
 'thinks',
 'use',
 'inspired',
 'main',
 'act',
 'documentary']

In [None]:
output = pull_movie_keywords("The Mummy's Tomb", avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
output

Unnamed: 0,Positive,Negative
0,Unnamed: 0,previous
1,hand,living
2,older,new
3,character,budget
4,played,spent
5,direct,goes
6,lead,running
7,help,post
8,universal,potential
9,final,leading


In [None]:

output = pull_movie_keywords('Bend It Like Beckham', avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
output

Unnamed: 0,Positive,Negative
0,Unnamed: 0,sports
1,clue,real
2,believe,cool
3,waited,hard
4,wow,characters
5,american,say
6,long,british
7,know,little
8,got,lot
9,watching,loved


In [None]:
!pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


In [None]:
import os
import openai
openai.api_type = "azure"
openai.api_base = "https://accelerator-instance-personalisedmarketing.openai.azure.com/"
openai.api_version = "2023-03-15-preview"
openai.api_key = config['special']



In [None]:
prompt = "Happy July 4th! We think you might enjoy settling in after the fireworks with family and friends to watch this \
movie (insert movie title there) because..."

recommended_movie = "Czech Dream"
output = pull_movie_keywords(recommended_movie, avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
pos_keywords = list(output["Positive"])
neg_keywords = list(output['Negative'])


response = openai.ChatCompletion.create(
  engine="Accelerator-Deployment-PersonalisedMarketing",
  messages = [{"role":"system","content":f"write a professional marketing email that will recommend a movie we specify below to the end user. Start with the "
  "prompt listed below, then factor in the movie recommendation below, and finally finetune the prompt to account for the positive or negative "
  "keywords we pass in as well to know the types of wording that is successful or not for that movie. add in the reasoning for why they "
  "should watch it, inserted after the because... in the prompt, based on what you know about the movie and those keywords passed in. "
  "Finally, rate how we did out of 10 for a marketing message that would drive conversion and where the keywords played a role in crafting the message"
  f"\n prompt: {prompt} . "
  f"movie recommendation: {recommended_movie} . "
  f"positive keywords: {pos_keywords} . "
  f"negative keywords (avoid): {neg_keywords}"}],
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None)
response

<OpenAIObject chat.completion id=chatcmpl-7bzi5ZFJ4dPLlkLRWzK9pMRJLdTy4 at 0x78d2c44ed2b0> JSON: {
  "id": "chatcmpl-7bzi5ZFJ4dPLlkLRWzK9pMRJLdTy4",
  "object": "chat.completion",
  "created": 1689289705,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "Subject: Celebrate July 4th with an Inspiring Documentary!\n\nDear [Recipient],\n\nHappy July 4th! We hope you had a blast watching the fireworks with your loved ones. If you're looking for a great way to end your night, we recommend settling in with family and friends to watch the inspiring documentary, Czech Dream.\n\nCzech Dream is a thought-provoking film that explores the power of advertising and consumerism in our society. The documentary follows two film students who create a fake hypermarket, complete with a huge advertising campaign, to see how far people will go to chase the dream of consumption. What they discov

In [None]:
recommended_movie = "The Mummy's Tomb"
output = pull_movie_keywords(recommended_movie, avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
pos_keywords = list(output["Positive"])
neg_keywords = list(output['Negative'])


response = openai.ChatCompletion.create(
  engine="Accelerator-Deployment-PersonalisedMarketing",
  messages = [{"role":"system","content":f"write a professional marketing email that will recommend a movie we specify below to the end user. Start with the "
  "prompt listed below, then factor in the movie recommendation below, and finally finetune the prompt to account for the positive or negative "
  "keywords we pass in as well to know the types of wording that is successful or not for that movie. add in the reasoning for why they "
  "should watch it, inserted after the because... in the prompt, based on what you know about the movie and those keywords passed in. "
  "Finally, rate how we did out of 10 for a marketing message that would drive conversion and where the keywords played a role in crafting the message."
  f"\n prompt: {prompt} . "
  f"movie recommendation: {recommended_movie} . "
  f"positive keywords: {pos_keywords} . "
  f"negative keywords (avoid): {neg_keywords}"}],
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None)
response

<OpenAIObject chat.completion id=chatcmpl-7bziFlGUSopFi2vfZmqnCjZcZrE5w at 0x78d2c612d210> JSON: {
  "id": "chatcmpl-7bziFlGUSopFi2vfZmqnCjZcZrE5w",
  "object": "chat.completion",
  "created": 1689289715,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "Subject: Celebrate July 4th with The Mummy's Tomb!\n\nDear [Name],\n\nHappy July 4th! After the fireworks are over and the BBQ has ended, it's time to settle in with family and friends for a movie night. We think you might enjoy watching The Mummy's Tomb because it's a classic adventure film with a captivating story that will keep you on the edge of your seat.\n\nThe Mummy's Tomb is a character-driven movie that is directed by Harold Young and features an older lead character played by Lon Chaney Jr. The movie is the final installment in Universal's Mummy series and it follows the story of the mummy Kharis, who is brought 

In [None]:
recommended_movie = 'Bend It Like Beckham'
output = pull_movie_keywords(recommended_movie, avg_impact_bytitle)

output = pd.DataFrame(output).T
output.columns = ["Positive", "Negative"]
pos_keywords = list(output["Positive"])
neg_keywords = list(output['Negative'])


response = openai.ChatCompletion.create(
  engine="Accelerator-Deployment-PersonalisedMarketing",
  messages = [{"role":"system","content":f"write a professional marketing email that will recommend a movie we specify below to the end user. Start with the "
  "prompt listed below, then factor in the movie recommendation below, and finally finetune the prompt to account for the positive or negative "
  "keywords we pass in as well to know the types of wording that is successful or not for that movie. add in the reasoning for why they "
  "should watch it, inserted after the because... in the prompt, based on what you know about the movie and those keywords passed in. "
  "Finally, rate how we did out of 10 for a marketing message that would drive conversion and where the keywords played a role in crafting the message."
  f"\n prompt: {prompt} . "
  f"movie recommendation: {recommended_movie} . "
  f"positive keywords: {pos_keywords} . "
  f"negative keywords (avoid): {neg_keywords}"}],
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None)
response

<OpenAIObject chat.completion id=chatcmpl-7bziOgDZq2KSi11Kwumf9tZ0AAiP5 at 0x78d2c42ea6b0> JSON: {
  "id": "chatcmpl-7bziOgDZq2KSi11Kwumf9tZ0AAiP5",
  "object": "chat.completion",
  "created": 1689289724,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "Subject: Happy July 4th! Get Ready to Bend it Like Beckham\n\nDear [Name],\n\nHappy July 4th! We hope you had a fantastic day celebrating with your loved ones. As the night winds down, why not settle in with family and friends to watch an inspiring movie that will leave you feeling motivated and uplifted?\n\nWe think you might enjoy watching Bend It Like Beckham because it's a heartwarming story of a young girl who defies cultural expectations to pursue her dreams of playing soccer. The movie will keep you on the edge of your seat as you follow Jess's journey of self-discovery and triumph over adversity.\n\nBecause Bend It