In [38]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
from transformers import pipeline

def preprocess(df, column = 'review'):

    if column == 'review':
      #drop duplicate rows
      df = df.drop_duplicates()

      #drop rows where na values present in parts column
      if 'parts' in df.columns:
        df = df.dropna(subset=['parts'])

    # lowercase the text
    df[column] = df[column].str.lower()
    
    #add septoken
    if 'parts' in df.columns and column == 'parts':
        df[column] = df[column].apply(lambda x: str(x).replace("...", " septoken ") if '...' in str(x) else str(x) + " septoken ")

    # remove special characters and numbers
    df[column] = df[column].apply(lambda x: re.sub(r'[^\w\s]+', '', str(x)))

    # remove underscores
    df[column] = df[column].apply(lambda x: x.replace("_", "") if '_' in x else x)

    # tokenize the reviews
    df[column] = df[column].apply(lambda x: word_tokenize(x))
    
    
    return df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
# Function to remove special characters and spaces from a string
def clean_string(string):
    cleaned_string = re.sub(r'[^\w]', '', string) # Removes special characters and spaces including underscores
    return cleaned_string

In [40]:
def get_processed_sentence(sentence, label):
    # Create a dataframe from the sentence and label inputs
    test_df = pd.DataFrame({'review':[sentence], 'label':[label]})
    # Preprocess the dataframe
    test_df = preprocess(test_df)
    # Join the review column in the dataframe into a single string
    sentence = " ".join(test_df['review'][0])
    # Clean the label column in the dataframe
    label = clean_string(test_df['label'][0])
    # Concatenate the processed sentence and label into a single string
    sentence = sentence + " " + label
    # Return the combined string
    return sentence


In [41]:
def get_predictions_sentence(sentence, label, model_checkpoint):
    # Initialize an empty string to store the processed sentence
    sent = ''
    
    # Get the processed sentence by calling the get_processed_sentence function
    sentence = get_processed_sentence(sentence, label)
    
    # Initialize a token-classification pipeline using the specified model checkpoint
    token_classifier = pipeline( "token-classification", model=model_checkpoint, aggregation_strategy="simple")
    
    # Get the predictions from the pipeline on the processed sentence
    predictions = token_classifier(sentence)
    
    # Loop through the predicted entities in the sentence
    for entities in predictions:
        # Check if there are any entities in the current prediction
        if len(entities) > 0:
            # Get the word and score of the current entity
            pred_sent = entities['word']
            score = entities['score']
            # Check if the score is higher than 0.85
            if score > 0.85:
                # Add the predicted word to the final sentence
                sent = sent + pred_sent + '...'
    return sent


In [49]:
sentence = "i got this as a small sample and really loved it. it’s very easy to blend and build up coverage if needed. doesn’t look cakey, looks nice and natural on the skin. i love this tinted moisturiser, would definitely recommend trying it out"
label ="moisturiser(positive)"

#Trained model checkpoint
model_checkpoint = "bert-finetuned-ner/checkpoint-118380"
prediction = get_predictions_sentence(sentence, label, model_checkpoint)

In [50]:
print(prediction)

i love this tinted moisturiser...


In [51]:
import pandas as pd

# specify the filepath of the Excel file
filepath = r'Full Data For Text Extraction.xlsx'

# read the Excel file into a DataFrame
data = pd.read_excel(filepath)

# display the first five rows of the DataFrame

In [58]:
#test_df = data
test_df = data[300000:300500]

In [59]:
test_df.shape

(500, 6)

In [60]:
test_df.head(10)

Unnamed: 0,review_text_index,review,keywords,label,parts,Unnamed: 5
300000,79933,"While I applaud GoTo for making a physical, ...","summer, winter",season,; it was summer... In winter it's okay,
300001,79933,"While I applaud GoTo for making a physical, ...",thick,thick,"it's a bit too thick for my skin, which is pro...",
300002,79934,While I do find this is a hydrating product ...,circle,can't cover eye circles,haven't noticed any improvements in my fine li...,
300003,79934,While I do find this is a hydrating product ...,"hydrat, hydrat",hydrating/moisturising,While I do find this is a hydrating product an...,
300004,79934,While I do find this is a hydrating product ...,month,long term use,While I do find this is a hydrating product an...,
300005,79934,While I do find this is a hydrating product ...,month,months (negative),While I do find this is a hydrating product an...,
300006,79934,While I do find this is a hydrating product ...,fine,no difference (wrinkles),haven't noticed any improvements in my fine li...,
300007,79935,While I do like this product I feel it is t ...,moistur,average hydration/moisture,I perhaps need one that is more moisturising f...,
300008,79936,While I found this hydrating and it felt coo...,cool,cooling,it felt cool,
300009,79936,While I found this hydrating and it felt coo...,smell,fragrance (neutral/positive),smells amazing,


In [61]:
def get_predictions(test_df, model_chechpoint):
    # Initialize an empty list to store the predicted sentences
    Predicted_Part = []
    # Loop through the rows of the test dataframe
    for index, row in test_df.iterrows():
        # Get the sentence and label from the current row
        sentence = row['review']
        label = row['label']
        # Get the prediction for the current sentence by calling the get_predictions_sentence function
        prediction = get_predictions_sentence(sentence, label, model_checkpoint)
        # Add the prediction to the list of predicted sentences
        Predicted_Part.append(prediction)
        # Print the list of predicted sentences (for debugging purposes)
    # Add the list of predicted sentences as a new column in the test dataframe
    test_df['predictions'] = Predicted_Part
    # Return the updated test dataframe
    return test_df

# Get predictions for the test dataframe using the specified model checkpoint
pred_df = get_predictions(test_df, model_checkpoint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predictions'] = Predicted_Part


In [62]:
pred_df[['label', 'parts', 'predictions']]

Unnamed: 0,label,parts,predictions
300000,season,; it was summer... In winter it's okay,
300001,thick,"it's a bit too thick for my skin, which is pro...",its a bit too thick for my skin which is proba...
300002,can't cover eye circles,haven't noticed any improvements in my fine li...,havent noticed any improvements in my fine lin...
300003,hydrating/moisturising,While I do find this is a hydrating product an...,this is a hydrating product...id recommend thi...
300004,long term use,While I do find this is a hydrating product an...,ive been using it for 3 months...
...,...,...,...
300495,hydrating/moisturising,"With aging skin, I love the extra hydration th...",with aging skin i love the extra hydration thi...
300496,caused pimples/breakouts,Plus my skin became a little bumpy,my skin became a little bumpy which happens to...
300497,caused peeling/flaking,then I woke up the following morning with litt...,then i woke up the following morning with litt...
300498,cleansing,give this a go after my trusty Properly Clean ...,


In [63]:
pred_df.to_csv('Predictions1.csv')