### Importing required libraries

In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from transformers import CLIPProcessor, CLIPModel

  from .autonotebook import tqdm as notebook_tqdm


### Loading Dataset and Preprocessing

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
review_df = pd.read_csv('./Reviews.csv')
review_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [5]:
review_df['Review'] = review_df['Summary'].astype(str) + " " + review_df['Text'].astype(str)
review_df['Review'] = review_df['Review'].apply(lambda x: x.lower())

In [6]:
words = review_df['Review'].str.split()
    
# Join the first n words of each sentence together using the apply() function
first_n_words = words.apply(lambda x: ' '.join(x[:40]))

# Replace the original column with the new column containing the first n words
review_df['Review_CLIP'] = first_n_words

In [7]:
review_df['Review_CLIP'] = review_df['Review_CLIP'].replace('[^\w\s]|_', '', regex=True)
review_df['Review_CLIP'] = review_df['Review_CLIP'].replace(',', '', regex=True)
review_df['Review_CLIP'] = review_df['Review_CLIP'].replace("'", "", regex=True)

In [8]:
subset = review_df[review_df["Review_CLIP"].apply(lambda x : 4 < len(str(x).split()) < 40)]
subset["Review_CLIP"] = subset["Review_CLIP"].apply(lambda x : " ".join(x.split()))
subset['Review_CLIP'] = subset['Review_CLIP'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset["Review_CLIP"] = subset["Review_CLIP"].apply(lambda x : " ".join(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Review_CLIP'] = subset['Review_CLIP'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)


In [9]:
subset.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               10
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
Review                     0
Review_CLIP                0
dtype: int64

In [10]:
review_df.shape

(568454, 12)

### Loading CLIP

In [11]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

In [12]:
device = "cpu"
model.to(device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05, element

### Creating Embedidngs

In [13]:
"""
Creating embeddings for the sentiment that the review indicates
"""

'\nCreating embeddings for the sentiment that the review indicates\n'

In [14]:
sentiment_list = ['positive', 'negative', 'neutral']
sentiment_list = [f'{sentiment} review' for sentiment in sentiment_list]

In [15]:
sentiment_embeddings = processor(
    text=sentiment_list,
    padding=True,
    images=None,
    return_tensors='pt'
).to(device)
sentiment_embeddings = model.get_text_features(**sentiment_embeddings)
sentiment_embeddings = sentiment_embeddings.detach().numpy() / np.linalg.norm(sentiment_embeddings.detach().numpy(), ord=2, axis=-1, keepdims=True)

In [16]:
import time
import torch
# Use the fabric label embeddings to predict the fabric type for each description
start_time = time.time()
batch_size = 32  
sentiment_output = []

review_description = list(subset['Review_CLIP'])

for i in range(0, len(review_description), batch_size):
    try:
        batch = review_description[i:i+batch_size]
        description_encode = processor(
            text=batch,
            padding=True,
            images=None,
            return_tensors='pt'
        ).to(device)

        description_encode = model.get_text_features(**description_encode)
        description_encode = description_encode.detach().numpy() / np.linalg.norm(description_encode.detach().numpy(), ord=2, axis=-1, keepdims=True)
        predicted_classes_distribution = np.dot(sentiment_embeddings, description_encode.T)
        predicted = [sentiment_list[k] for k in np.argmax(predicted_classes_distribution, axis=0)]
        
        # check that predicted has same length as batch
        if len(predicted) == len(batch):
            sentiment_output.extend(predicted)
        else:
            # if predicted has different length, fill invalid values
            sentiment_output.extend(['invalid']*len(batch))
    except RuntimeError:
        try:
            batch = [" ".join(desc.split()[:25]) for desc in review_description[i:i+batch_size]]
            description_encode = processor(
                text=batch,
                padding=True,
                images=None,
                return_tensors='pt'
            ).to(device)

            description_encode = model.get_text_features(**description_encode)
            description_encode = description_encode.detach().numpy() / np.linalg.norm(description_encode.detach().numpy(), ord=2, axis=-1, keepdims=True)
            predicted_classes_distribution = np.dot(sentiment_embeddings, description_encode.T)
            predicted = [sentiment_list[k] for k in np.argmax(predicted_classes_distribution, axis=0)]
            
            # check that predicted has same length as batch
            if len(predicted) == len(batch):
                sentiment_output.extend(predicted)
            else:
                # if predicted has different length, fill invalid values
                sentiment_output.extend(['invalid']*len(batch))
        except RuntimeError:
            batch = review_description[i:i+batch_size]
            for b in batch:
                try:
                    description_encode = processor(
                        text=b,
                        padding=True,
                        images=None,
                        return_tensors='pt'
                    ).to(device)
                    description_encode = model.get_text_features(**description_encode)
                    description_encode = description_encode.detach().numpy() / np.linalg.norm(description_encode.detach().numpy(), ord=2, axis=-1, keepdims=True)
                    predicted_classes_distribution = np.dot(sentiment_embeddings, description_encode.T)
                    predicted = [sentiment_list[k] for k in np.argmax(predicted_classes_distribution, axis=0)]
                    
                    # check that predicted has same length as batch
                    if len(predicted) == 1:
                        sentiment_output.append(predicted[0])
                    else:
                        sentiment_output.append('invalid')
                except:
                     sentiment_output.append('invalid')


Token indices sequence length is longer than the specified maximum sequence length for this model (80 > 77). Running this sequence through the model will result in indexing errors


In [17]:
len(sentiment_output)

216943

In [18]:
subset['sentiment'] = sentiment_output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['sentiment'] = sentiment_output


In [19]:
subset['sentiment'].unique()

array(['neutral review', 'positive review', 'negative review', 'invalid'],
      dtype=object)

In [21]:
subset

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Review,Review_CLIP,sentiment
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,not as advertised product arrived labeled as j...,not as advertised product arrived labeled as j...,neutral review
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"""delight"" says it all this is a confection tha...",delight says it all this is a confection that ...,positive review
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great taffy at a great price. the...,great taffy great taffy at a great price there...,positive review
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,"wonderful, tasty taffy this taffy is so good. ...",wonderful tasty taffy this taffy is so good it...,positive review
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,yay barley right now i'm mostly just sprouting...,yay barley right now im mostly just sprouting ...,neutral review
...,...,...,...,...,...,...,...,...,...,...,...,...,...
568441,568442,B000NY8O9M,AZRHU8CP5XKMF,David L. Brown,0,0,5,1264204800,Great For Fast Gulasch!,Quick and easy! Had similar Gulasch in Guest H...,great for fast gulasch! quick and easy! had si...,great for fast gulasch quick and easy had simi...,positive review
568442,568443,B006T7TKZO,A3BOURUK79CYY5,BIH,0,0,5,1338854400,Great Cafe Latte,This product is great. Gives you so much ener...,great cafe latte this product is great. gives...,great cafe latte this product is great gives y...,positive review
568448,568449,B001EO7N10,A1F6BHEYB7R6R7,James Braley,0,0,5,1308096000,Very large ground spice jars.,My only complaint is that there's so much of i...,very large ground spice jars. my only complain...,very large ground spice jars my only complaint...,positive review
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,will not do without great for sesame chicken.....,will not do without great for sesame chickenth...,positive review


In [20]:
subset[subset['sentiment'] == 'negative review']

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Review,Review_CLIP,sentiment
23,24,B001GVISJM,AJ613OLZZUG7V,Mare's,0,0,5,1304467200,Twizzlers,I love this candy. After weight watchers I ha...,twizzlers i love this candy. after weight wat...,twizzlers i love this candy after weight watch...,negative review
26,27,B001GVISJM,A3RXAU2N8KV45G,lady21,0,1,1,1332633600,Nasty No flavor,"The candy is just red , No flavor . Just plan...","nasty no flavor the candy is just red , no fla...",nasty no flavor the candy is just red no flavo...,negative review
53,54,B000G6RPMY,AQ9DWWYP2KJCQ,"Roel Trevino ""protomex""",0,0,3,1278028800,not ass kickin,we're used to spicy foods down here in south t...,not ass kickin we're used to spicy foods down ...,not ass kickin were used to spicy foods down h...,negative review
56,57,B004N5KULM,A202WR509428VF,amateur amazon shopper,2,2,5,1322438400,Awesome Deal!,Deal was awesome! Arrived before Halloween as...,awesome deal! deal was awesome! arrived befor...,awesome deal deal was awesome arrived before h...,negative review
60,61,B004N5KULM,A1ZR8O62VSU4OK,"Lisa J. Szlosek ""lisa""",2,4,3,1318723200,Better price for this at Target,Watch your prices with this. While the assort...,better price for this at target watch your pri...,better price for this at target watch your pri...,negative review
...,...,...,...,...,...,...,...,...,...,...,...,...,...
568353,568354,B003O5Q3KE,A2U4L18LOGSHQ1,"Larry L. Wieskamp ""larwie""",0,0,2,1291161600,doggy crackers,These were kind of expensive for a dog treat a...,doggy crackers these were kind of expensive fo...,doggy crackers these were kind of expensive fo...,negative review
568403,568404,B001EQ5O6Y,A2891E3BMAKGYN,"PCNiles ""reader/writer""",0,6,2,1264118400,"Deceptive Term = ""Sticks""","When I ordered these, based on the description...","deceptive term = ""sticks"" when i ordered these...",deceptive term sticks when i ordered these bas...,negative review
568415,568416,B000JT45IA,AEW1UWTI3MCVN,"John S ""Johnny""",0,0,5,1177545600,Hands down the best dressing I've ever had...,as it should be for $6 a bottle.,hands down the best dressing i've ever had... ...,hands down the best dressing ive ever had as i...,negative review
568430,568431,B001FPT1WM,A1XDMZMMOAMR7,nyxport,0,0,5,1345161600,delicious,This product is a bit pricey for the amt. rece...,delicious this product is a bit pricey for the...,delicious this product is a bit pricey for the...,negative review
