In [6]:
import os
import kaggle
import pandas as pd
import openai
from dotenv import load_dotenv

# Ensure kaggle.json is in the right location
os.environ['KAGGLE_CONFIG_DIR'] = '/path/to/dir'  # Update with your path

# Example to download a dataset
# The dataset 'username/dataset' is just an example, replace with actual dataset
# kaggle.api.dataset_download_files('snap/amazon-fine-food-reviews', path='data/', unzip=True)

# We can directly upload our keys using a .env
load_dotenv()


True

In [7]:
df = pd.read_csv("data/Reviews.csv")
df_short = df.head(10)
df_short

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


# SUMMARY
LLMs can efficiently summarize large volumes of text, such as reports, articles, or lengthy documents. This can be particularly useful for quickly understanding key points and themes in extensive data sets. 

In our case, it is way better to have a first summary of the review rather than the whole review. So, GPT can deal with it in seconds. 

And our only - and most important task - will be crafting a good prompt. 


In [8]:
# Summary funnction
def summary_maker(review):
    try:
        response = openai.completions.create(
            model="text-davinci-003",  # You can choose the model version
            prompt=f"Summarize the following review: \"{review}\" with a 3 words sentence.",
            max_tokens=60
        )

        sentiment = response.choices[0].text.strip()
        return sentiment
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [11]:
df_short["Summary_new"] = df_short.apply(lambda x:summary_maker(x["Text"]), axis=1)
df_short

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Summary_new
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Good quality stew-like.
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Inaccurate product labeled.
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"Delicious, delectable, decadent."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Medicinal cherry soda.
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Delicious taffy cheap.
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,Taffy- tasty treat.
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,Taffy highly recommended.
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,Delicious taffy!
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,Cats love grass.
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,Good for dogs.


# EXTRACT POLARITY
These models can be used for sentiment analysis, determining the tone and sentiment of text data such as customer reviews, social media posts, or feedback surveys. 

The most simple, yet most used, classification of all time is polarity. 

- Positive reviews or why are people happy with the product. 
- Negative reviews or why are they upset.
- Neutral or why people are indifferent with the product.

By analyzing these sentiments, businesses can gauge public opinion, customer satisfaction, and market trends. So, instead of having a person decide for each review, we can have our friend GPT to classify them for us. 


In [13]:
# Analyze sentiment function.
def analyze_sentiment(review):
    try:
        response = openai.completions.create(
            model="text-davinci-003",  # You can choose the model version
            prompt=f"Analyze the sentiment of this review: \"{review}\". Is it positive, negative, or neutral?",
            max_tokens=60
        )

        sentiment = response.choices[0].text.strip()
        return sentiment
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [14]:
df_short["Polarity"] = df_short.apply(lambda x:analyze_sentiment(x["Text"]), axis=1)
df_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short["Polarity"] = df_short.apply(lambda x:analyze_sentiment(x["Text"]), axis=1)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Summary_new,Polarity
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Good quality stew-like.,Positive.
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Inaccurate product labeled.,Neutral.
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"Delicious, delectable, decadent.",Positive.
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Medicinal cherry soda.,Neutral
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Delicious taffy cheap.,Positive
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,Taffy- tasty treat.,Positive
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,Taffy highly recommended.,Positive
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,Delicious taffy!,Positive
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,Cats love grass.,Neutral.
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,Good for dogs.,Positive


# Thematic analysis
LLMs can identify and categorize themes or topics within large datasets. This is particularly useful for qualitative data analysis, where you might need to sift through vast amounts of text to understand common themes, trends, or patterns.

When analyzing reviews, it can be useful to understand the main purpose of the review. Some users will be complaining about something (service, quality, cost…), some users will be rating their experience with the product (either in a good or a bad way) and some others will be performing questions. 



In [19]:
# Analyze sentiment function.
def theme_analyzer(review):
    try:
        response = openai.completions.create(
            model="text-davinci-003",  # You can choose the model version
            prompt=f"""Analyze the following review: \"{review}\". 
            Is it a complain, a experience-based review or a question.
            Answer with a single word: 
            - Complain
            - Experience
            - Question""",
            max_tokens=60
        )

        sentiment = response.choices[0].text.strip()
        return sentiment
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [20]:
df_short["Theme"] = df_short.apply(lambda x:theme_analyzer(x["Text"]), axis=1)
df_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short["Theme"] = df_short.apply(lambda x:theme_analyzer(x["Text"]), axis=1)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Summary_new,Polarity,Theme
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Good quality stew-like.,Positive.,Experience
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Inaccurate product labeled.,Neutral.,Complain
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"Delicious, delectable, decadent.",Positive.,Experience
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Medicinal cherry soda.,Neutral,Experience
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Delicious taffy cheap.,Positive,Experience
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,Taffy- tasty treat.,Positive,Experience
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,Taffy highly recommended.,Positive,Experience
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,Delicious taffy!,Positive,Experience
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,Cats love grass.,Neutral.,Experience
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,Good for dogs.,Positive,Experience


df_short["Polarity"] = df_short.apply(lambda x:analyze_sentiment(x["Text"]), axis=1)
df_short

# Keyword extraction
LLMs can be used to extract keywords. This means, detecting any element we ask for. 

Imagine for instance that we want to understand if the product where the review is attached is the product the user is talking about. To do so, we need to detect what product is the user reviewing. 
And again… we can ask our GPT model to find out the main product the user is talking about. 



In [21]:
# Analyze sentiment function.
def keyword_extraction(review):
    try:
        response = openai.completions.create(
            model="text-davinci-003",  # You can choose the model version
            prompt=f"""Analyze the following review: \"{review}\". 
            Identify the product that the user is reviewing and answer the product name with one or two words""",
            max_tokens=60
        )

        sentiment = response.choices[0].text.strip()
        return sentiment
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [22]:
df_short["Product"] = df_short.apply(lambda x:keyword_extraction(x["Text"]), axis=1)
df_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short["Product"] = df_short.apply(lambda x:keyword_extraction(x["Text"]), axis=1)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Summary_new,Polarity,Theme,Product
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Good quality stew-like.,Positive.,Experience,:\n\nDog food
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Inaccurate product labeled.,Neutral.,Complain,Peanuts
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"Delicious, delectable, decadent.",Positive.,Experience,.\n\nTurkish Delight
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,Medicinal cherry soda.,Neutral,Experience,.\n\nRobitussin
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,Delicious taffy cheap.,Positive,Experience,Taffy
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,Taffy- tasty treat.,Positive,Experience,Taffy
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,Taffy highly recommended.,Positive,Experience,.\n\nSaltwater Taffy
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,Delicious taffy!,Positive,Experience,: \n\nTaffy
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,Cats love grass.,Neutral.,Experience,.\n\nCat Grass
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...,Good for dogs.,Positive,Experience,.\n\nDog food.
