In [26]:
# !apt-get update
# !apt-get install libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit
# !pip install amazon-codewhisperer-jupyterlab-ext --upgrade

##############################

# !pip install pymongo
# !pip install openai
# !pip install sasl
# !pip install thrift
# !pip install thrift-sasl
# !pip install PyHive
# !pip install SQLAlchemy
# !pip install fuzzywuzzy
# !pip install python-Levenshtein
# !pip install pandas

In [27]:
import openai
import pymongo
import pandas as pd
from pyhive import hive
from sqlalchemy.engine import create_engine
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from datetime import date

In [28]:
# Define configuration
openai_key = "api-key"
hive_host = "url"
hive_port = 10000
hive_username = "hive"
hive_url = "hive://hive@url:10000/default"
mongodb_url = "mongodb://root:password@url:27017/"
mongodb_database = "streamlit"
product_dataset = "product"

In [29]:
# define a function to call chat gpt 
def call_chat_gpt(query, openai_key):
    openai.api_key = openai_key
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
            {"role": "user", "content": query},
        ]
    )
    return response['choices'][0]['message']['content']

In [30]:
# get trending item from hive
# conn = hive.Connection(host=hive_host, port=hive_port, username=hive_username)
engine = create_engine(hive_url)

keywords_df = pd.read_sql("SELECT * FROM trending ORDER BY trend_date DESC LIMIT 1", engine)
product_df = pd.read_sql("SELECT * FROM {}".format(product_dataset), engine)
product2_df = pd.read_sql("SELECT * FROM product2", engine)

In [31]:
# Proprocessing on the keywords dataframe
keywords = keywords_df.loc[:, 'keywords'][0].replace('[', '').replace(']', '') #.replace("'", '').split(',')
print(keywords)

Malta vs England, Poland vs Germany, Tesla, F1, KidZania, Kick, Fall, Azizulhasni Awang, Nicolas Jackson, Boris Johnson, Black Clover: Sword of the Wizard King, Ross Butler, Ted Lasso, Kourtney Kardashian, France, Extraction 2, Happy Father's Day, Adipurush film, England vs Australia, Spirit


In [32]:
# remove the first row of product_df as it contains only column name
product_df = product_df.iloc[1:]
print(product_df.head())

   product_id                                       product_name  \
1         1.0  Samsung 7 kg Fully-Automatic Top Loading Washi...   
2         2.0  Samsung 7 Kg Inverter 5 Star Fully-Automatic T...   
3         3.0  Samsung 6.5 Kg 5 Star Inverter Fully-Automatic...   
4         4.0  LG 7 Kg 5 Star Inverter Fully-Automatic Top Lo...   
5         5.0  Samsung 7.0 Kg 5 Star Semi-Automatic Top Loadi...   

  product_main_category product_sub_category  \
1            appliances     Washing Machines   
2            appliances     Washing Machines   
3            appliances     Washing Machines   
4            appliances     Washing Machines   
5            appliances     Washing Machines   

                                  product_image_link  \
1  https://m.media-amazon.com/images/I/510mV2GAtk...   
2  https://m.media-amazon.com/images/I/61Ct6+KF4A...   
3  https://m.media-amazon.com/images/I/61Mt19diw9...   
4  https://m.media-amazon.com/images/I/71xTwKIBX-...   
5  https://m.media-ama

In [33]:
message = "Can you recommend product to buy based on the given word, be short and concise, using ':' as delimiter for each item during answer: "
message += keywords
# print(message)

gpt_response = call_chat_gpt(message, openai_key)
print(gpt_response)

Malta vs England: England
Poland vs Germany: Germany
Tesla: Model S
F1: Mercedes
KidZania: Roleplay
Kick: Soccer
Fall: Autumn
Azizulhasni Awang: Cycling
Nicolas Jackson: Actor
Boris Johnson: Prime Minister
Black Clover: Sword of the Wizard King: Anime
Ross Butler: Actor
Ted Lasso: TV show
Kourtney Kardashian: Reality star
France: Paris
Extraction 2: Action movie
Happy Father's Day: Gifts
Adipurush film: Bollywood
England vs Australia: Cricket
Spirit: Horse


In [34]:
# extract the possible product keywords from gpt response
product_keywords = []

for line in gpt_response.split('\n'):
    splitted_line = line.split(':')
    product_keywords.append(splitted_line[1].replace(' ', '', 1))
    
print(product_keywords)

['England', 'Germany', 'Model S', 'Mercedes', 'Roleplay', 'Soccer', 'Autumn', 'Cycling', 'Actor', 'Prime Minister', 'Sword of the Wizard King', 'Actor', 'TV show', 'Reality star', 'Paris', 'Action movie', 'Gifts', 'Bollywood', 'Cricket', 'Horse']


In [35]:
# loop through product_df to find similar product for given product_keyword
def find_similar_product(product_keyword, products_df):
    # max_ratio = 0
    # max_ratio_product = None
    recommended_product_df = pd.DataFrame(columns=["ratio_score", "product_id"])
    products_dict = products_df.to_dict('records') 
    
    for row in products_dict[:]:
        ratio = fuzz.token_sort_ratio(row['product_name'], product_keyword)
        # print(row['product_name'])
        # print(ratio)

        # if ratio > max_ratio:
        #     max_ratio = ratio
        #     max_ratio_product = row['product_id']

        if ratio > 60:
            recommended_product_df = pd.concat([recommended_product_df, pd.DataFrame({"ratio_score": [ratio], "product_id": [row['product_id']]})])
        
    return recommended_product_df.sort_values(by=['ratio_score'], ascending=False)

# find_similar_product(product_keywords[0], product_df.loc[:, ['product_id', 'product_name']])

In [36]:
# save to mongodb function
def save_to_mongodb(item_to_insert, mongodb_url, database_name, collection_name):
    client = pymongo.MongoClient(mongodb_url)
    db = client[database_name]
    collection = db[collection_name]

    # Insert or update the product in the MongoDB collection
    result = collection.insert_many(item_to_insert)
    client.close()
    
    return result

In [37]:
def drop_mongodb_collection(mongodb_url, database_name, collection_name):
    client = pymongo.MongoClient(mongodb_url)
    db = client[database_name]
    collection = db[collection_name]

    # Drop the collection
    result = collection.drop()
    client.close()

In [38]:
# loop through product_keywords to find similar product for all keywords
def find_all_similar_product(product_keywords, product_df):
    all_similar_product_df = pd.DataFrame(columns=["ratio_score", "product_id"])

    for idx, product_keyword in enumerate(product_keywords):

        # find similar product based on product_keyword
        similar_product_df = find_similar_product(product_keyword, product_df)
        all_similar_product_df = pd.concat([all_similar_product_df, similar_product_df])

        if idx % 5 == 0:
            print('{} % completed. Loading.'.format(idx / 20 * 100))

        if idx == len(product_keywords) - 1:
            print('100 % completed. Done')
    
    return all_similar_product_df

In [39]:
all_similar_product_id_df = find_all_similar_product(product_keywords, product_df.loc[:, ['product_id', 'product_name']])

0.0 % completed. Loading.
25.0 % completed. Loading.
50.0 % completed. Loading.
75.0 % completed. Loading.
100 % completed. Done


In [40]:
all_similar_product_id_list = all_similar_product_id_df.loc[:, 'product_id'].to_list()
print(all_similar_product_id_list)

[169119.0, 158309.0]


In [41]:
recommended_product_list = []

for product_id in all_similar_product_id_list:
    item = product_df[product_df['product_id']==product_id].to_dict('records')
    recommended_product_list.append(item[0])
    
# recommended_product_list

In [42]:
processed_keywords = {}
processed_keywords["date"] = keywords_df.loc[:, 'trend_date'][0]
processed_keywords["Keywords"] = keywords_df.loc[:, 'keywords'][0].replace('[', '').replace(']', '').split(', ')
# print(processed_keywords)

In [43]:
# Save to Mongodb
drop_mongodb_collection(mongodb_url, mongodb_database, "recommended_product_1")
drop_mongodb_collection(mongodb_url, mongodb_database, "trending_item")

insert_recommended_product_result = save_to_mongodb(recommended_product_list, mongodb_url, mongodb_database, "recommended_product_1")
insert_trending_item_result = save_to_mongodb([processed_keywords], mongodb_url, mongodb_database, "trending_item")
insert_gpt_response_result = save_to_mongodb([{'gpt_response': product_keywords, 'date': date.today().strftime("%Y-%m-%d")}], mongodb_url, mongodb_database, "gpt_response")

In [44]:
##################################################################
# This section is for product2
product2_df = product2_df.iloc[1:]

all_similar_product2_id_df = find_all_similar_product(product_keywords, product2_df.loc[:, ['product_id', 'product_name']])
all_similar_product2_id_list = all_similar_product2_id_df.loc[:, 'product_id'].to_list()

recommended_product2_list = []

for product_id in all_similar_product2_id_list:
    item = product2_df[product2_df['product_id']==product_id].to_dict('records')
    recommended_product2_list.append(item[0])


drop_mongodb_collection(mongodb_url, mongodb_database, "recommended_product_2")
insert_recommended_product2_result = save_to_mongodb(recommended_product2_list, mongodb_url, mongodb_database, "recommended_product_2")

0.0 % completed. Loading.
25.0 % completed. Loading.
50.0 % completed. Loading.
75.0 % completed. Loading.
100 % completed. Done


In [45]:
# Logging
with open('recommended_product_mongodb_result.log', 'a') as f:
    f.write("Date: {} \t Insert Recommended Product: {} \t Insert Trending Item: {} \t Insert Trending Item 2: {} \n"
            .format(date.today().strftime("%Y-%m-%d"), insert_recommended_product_result.acknowledged, 
                    insert_trending_item_result.acknowledged, insert_recommended_product2_result.acknowledged))

In [46]:
# if __name__ == '__main__':
#     print("hello world")

In [47]:


# import requests
# import pymongo

# # Task 1: Call GPT API for product recommendation
# def call_gpt_api(query):
#     api_key = "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"
#     api_endpoint = "https://api.openai.com/v1/engines/davinci-codex/completions"
    
#     headers = {
#         "Authorization": f"Bearer {api_key}",
#         "Content-Type": "application/json"
#     }
    
#     data = {
#         "prompt": query,
#         "max_tokens": 50,  # Adjust the value as needed
#         "temperature": 0.7  # Adjust the value as needed
#     }
    
#     response = requests.post(api_endpoint, headers=headers, json=data)
#     if response.status_code == 200:
#         return response.json()["choices"][0]["text"].strip()
#     else:
#         raise Exception("Failed to call GPT API")

# # Task 2: Find relevant product in data warehouse
# def find_relevant_product(feedback):
#     # Connect to the Hadoop data warehouse 
    
#     # Execute the necessary query on the data warehouse
#     # to find the relevant product based on the feedback
    
#     relevant_product = None  # Replace with the retrieved product information
    
#     return relevant_product

# # Task 3: Save recommended product to MongoDB
# def save_to_mongodb(product):
#     client = pymongo.MongoClient("mongodb://localhost:27017/")
#     db = client["your_database_name"]
#     collection = db["your_collection_name"]
    
#     # Transform the product data if needed
    
#     # Insert or update the product in the MongoDB collection
#     collection.insert_one(product)  # Replace with appropriate insert/update operation
    
#     client.close()

# # Example usage
# query = "Can you recommend a romantic movie?"

# # Task 1: Call GPT API
# recommended_product = call_gpt_api(query)

# # Task 2: Find relevant product
# relevant_product = find_relevant_product(recommended_product)

# # Task 3: Save recommended product to MongoDB
# save_to_mongodb(relevant_product)


In [48]:
# def call_gpt_api(query):
#     api_key = "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"
#     api_endpoint = "https://api.openai.com/v1/engines/davinci-codex/completions"
    
#     headers = {
#         "Authorization": f"Bearer {api_key}",
#         "Content-Type": "application/json"
#     }
    
#     data = {
#         "prompt": query,
#         "max_tokens": 50,
#         "temperature": 0.7
#     }
    
#     response = requests.post(api_endpoint, headers=headers, json=data)
#     if response.status_code == 200:
#         return response.json()["choices"][0]["text"].strip()
#     else:
#         raise Exception("Failed to call GPT API")

# # Example usage
# query = "Can you recommend a laptop under $1000 for gaming?"
# recommended_product = call_gpt_api(query)
