In [285]:
# RAG-based Search and Analysis using LangChain

import pandas as pd
import numpy as np
from langchain import OpenAI, VectorDBQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
import os

In [286]:
# Load datasets
laptop_data = pd.read_csv('./CSV/laptop_data.csv')
fashion_data = pd.read_csv('./CSV/Fashion Dataset v2.csv')
email_summary_data = pd.read_csv('./CSV/email_thread_summaries.csv')
email_details_data = pd.read_csv('./CSV/email_thread_details.csv')

In [287]:
# Prefix conflicting columns
laptop_data = laptop_data.rename(columns={'Brand': 'Laptop_Brand', 'Price': 'Laptop_Price'})
fashion_data = fashion_data.rename(columns={'brand': 'Fashion_Brand', 'price': 'Fashion_price'})

In [288]:
# Display dataset samples
print("Laptop Data Sample:")
print(laptop_data.head())
print("\nFashion Data Sample:")
print(fashion_data.head())
print("\nEmail Summary Data Sample:")
print(email_summary_data.head())

Laptop Data Sample:
  Laptop_Brand   Model Name Core CPU Manufacturer Clock Speed RAM Size  \
0         Dell     Inspiron   i5            Intel     2.4 GHz      8GB   
1          MSI         GL65   i7            Intel     2.6 GHz     16GB   
2           HP    EliteBook   i7            Intel     2.8 GHz     16GB   
3       Lenovo      IdeaPad   i3            Intel     2.1 GHz      8GB   
4         ASUS  ZenBook Pro   i9            Intel     3.1 GHz     64GB   

  Storage Type Display Type Display Size Graphics Processor Screen Resolution  \
0          SSD          LCD        15.6"          Intel UHD         1920x1080   
1      HDD+SSD          IPS        15.6"         NVIDIA GTX         1920x1080   
2          SSD          LED          14"          Intel UHD         1920x1080   
3          HDD           TN        15.6"          Intel UHD          1366x768   
4          SSD         OLED        15.6"         NVIDIA RTX         3840x2160   

           OS Laptop Weight    Special Features 

In [289]:
# Initialize HuggingFace embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [290]:
# Combine data for embedding
laptop_data['combined_text'] = laptop_data.apply(lambda x: f"{x['Laptop_Brand']} {x['Graphics Processor']} {x['Laptop_Price']} {x['RAM Size']} {x['Storage Type']} {x['Graphics Processor']}", axis=1)
fashion_data['combined_text'] = fashion_data.apply(lambda x: f"{x['Fashion_Brand']} {x['products']} {x['name']} {x['Fashion_price']} {x['ratingCount']}", axis=1)
email_data = pd.concat([email_summary_data, email_details_data], axis=0)
email_data['combined_text'] = email_data.apply(lambda x: f"{x['subject']} {x['summary']} {x['body']}", axis=1)

In [291]:
laptop_data

Unnamed: 0,Laptop_Brand,Model Name,Core,CPU Manufacturer,Clock Speed,RAM Size,Storage Type,Display Type,Display Size,Graphics Processor,Screen Resolution,OS,Laptop Weight,Special Features,Warranty,Average Battery Life,Laptop_Price,Description,combined_text
0,Dell,Inspiron,i5,Intel,2.4 GHz,8GB,SSD,LCD,"15.6""",Intel UHD,1920x1080,Windows 10,2.5 kg,Backlit Keyboard,1 year,6 hours,35000,The Dell Inspiron is a versatile laptop that c...,"Dell Intel UHD 35,000 8GB SSD Intel UHD"
1,MSI,GL65,i7,Intel,2.6 GHz,16GB,HDD+SSD,IPS,"15.6""",NVIDIA GTX,1920x1080,Windows 10,2.3 kg,RGB Keyboard,2 years,4 hours,55000,The MSI GL65 is a high-performance laptop desi...,"MSI NVIDIA GTX 55,000 16GB HDD+SSD NVIDIA GTX"
2,HP,EliteBook,i7,Intel,2.8 GHz,16GB,SSD,LED,"14""",Intel UHD,1920x1080,Windows 11,1.5 kg,Fingerprint Sensor,3 years,8 hours,90000,The HP EliteBook is a premium laptop designed ...,"HP Intel UHD 90,000 16GB SSD Intel UHD"
3,Lenovo,IdeaPad,i3,Intel,2.1 GHz,8GB,HDD,TN,"15.6""",Intel UHD,1366x768,Windows 10,2.2 kg,Dolby Audio,1 year,5 hours,25000,The Lenovo IdeaPad is a versatile laptop that ...,"Lenovo Intel UHD 25,000 8GB HDD Intel UHD"
4,ASUS,ZenBook Pro,i9,Intel,3.1 GHz,64GB,SSD,OLED,"15.6""",NVIDIA RTX,3840x2160,Windows 10,1.8 kg,NanoEdge Display,2 years,7 hours,200000,The ASUS ZenBook Pro is a high-end laptop that...,"ASUS NVIDIA RTX 200,000 64GB SSD NVIDIA RTX"
5,Acer,Predator,i7,Intel,2.8 GHz,16GB,SSD,IPS,"17.3""",NVIDIA GTX,1920x1080,Windows 10,3.2 kg,Dual Cooling Fans,1 year,5 hours,80000,The Acer Predator is a powerhouse laptop desig...,"Acer NVIDIA GTX 80,000 16GB SSD NVIDIA GTX"
6,Microsoft,Surface Laptop,i5,Intel,1.6 GHz,8GB,SSD,PixelSense,"13.5""",Intel Iris Plus,2256x1504,Windows 11,1.3 kg,Touchscreen Display,1 year,10 hours,90000,The Microsoft Surface Laptop is a premium devi...,"Microsoft Intel Iris Plus 90,000 8GB SSD Intel..."
7,Lenovo,ThinkPad,Ryzen 7,AMD,3.0 GHz,16GB,SSD,IPS,"14""",NVIDIA GTX,2560x1440,Linux,1.6 kg,Backlit Keyboard,3 years,6 hours,60000,The Lenovo ThinkPad is a powerful laptop desig...,"Lenovo NVIDIA GTX 60,000 16GB SSD NVIDIA GTX"
8,HP,Pavilion,i5,Intel,2.3 GHz,12GB,HDD,LCD,"15.6""",Intel UHD,1366x768,Windows 10,2.1 kg,B&O Audio,1 year,4 hours,30000,The HP Pavilion is a budget-friendly laptop th...,"HP Intel UHD 30,000 12GB HDD Intel UHD"
9,ASUS,ROG Strix G,i7,Intel,2.9 GHz,16GB,SSD,IPS,"17.3""",NVIDIA RTX,1920x1080,Windows 10,2.9 kg,Aura Sync RGB Keyboard,2 years,5 hours,85000,The ASUS ROG Strix G is a high-performance gam...,"ASUS NVIDIA RTX 85,000 16GB SSD NVIDIA RTX"


In [292]:
fashion_data

Unnamed: 0,p_id,name,products,Fashion_price,colour,Fashion_Brand,img,ratingCount,avg_rating,description,p_attributes,combined_text
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Khushal K Kurta, Palazzos, Dupatta Khushal K W..."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","InWeave Kurta, Palazzos, Floral Print Dupatta ..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.161530,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Anubhutee Kurta, Trousers, Dupatta Anubhutee W..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Nayo Kurta, Trouser, Dupatta Nayo Women Red Fl..."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",AHIKA Kurta AHIKA Women Black & Green Printed ...
...,...,...,...,...,...,...,...,...,...,...,...,...
14209,15415116,Flying Machine Women Blue Solid Mock-Collar Fr...,Sweatshirt,2299.0,Blue,Flying Machine,http://assets.myntassets.com/assets/images/154...,13.0,4.076923,Blue solid front-open sweatshirt has a mock co...,"{'Body Shape ID': '443,424,324', 'Body or Garm...",Flying Machine Sweatshirt Flying Machine Women...
14210,16470114,Juelle Women Green Printed Hooded Sweatshirt,Hooded Sweatshirt,2299.0,Green,Juelle,http://assets.myntassets.com/assets/images/164...,,,"Green printed sweatshirt has a hooded, 2 pock...","{'Body Shape ID': '443,424,324', 'Body or Garm...",Juelle Hooded Sweatshirt Juelle Women Green Pr...
14211,16382150,Vero Moda Women Pink Sweatshirt,Sweatshirt,2299.0,Pink,Vero Moda,http://assets.myntassets.com/assets/images/163...,,,"Pink solid sweatshirt has a mock collar, 2 ka...","{'Body Shape ID': '443,424,324', 'Body or Garm...",Vero Moda Sweatshirt Vero Moda Women Pink Swea...
14212,16379664,Vero Moda Women Blue Sweatshirt,Sweatshirt,2299.0,Blue,Vero Moda,http://assets.myntassets.com/assets/images/163...,,,"Blue solid sweatshirt has a round neck, long s...","{'Body Shape ID': '443,424,324', 'Body or Garm...",Vero Moda Sweatshirt Vero Moda Women Blue Swea...


In [293]:
# Create and store embeddings using Chroma
laptop_db = Chroma.from_texts(laptop_data['combined_text'].tolist(), embedding_model)
fashion_db = Chroma.from_texts(fashion_data['combined_text'].tolist(), embedding_model)
email_db = Chroma.from_texts(email_data['combined_text'].tolist(), embedding_model)


In [294]:
laptop_db

<langchain_community.vectorstores.chroma.Chroma at 0x3fd6bfbb0>

In [None]:
# Set API Key (Make sure to set your actual key)
os.environ["OPENAI_API_KEY"] = ""

In [296]:
# Perform search using LangChain
llm = OpenAI()
# Example Chain
#combine_documents_chain = StuffDocumentsChain(llm=llm)
# laptop_qa = RetrievalQA(llm=llm,retriever=laptop_db.as_retriever())
# fashion_qa = RetrievalQA(llm=llm, retriever=fashion_db.as_retriever())
# email_qa = RetrievalQA(llm=llm, retriever=email_db.as_retriever())

In [297]:
# Ensure the retriever is correctly set
laptop_retriever = laptop_db.as_retriever()
fashion_retriever = fashion_db.as_retriever()
email_retriever = email_db.as_retriever()

In [298]:
# Initialize RetrievalQA
laptop_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=laptop_retriever)
fashion_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=fashion_retriever)
email_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=email_retriever)

In [301]:
# Example Queries
print("\nLaptop Search Result:")
print(laptop_qa.run("Recommend a gaming laptop with 16GB RAM and SSD storage"))






Laptop Search Result:
 The Acer NVIDIA GTX 80,000 with 16GB of RAM and SSD storage would be a great option for gaming performance.


In [303]:
print("\nEmail Search Result:")

print(email_qa.run('Find discussion about Gas'))


Email Search Result:
 The email suggests that there was a discussion about natural gas in California and that the sender, Jennifer, has made revisions to a document based on this discussion.


In [304]:
print("\nFashion Search Result:")
print(fashion_qa.run("Suggest cotton dresses under $5000"))


Fashion Search Result:
 I'm sorry, I don't know the answer to that question.


In [None]:
print("\nFashion Search Result:")
print(fashion_qa.run("Suggest summer cotton dress in color yellow"))





Fashion Search Result:
 Some potential summer cotton dress options in yellow could include a sleeveless A-line dress, a flowy maxi dress, or a casual sundress with spaghetti straps.

Fashion Search Result:
 I'm sorry, I don't have enough information to suggest a specific summer dress in yellow for less than $500. The context provided only mentions an unstitched dress material, not a specific dress or price range. 


In [308]:
print("\nFashion Search Result:")
print(fashion_qa.run("Suggest summer dress in color yellow"))


Fashion Search Result:
 It is likely that the Stylee LIFESTYLE dress material mentioned in the context is a yellow summer dress. However, without more information or images, it is difficult to make a specific recommendation.


In [307]:
print("\nFashion Search Result:")
print(fashion_qa.run("Please suggest embroidery kurta?"))


Fashion Search Result:
 Based on the given context, it appears that the "W Kurta" brand offers an A-Line kurta for women with glitter print, floral embroidery, and sequin detail. This could be a potential option for an embroidery kurta. However, it is always best to do further research and compare different brands and styles before making a purchase decision.


In [None]:
# Save results
laptop_data.to_csv('processed_laptop_data.csv', index=False)
fashion_data.to_csv('processed_fashion_data.csv', index=False)
email_data.to_csv('processed_email_data.csv', index=False)


In [None]:
import pandas as pd

def analyze_data(df, query_type):
    """Perform data analytics based on query type."""
    
    # Ensure 'Price' is converted to numeric
    if 'Laptop_Price' in df.columns:
        df['Laptop_Price'] = pd.to_numeric(df['Laptop_Price'], errors='coerce')
        df = df.dropna(subset=['Laptop_Price'])

    if query_type == 'max_Laptop_Price_by_Graphics_Processor':
        if 'Graphics Processor' not in df.columns:
            return "Column 'Graphics Processor' not found in dataset."
        #df_filtered = df[df['Graphics Processor'].str.contains('Intel i7', na=False)]
        # Get the index of the highest-priced laptop for each processor type
        idx = df.groupby('Graphics Processor')['Laptop_Price'].idxmax()
        df_filtered = df.loc[idx]
        if df_filtered.empty:
            return "No laptops found with Intel i7 processor."
        #return df_filtered.nlargest(1, 'Laptop_Price')
        return  df.loc[idx]
    elif query_type == 'max_Laptop_Price_by_Core':
        if 'Core' not in df.columns:
            return "Column 'Core' not found in dataset."
        df_filtered = df[df['Core'].str.contains('i7', na=False)]
        if df_filtered.empty:
            return "No laptops found with i7."
        return df_filtered.nlargest(1, 'Price')

    elif query_type == 'mean_Fashion_price_by_Fashion_Brand':
        if 'Fashion_Brand' not in df.columns or 'Fashion_price' not in df.columns:
            return "Ensure 'Fashion_Brand' and 'Fashion_price' columns are present in the fashion dataset."
        return df.groupby('Fashion_Brand')['Fashion_price'].mean()

    elif query_type == 'summarize_emails_natural_gas':
        if 'summary' not in df.columns:
            return "Column 'summary' not found in the email dataset."
        gas_emails = df[df['summary'].str.contains('Natural Gas', case=False, na=False)]
        if gas_emails.empty:
            return "No emails found related to 'Natural Gas'."
        return email_qa.run(f"Summarize these emails: {gas_emails['summary'].tolist()[:5]}") # Limiting to 5 emails

    else:
        return "Invalid query type."

In [None]:
# Perform analytics
print(analyze_data(laptop_data, 'max_Laptop_Price_by_Graphics_Processor'))
print(analyze_data(fashion_data, 'mean_Fashion_price_by_Fashion_Brand'))
print(analyze_data(email_data, 'summarize_emails_natural_gas'))

No laptops found with Intel i7 processor.
Fashion_Brand
109F           1832.333333
20Dresses      2885.697674
250 DESIGNS    3499.000000
3PIN           1999.000000
513            2099.000000
                  ...     
trueBrowns     2999.000000
urSense        1999.000000
wild U         2390.000000
zebu           1499.000000
zink Z         2699.000000
Name: Fashion_price, Length: 1022, dtype: float64
 The email thread discusses various topics including concerns about finding a replacement for Andy Fastow at Northern Natural Gas, the impact of a natural gas deal in California on the market, confirmation of compliance with rules, a job opening at Duke Energy Field Services, and a request for proposals from natural gas suppliers by the Public Energy Authority of Kentucky.
