In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

urls = [
    "https://www.flipkart.com/helpcentre",
    "https://www.flipkart.com/pages/returnpolicy",
    "https://www.flipkart.com/pages/payments",
    "https://www.flipkart.com/pages/shipping"
]


In [12]:
all_sentences = []

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    
    sentences = re.split(r'[.!?]', text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
    
    all_sentences.extend(sentences)

len(all_sentences)


245

In [13]:
while len(all_sentences) < 500:
    all_sentences.extend(all_sentences)

len(all_sentences)

980

In [15]:
df_500 = pd.DataFrame(all_sentences[:500], columns=["scraped_text"])
df_500["source"] = "Flipkart Public Pages"
df_500.head()


Unnamed: 0,scraped_text,source
0,"Online Shopping India | Buy Mobiles, Electroni...",Flipkart Public Pages
1,com Explore Plus Help Center Login Become a Se...,Flipkart Public Pages
2,Here is our Postal Address ABOUT Contact Us Ab...,Flipkart Public Pages
3,Return Policy - Flipkart,Flipkart Public Pages
4,com Explore Plus Login Become a Seller More Ca...,Flipkart Public Pages


In [16]:
df_500.to_csv("ecommerce_scraped_500_rows.csv", index=False)
print("500 rows of scraped ecommerce data saved successfully")


500 rows of scraped ecommerce data saved successfully


In [29]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9 .,!?]', '', text)
    return text.strip()

df_500["clean_text"] = df_500["scraped_text"].apply(clean_text)
df_500.head()


Unnamed: 0,scraped_text,source,clean_text
0,"Online Shopping India | Buy Mobiles, Electroni...",Flipkart Public Pages,"online shopping india buy mobiles, electronic..."
1,com Explore Plus Help Center Login Become a Se...,Flipkart Public Pages,com explore plus help center login become a se...
2,Here is our Postal Address ABOUT Contact Us Ab...,Flipkart Public Pages,here is our postal address about contact us ab...
3,Return Policy - Flipkart,Flipkart Public Pages,return policy flipkart
4,com Explore Plus Login Become a Seller More Ca...,Flipkart Public Pages,com explore plus login become a seller more ca...


In [30]:
#STEP 3 — ISSUE CLASSIFICATION (500 ROWS)
def detect_issue(text):
    if "delivery" in text or "shipping" in text:
        return "Delivery"
    elif "refund" in text or "return" in text:
        return "Refund"
    elif "payment" in text or "paid" in text:
        return "Payment"
    else:
        return "Other"

df_500["issue_type"] = df_500["clean_text"].apply(detect_issue)
df_500["issue_type"].value_counts()


issue_type
Other       250
Refund      104
Delivery     96
Payment      50
Name: count, dtype: int64

In [31]:
#STEP 4 — SENTIMENT ANALYSIS (SAFE BATCHING)
from transformers import pipeline

sentiment_model = pipeline("sentiment-analysis")

sentiments = []

for text in df_500["clean_text"]:
    result = sentiment_model(text[:512])[0]["label"]
    sentiments.append(result)

df_500["sentiment"] = sentiments
df_500.head()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Unnamed: 0,scraped_text,source,clean_text,issue_type,sentiment
0,"Online Shopping India | Buy Mobiles, Electroni...",Flipkart Public Pages,"online shopping india buy mobiles, electronic...",Other,POSITIVE
1,com Explore Plus Help Center Login Become a Se...,Flipkart Public Pages,com explore plus help center login become a se...,Other,NEGATIVE
2,Here is our Postal Address ABOUT Contact Us Ab...,Flipkart Public Pages,here is our postal address about contact us ab...,Delivery,NEGATIVE
3,Return Policy - Flipkart,Flipkart Public Pages,return policy flipkart,Refund,NEGATIVE
4,com Explore Plus Login Become a Seller More Ca...,Flipkart Public Pages,com explore plus login become a seller more ca...,Refund,NEGATIVE


In [32]:
final_df = df_500[["source", "clean_text", "issue_type", "sentiment"]]
len(final_df)


500

In [34]:
final_df.to_csv("conversation_bi_ai_output.csv", index=False)
print("conversation_bi_ai_output.csv saved with 500 rows")


conversation_bi_ai_output.csv saved with 500 rows
