###***NLP Project***
### ***Automated Content Creation and Personalization for E-commerce Product Descriptions***
###***Sancia Fernandes - A012***
###***Yash Dudeja - A013***
###***Sherin Ouseph - A017***

### **Loading and cleaning the data, then saving it to new CSV files**

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the raw datasets from the Colab working directory.
# Later cells read the 'ReviewText' column of the Amazon frame and the
# 'description' and 'product_name' columns of the Flipkart frame.
amazon_data = pd.read_csv('/content/AmazonReviews.csv')  # Replace with your dataset path
flipkart_data = pd.read_csv('/content/flipkart_com-ecommerce_sample.csv')  # Replace with your dataset path

# Data Cleaning Function
# Patterns are compiled once instead of being looked up for every row.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalize one free-text value for the downstream NLP steps.

    Steps: strip HTML tags, drop every character that is not an ASCII letter
    or whitespace, lowercase, tokenize with NLTK and remove English stop
    words.

    Args:
        text: The raw value from a DataFrame cell. Non-string values are
            converted with ``str()``; missing values (``None`` or NaN) are
            treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces (``''`` for missing input).
    """
    # Missing values carry no text. Stringifying them would inject the
    # literal tokens "none"/"nan" into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Ensure the text is a string (handle other non-string entries)
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)
    # Remove non-alphabetic characters (they are deleted, not replaced by a
    # space, so "t-shirt" becomes "tshirt")
    text = _NON_ALPHA_RE.sub('', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Apply the cleaning function to the datasets.
# The cleaned text goes into new columns ('cleaned_reviews', 'cleaned_description');
# the raw 'ReviewText' and 'description' columns are kept unchanged.
amazon_data['cleaned_reviews'] = amazon_data['ReviewText'].apply(clean_text)
flipkart_data['cleaned_description'] = flipkart_data['description'].apply(clean_text)

# Save cleaned data for future steps (written to the current working directory,
# without the DataFrame index)
amazon_data.to_csv('cleaned_amazon_reviews.csv', index=False)
flipkart_data.to_csv('cleaned_flipkart_products.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### **Applying a pretrained sentiment model from Hugging Face to the Amazon data**

In [None]:
from transformers import pipeline

# Load the sentiment analysis model with truncation and padding enabled.
# The device is chosen at runtime: the first GPU when CUDA is available,
# otherwise the CPU (-1). A hard-coded device=0 fails on CPU-only runtimes.
import torch

sentiment_model = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,  # Ensure input text is truncated if too long
    padding=True,     # Ensure inputs are padded if too short
)

# Function to apply sentiment analysis
def get_sentiment(text):
    """Classify ``text`` with ``sentiment_model`` and return its label.

    The model is called with the single text and the 'label' field of the
    first prediction it returns is handed back.
    """
    predictions = sentiment_model(text)
    first_prediction = predictions[0]
    return first_prediction['label']

# Apply sentiment analysis to the Amazon reviews.
# get_sentiment is invoked once per row, so the model sees one review at a time.
amazon_data['sentiment'] = amazon_data['cleaned_reviews'].apply(get_sentiment)

# Save the results to a new CSV file (includes the new 'sentiment' column)
amazon_data.to_csv('sentiment_amazon_reviews.csv', index=False)

# Display the first few rows: raw review text next to the predicted label
print(amazon_data[['ReviewText', 'sentiment']].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


                                          ReviewText sentiment
0  I registered on the website, tried to order a ...  NEGATIVE
1  Had multiple orders one turned up and driver h...  NEGATIVE
2  I informed these reprobates that I WOULD NOT B...  NEGATIVE
3  I have bought from Amazon before and no proble...  NEGATIVE
4  If I could give a lower rate I would! I cancel...  NEGATIVE


### **Extracting Features from Flipkart product data**

In [None]:
import re

# (feature name, whole-word case-insensitive pattern) pairs. The order here is
# the key order of the dictionary returned by extract_features.
_FEATURE_PATTERNS = (
    # Product type (e.g., 'shirt', 'laptop', etc.)
    ('product_type', re.compile(r'\b(?:shirt|laptop|dress|phone|watch|headphone|tablet|shoes)\b', re.IGNORECASE)),
    # Material (e.g., 'cotton', 'leather', 'plastic', etc.)
    ('material', re.compile(r'\b(?:cotton|leather|plastic|wood|metal|silk|wool)\b', re.IGNORECASE)),
    # Size (e.g., 'M', 'L', 'XL', 'S', etc.); any stand-alone letter s/m/l also matches
    ('size', re.compile(r'\b(?:S|M|L|XL|XXL)\b', re.IGNORECASE)),
    # Color (e.g., 'red', 'blue', 'green', etc.)
    ('color', re.compile(r'\b(?:red|blue|green|black|white|yellow|pink|gray)\b', re.IGNORECASE)),
)


def extract_features(description):
    """
    Extract key product features from a product description.
    This is a basic keyword implementation and can be extended based on your needs.

    Args:
        description: Product description text. Anything that is not a string
            (e.g. None or NaN from a missing cell) yields no features instead
            of raising TypeError.

    Returns:
        A dict with any of the keys 'product_type', 'material', 'size' and
        'color'. Each value is the first keyword found for that feature,
        exactly as it is written in the description. Features without a
        match are omitted.
    """
    if not isinstance(description, str):
        return {}

    features = {}
    for feature_name, pattern in _FEATURE_PATTERNS:
        match = pattern.search(description)
        if match:
            features[feature_name] = match.group(0)

    return features


In [None]:
# Apply feature extraction to Flipkart product descriptions.
# Each cell of the new 'features' column holds the dict returned by
# extract_features for that row's cleaned description.
flipkart_data['features'] = flipkart_data['cleaned_description'].apply(extract_features)

# Display the updated data with the 'features' column
print(flipkart_data[['product_name', 'features']].head())

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3    Alisha Solid Women's Cycling Shorts   
4  Sicons All Purpose Arnica Dog Shampoo   

                                            features  
0             {'material': 'cotton', 'color': 'red'}  
1             {'material': 'wood', 'color': 'black'}  
2  {'product_type': 'shoes', 'material': 'leather...  
3           {'material': 'cotton', 'color': 'black'}  
4                                                 {}  


### **Applying Sentiment on flipkart data**

In [None]:
# Score every cleaned Flipkart description with the sentiment model
flipkart_data['sentiment'] = flipkart_data['cleaned_description'].apply(get_sentiment)


def _personalized_description(row):
    """Build one row's summary string from its sentiment label and feature dict."""
    sentiment_label = row['sentiment']
    row_features = row['features']
    product_type = row_features.get('product_type', 'Unknown')
    material = row_features.get('material', 'Unknown')
    return f"{sentiment_label} review: {row_features}, product type: {product_type}, material: {material}"


# Combine sentiment and features into a single description, one row at a time
flipkart_data['personalized_description'] = flipkart_data.apply(_personalized_description, axis=1)

# Show the first few personalized descriptions next to the product names
print(flipkart_data[['product_name', 'personalized_description']].head())

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3    Alisha Solid Women's Cycling Shorts   
4  Sicons All Purpose Arnica Dog Shampoo   

                            personalized_description  
0  POSITIVE review: {'material': 'cotton', 'color...  
1  NEGATIVE review: {'material': 'wood', 'color':...  
2  NEGATIVE review: {'product_type': 'shoes', 'ma...  
3  NEGATIVE review: {'material': 'cotton', 'color...  
4  NEGATIVE review: {}, product type: Unknown, ma...  


In [None]:
# Persist the Flipkart frame, including the 'features', 'sentiment' and
# 'personalized_description' columns added above (index is not written).
flipkart_data.to_csv('flipkart_product_personalized_descriptions.csv', index=False)

# Display the updated dataset with personalized descriptions
print(flipkart_data[['product_name', 'personalized_description']].head())

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3    Alisha Solid Women's Cycling Shorts   
4  Sicons All Purpose Arnica Dog Shampoo   

                            personalized_description  
0  POSITIVE review: {'material': 'cotton', 'color...  
1  NEGATIVE review: {'material': 'wood', 'color':...  
2  NEGATIVE review: {'product_type': 'shoes', 'ma...  
3  NEGATIVE review: {'material': 'cotton', 'color...  
4  NEGATIVE review: {}, product type: Unknown, ma...  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer to extract unigrams and bigrams.
# max_features=10 keeps only the ten most frequent terms across the corpus.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10)

# Fit and transform the product descriptions to extract features
features = vectorizer.fit_transform(flipkart_data['cleaned_description'])

# Convert the result into a DataFrame for easy interpretation.
# Reuse flipkart_data's index: pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index would misalign the counts (and add NaN rows) whenever
# flipkart_data has been filtered or re-indexed.
features_df = pd.DataFrame(
    features.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=flipkart_data.index,
)

# Add the extracted features to the original flipkart data
flipkart_data = pd.concat([flipkart_data, features_df], axis=1)

# Save the updated data with extracted features
flipkart_data.to_csv('extracted_features_flipkart.csv', index=False)

# Display the extracted features
print(flipkart_data[['product_name'] + list(features_df.columns)].head())

                            product_name  buy  day  delivery  flipkartcom  \
0    Alisha Solid Women's Cycling Shorts    0    0         0            0   
1    FabHomeDecor Fabric Double Sofa Bed    0    0         2            0   
2                             AW Bellies    0    0         0            0   
3    Alisha Solid Women's Cycling Shorts    0    0         0            0   
4  Sicons All Purpose Arnica Dog Shampoo    0    0         0            0   

   free  genuine  online  products  rs  womens  
0     0        0       0         0   0       3  
1     1        0       0         2   1       0  
2     0        0       0         0   1       0  
3     0        0       0         0   0       3  
4     0        0       0         0   0       0  


In [None]:
# Preview the first rows of the Flipkart frame as the cell's display output
flipkart_data.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,...,buy,day,delivery,flipkartcom,free,genuine,online,products,rs,womens
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,...,0,0,0,0,0,0,0,0,0,3
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,...,0,0,2,0,1,0,0,2,1,0
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,...,0,0,0,0,0,0,0,0,1,0
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,...,0,0,0,0,0,0,0,0,0,3
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,...,0,0,0,0,0,0,0,0,0,0


### **Applying attention mask and padding on flipkart data using bert**

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Make sure a padding token exists. BERT tokenizers define no eos_token, so
# falling back to it would leave pad_token as None; a dedicated '[PAD]' token
# is registered instead, the same way the GPT-2 cell does it.
# NOTE(review): for bert-base-uncased this guard is expected to be a no-op —
# the checkpoint should already define a pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The embedding matrix only needs resizing when a token was actually added
    model.resize_token_embeddings(len(tokenizer))

# Example input text
texts = ["This is the first example.", "Short text.", "This is a much longer example than the others."]

# Tokenize and pad sequences
encoded_inputs = tokenizer(
    texts,
    padding=True,  # Pad to the longest sequence
    truncation=True,  # Truncate long sequences
    return_tensors="pt"  # Return PyTorch tensors
)

# Print the padded sequences (input_ids, token_type_ids and attention_mask)
print(encoded_inputs)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

{'input_ids': tensor([[ 101, 2023, 2003, 1996, 2034, 2742, 1012,  102,    0,    0,    0,    0],
        [ 101, 2460, 3793, 1012,  102,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 2023, 2003, 1037, 2172, 2936, 2742, 2084, 1996, 2500, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


###**Training the flipkart data to generate text**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Load the pre-trained GPT-2 tokenizer.
# This rebinds `tokenizer` (and `model` below), replacing the BERT objects
# created in the previous cell.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Check if the tokenizer has a pad_token and add a dedicated '[PAD]' token if necessary
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the GPT-2 model and resize embeddings to include the new pad token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # Resize the model embeddings to include the new pad token

# Fine-tuning parameters: one epoch, batches of 10 per device, checkpoints
# under ./results and logs under ./logs.
# NOTE(review): save_steps=40_000 exceeds the 2000 steps in the recorded run,
# so presumably no intermediate checkpoint is written — confirm.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=10,
    save_steps=40_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Training data: rows of flipkart_data with both columns present.
# Only 'cleaned_description' is read by FlipkartDataset; 'sentiment' is
# selected here but takes part only in the dropna() filter.
train_data = flipkart_data[['cleaned_description', 'sentiment']].dropna()

# Tokenization function for text inputs
def tokenize_input(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are both enabled, sequences are capped at 512
    tokens and the result is returned as PyTorch tensors.
    """
    tokenizer_options = {
        'padding': True,         # pad the inputs
        'truncation': True,      # cut sequences longer than max_length
        'max_length': 512,       # upper bound on sequence length
        'return_tensors': 'pt',  # PyTorch tensors
    }
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded

# Create a custom dataset class
class FlipkartDataset(Dataset):
    """Language-modeling dataset over the 'cleaned_description' column.

    Each item is one description tokenized to a fixed length, with labels
    equal to the input ids except at padding positions.

    Args:
        data: DataFrame with a 'cleaned_description' column.
        tokenizer: Callable tokenizer that accepts ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` and returns 'input_ids' and
            'attention_mask' tensors with a leading batch dimension of 1.
        max_length: Fixed sequence length every item is padded/truncated to.
    """

    # Label value excluded from the loss (PyTorch cross-entropy's default
    # ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        description = self.data.iloc[idx]['cleaned_description']
        # Tokenize the description to a fixed-length sequence
        encoding = self.tokenizer(description,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors='pt')

        # Drop only the batch dimension. A bare squeeze() would also collapse
        # the sequence dimension if it ever had length 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # For causal language modeling the labels are the input ids.
        labels = input_ids.clone()
        # Padding positions are masked out of the loss. Otherwise most of each
        # fixed-length sequence teaches the model to predict '[PAD]', which
        # also makes the reported training loss look artificially low.
        # NOTE(review): an item whose description is empty gets all-ignored labels.
        labels[attention_mask == 0] = self.IGNORE_INDEX

        # Return as a dictionary
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Create the dataset
train_dataset = FlipkartDataset(train_data, tokenizer)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Start fine-tuning the model
trainer.train()

# Takes around 45 minutes to run with GPU acceleration because the dataset is large.
# The run prompts for a Weights & Biases (wandb) API key: enter your own key at the prompt.
# SECURITY: a real API key used to be pasted in this comment. Never store keys in the
# notebook; the key that was committed here should be treated as leaked and revoked.

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,1.6116
1000,0.2978
1500,0.278
2000,0.278


TrainOutput(global_step=2000, training_loss=0.6163668212890625, metrics={'train_runtime': 3039.6468, 'train_samples_per_second': 6.58, 'train_steps_per_second': 0.658, 'total_flos': 5225840640000000.0, 'train_loss': 0.6163668212890625, 'epoch': 1.0})

### **Applying TF-IDF Vectorizer for recommendation**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a fresh TF-IDF vectorizer. This rebinds `vectorizer`, replacing the
# CountVectorizer that an earlier cell stored under the same name.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the 'cleaned_description' or 'description' column based on your preference
vectorizer.fit(flipkart_data['cleaned_description'])  # Use 'description' if you want raw descriptions

# Create a simple recommendation system based on sentiment and feature vectors
def recommend_products(user_input, products_df, top_n=5):
    """Return the products whose descriptions are most similar to ``user_input``.

    Similarity is the cosine similarity between the TF-IDF vector of the
    query and the TF-IDF vector of each row's 'cleaned_description', both
    produced by the module-level fitted ``vectorizer``.

    Args:
        user_input: Free-text query.
        products_df: DataFrame with a 'cleaned_description' column.
        top_n: Maximum number of rows to return (default 5, the value that
            used to be hard-coded).

    Returns:
        Up to ``top_n`` rows of ``products_df`` ordered from most to least
        similar. A query that shares no vocabulary with the corpus scores 0
        against every row, so the returned rows are then arbitrary.
    """
    # Nothing to rank: return an empty frame with the same columns instead of
    # letting cosine_similarity fail on zero samples.
    if products_df.empty or top_n <= 0:
        return products_df.iloc[0:0]

    # Transform user input to feature vector
    user_vector = vectorizer.transform([user_input])

    # Missing descriptions are treated as empty text; the vectorizer cannot
    # process NaN documents.
    descriptions = products_df['cleaned_description'].fillna('')
    product_vectors = vectorizer.transform(descriptions)

    # Calculate cosine similarity between user input and product features
    similarities = cosine_similarity(user_vector, product_vectors)

    # argsort is ascending: take the last top_n positions and reverse them so
    # the highest similarity comes first
    top_indices = similarities.argsort()[0, -top_n:][::-1]
    recommendations = products_df.iloc[top_indices]

    return recommendations

# Example usage: Recommend products based on a sample user input.
# The query is passed as-is; it is not run through clean_text first.
user_input = "I am looking for a red cotton t-shirt"
recommended_products = recommend_products(user_input, flipkart_data)

# Print recommended products (name, raw description and sentiment label)
print(recommended_products[['product_name', 'description', 'sentiment']])


                                            product_name  \
15039  Famous By Payal Kapoor Women's Animal Print Ca...   
12520       Vanity Collection Women's Solid Casual Shirt   
16661                   Jainish Men's Solid Formal Shirt   
12393                 Seeyaar Women's Solid Casual Shirt   
12106                People Women's Printed Casual Shirt   

                                             description sentiment  
15039  Famous By Payal Kapoor Women's Animal Print Ca...  POSITIVE  
12520  Vanity Collection Women's Solid Casual Shirt\n...  POSITIVE  
16661  Jainish Men's Solid Formal Shirt\n            ...  POSITIVE  
12393  Seeyaar Women's Solid Casual Shirt\n          ...  POSITIVE  
12106  People Women's Printed Casual Shirt - Buy Red ...  NEGATIVE  


### **Developing Frontend using streamlit and deploying/Hosting through ngrok**
###**Attaching files so that you can directly run the streamlit part without running above code as the training takes 45 min to run on GPU**

In [None]:
# Install the packages used by the Streamlit app and the ngrok tunnel cells below
pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading streamlit-1.40.1-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64

In [None]:
%%writefile NLP.py
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load preprocessed datasets.
# NOTE(review): these are presumably the CSV files written by the notebook cells
# above — confirm they exist under /content before launching the app.
# amazon_data is loaded but not referenced anywhere else in this script.
flipkart_data = pd.read_csv('/content/extracted_features_flipkart.csv')
amazon_data = pd.read_csv('/content/sentiment_amazon_reviews.csv')

# Handle missing values in 'cleaned_description' by replacing NaN with an empty string
# (the TF-IDF vectorizer and the [:250] slice below both need real strings)
flipkart_data['cleaned_description'] = flipkart_data['cleaned_description'].fillna('')

# Inject custom CSS for styling.
# The rules target the class names used by the HTML snippets rendered further
# down (main-title, description, product-*, recommendation-header, recommended-*).
st.markdown("""
    <style>
    /* General settings */
    body {background-color: #f9f9f9; font-family: Arial, sans-serif;}

    /* Title and description styling */
    .main-title {color: #4CAF50; font-size: 36px; font-weight: 700; margin-top: 20px; text-align: center;}
    .description {font-size: 18px; color: #555; text-align: center; margin-bottom: 30px;}

    /* Product description section */
    .product-section {background-color: #ffffff; padding: 20px; border-radius: 10px; box-shadow: 2px 2px 15px rgba(0,0,0,0.1); margin-bottom: 30px;}
    .product-title {font-size: 24px; font-weight: bold; color: #4CAF50; margin-bottom: 5px;}
    .product-detail {font-size: 16px; color: #333; margin: 5px 0;}
    .sentiment {color: #888; font-style: italic;}

    /* Recommendation section */
    .recommendation-header {font-size: 24px; color: #4CAF50; font-weight: bold; margin-top: 20px;}
    .recommended-product {background-color: #f2f2f2; padding: 15px; margin-bottom: 20px; border-radius: 8px; box-shadow: 1px 1px 8px rgba(0,0,0,0.1);}
    .recommended-product h4 {font-size: 20px; color: #333; margin-bottom: 8px;}
    .recommended-detail {font-size: 14px; color: #555;}

    /* Input field */
    input[type="text"] {border: 1px solid #ddd; border-radius: 5px; padding: 10px; width: 100%; box-sizing: border-box;}

    /* Hover effect for recommendations */
    .recommended-product:hover {background-color: #e6f7ff; transition: background-color 0.3s ease;}
    </style>
""", unsafe_allow_html=True)

# Title and description, rendered as raw HTML so the CSS classes above apply
st.markdown('<h1 class="main-title">Personalized E-commerce Product Descriptions</h1>', unsafe_allow_html=True)
st.markdown('<p class="description">Welcome to the personalized product recommendation system! This system suggests products based on your preferences. Simply provide a description or preference, and we\'ll recommend similar products.</p>', unsafe_allow_html=True)

# Vectorizer for feature extraction.
# It is created and fitted at module level, i.e. every time this script runs.
vectorizer = TfidfVectorizer()
vectorizer.fit(flipkart_data['cleaned_description'])  # Fit vectorizer on product descriptions

# Define recommendation function
def recommend_products(user_input, products_df, top_n=5):
    """Return the products whose descriptions are most similar to ``user_input``.

    Similarity is the cosine similarity between the TF-IDF vector of the
    query and the TF-IDF vector of each row's 'cleaned_description', both
    produced by the module-level fitted ``vectorizer``.

    Args:
        user_input: Free-text query.
        products_df: DataFrame with a 'cleaned_description' column.
        top_n: Maximum number of rows to return (default 5, the value that
            used to be hard-coded).

    Returns:
        Up to ``top_n`` rows of ``products_df`` ordered from most to least
        similar. A query that shares no vocabulary with the corpus scores 0
        against every row, so the returned rows are then arbitrary.
    """
    # Nothing to rank: return an empty frame with the same columns instead of
    # letting cosine_similarity fail on zero samples.
    if products_df.empty or top_n <= 0:
        return products_df.iloc[0:0]

    # Transform user input and product descriptions to feature vectors.
    # Missing descriptions are treated as empty text.
    user_vector = vectorizer.transform([user_input])
    product_vectors = vectorizer.transform(products_df['cleaned_description'].fillna(''))

    # argsort is ascending: take the last top_n positions and reverse them so
    # the highest similarity comes first
    similarities = cosine_similarity(user_vector, product_vectors)
    top_indices = similarities.argsort()[0, -top_n:][::-1]
    return products_df.iloc[top_indices]

# Tabs for navigation: tab1 = Home, tab2 = Product Section, tab3 = Recommendation Section
tab1, tab2, tab3 = st.tabs(["Home", "Product Section", "Recommendation Section"])

# Home tab: static welcome text only
with tab1:
    st.write("### Welcome to the Home Page!")
    st.write("Explore personalized product recommendations based on your input. Use the tabs above to navigate.")

# Imported next to its use so this section is self-contained: the values below
# come from scraped product data and are interpolated into raw HTML.
from html import escape

# Product tab: pick a product by name and show its stored details.
with tab2:
    st.markdown('<h2 class="product-title">Product Descriptions</h2>', unsafe_allow_html=True)
    st.write("Choose a product from the list to view its description and sentiment.")

    product_choice = st.selectbox("Select a product", flipkart_data['product_name'].tolist())
    # Several rows can share a name; the first match is shown.
    selected_product = flipkart_data[flipkart_data['product_name'] == product_choice]

    if selected_product.empty:
        # No rows to show (e.g. empty dataset): avoid indexing into an empty selection
        st.info("No product available to display.")
    else:
        # Escape every value so characters such as '<' or '&' in the data
        # cannot break or inject markup.
        shown = {
            column: escape(str(selected_product[column].values[0]))
            for column in ('product_name', 'brand', 'cleaned_description', 'sentiment')
        }

        # Show product description and sentiment. The "product-section" card is
        # opened and closed inside ONE st.markdown call: a <div> opened in one
        # call and closed in another does not wrap the elements in between.
        st.markdown(f"""
            <div class="product-section">
                <div class="product-detail"><strong>Product Name:</strong> {shown['product_name']}</div>
                <div class="product-detail"><strong>Brand:</strong> {shown['brand']}</div>
                <div class="product-detail"><strong>Description:</strong> {shown['cleaned_description']}</div>
                <div class="product-detail sentiment"><strong>Sentiment:</strong> {shown['sentiment']}</div>
            </div>
        """, unsafe_allow_html=True)

# Imported next to its use so this section is self-contained: the values below
# come from scraped product data and are interpolated into raw HTML.
from html import escape

# Recommendation tab: free-text query -> the most similar products.
with tab3:
    st.markdown('<h2 class="recommendation-header">Get Personalized Recommendations</h2>', unsafe_allow_html=True)
    st.write("Enter a product description or preference to receive product recommendations based on your input.")

    user_input = st.text_input("Enter your description or preference:")

    # Only query once the user has typed something
    if user_input:
        recommended_products = recommend_products(user_input, flipkart_data)
        st.markdown('<h2 class="recommendation-header">Top 5 Recommended Products:</h2>', unsafe_allow_html=True)

        for _, row in recommended_products.iterrows():
            # Escape every value so characters such as '<' or '&' in the data
            # cannot break or inject markup.
            name = escape(str(row['product_name']))
            brand = escape(str(row['brand']))
            features_text = escape(str(row['features']))
            # Descriptions are truncated to 250 characters for the card preview
            description_preview = escape(str(row['cleaned_description'])[:250])
            sentiment = escape(str(row['sentiment']))

            st.markdown(f"""
                <div class="recommended-product">
                    <h4>{name}</h4>
                    <p class="recommended-detail"><strong>Brand:</strong> {brand}</p>
                    <p class="recommended-detail"><strong>Features:</strong> {features_text}</p>
                    <p class="recommended-detail"><strong>Description:</strong> {description_preview}...</p>
                    <p class="recommended-detail"><strong>Sentiment:</strong> {sentiment}</p>
                </div>
            """, unsafe_allow_html=True)


Overwriting NLP.py


In [None]:
from pyngrok import ngrok

ngrok.set_auth_token("2mWHLqXgEZokWNmJXnk16yIL8Rc_7id3DXj9tMTMKVAobuqay")

tunnels = ngrok.get_tunnels()
print("Existing tunnels: ", tunnels)

# Close each tunnel
for tunnel in tunnels:
  ngrok.disconnect(tunnel.public_url)
# Run Streamlit in the background
!streamlit run NLP.py &>/dev/null&

# Expose the app via ngrok on port 8501
public_url = ngrok.connect(addr=8501)
print(f'Streamlit is live at: {public_url}')

Existing tunnels:  [<NgrokTunnel: "https://3e28-34-91-199-77.ngrok-free.app" -> "http://localhost:8501">]




Streamlit is live at: NgrokTunnel: "https://0067-34-91-199-77.ngrok-free.app" -> "http://localhost:8501"
