Load the dataset

In [3]:
import pandas as pd
import json

file_path = '/content/fashion_products_data.ldjson'

# Read the JSON lines file into a list of dictionaries
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping line due to error: {e}")

df = pd.DataFrame(data)


print(f"DataFrame loaded with shape: {df.shape}")


Skipping line due to error: Unterminated string starting at: line 1 column 47 (char 46)
DataFrame loaded with shape: (862, 33)


Display the columns 'asin','product_url', 'product_name','sales_price','rating','meta_keywords','medium','brand'

In [4]:
columns_to_display = ['asin', 'product_url', 'product_name', 'sales_price', 'rating', 'meta_keywords', 'medium', 'brand']
existing_columns = [col for col in columns_to_display if col in df.columns]

if existing_columns:
    print(f"\nDisplaying the first few rows of the selected columns:\n{df[existing_columns].head().to_string(index=False)}")
else:
    print("None of the specified columns are found in the dataset.")



Displaying the first few rows of the selected columns:
      asin                                                                                     product_url                                                                                 product_name sales_price rating                                                                                                                                           meta_keywords                                                                                                                                                                                                                                                                                                                                                                                                medium     brand
B07STS2W9T                  https://www.amazon.in/Facon-Kalamkari-Handblock-Dancers-Lehenga/dp/B07SWVSRPP/ LA' Facon Cotton Kalamkari Handblock Saree Blouse Fabric 100 cms B

Remove Special Character

In [9]:
import pandas as pd
import re
import json

# Replace 'fashion_products_data.ldjson' with the exact filename displayed after upload
file_path = 'fashion_products_data.ldjson'  # Modify if the filename is different

# Read the LDJSON file and parse into a DataFrame
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# Remove special characters from 'product_name' and create a cleaned version
df['cleaned_product_name'] = df['product_name'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Filter to show only rows where the cleaned name differs from the original
filtered_df = df[df['product_name'] != df['cleaned_product_name']]

# Display the cleaned product names for the filtered rows
print(filtered_df['cleaned_product_name'].head().to_string(index=False))


LA Facon Cotton Kalamkari Handblock Saree Blous...
 Sf Jeans By Pantaloons Mens Plain Slim fit TShirt
LOVISTA Cotton Gota Patti Tassel Traditional Pr...
            People Mens Printed Regular fit TShirt
Forest Club  Gym Wear  Sports Shorts Shorts for...


Convert Short form to Long forms

In [10]:
# Define a dictionary of common contractions and their expansions
contractions_dict = {
    "can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is",
    "'d": " would", "'ll": " will", "'ve": " have", "'m": " am"
}

# Function to expand contractions
def expand_contractions(text):
    for contraction, full_form in contractions_dict.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', full_form, text)
    return text

# Path to the uploaded file
file_path = '/content/fashion_products_data.ldjson'  # Modify path if different

# Read the LDJSON file and parse into a DataFrame
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# Remove special characters from 'product_name' and expand contractions
df['cleaned_product_name'] = df['product_name'].apply(lambda x: expand_contractions(re.sub(r'[^a-zA-Z0-9\s]', '', x)))

# Filter to show only rows where the cleaned name differs from the original
filtered_df = df[df['product_name'] != df['cleaned_product_name']]

# Display the cleaned product names for the filtered rows
print(filtered_df['cleaned_product_name'].head().to_string(index=False))

LA Facon Cotton Kalamkari Handblock Saree Blous...
 Sf Jeans By Pantaloons Mens Plain Slim fit TShirt
LOVISTA Cotton Gota Patti Tassel Traditional Pr...
            People Mens Printed Regular fit TShirt
Forest Club  Gym Wear  Sports Shorts Shorts for...


Import stopwords

In [11]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Remove special characters

In [12]:
import pandas as pd
import re
import json
from nltk.corpus import stopwords

# Define stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Path to the uploaded file
file_path = '/content/fashion_products_data.ldjson'  # Modify path if different

# Read the LDJSON file and parse into a DataFrame
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# Remove special characters, expand contractions, and remove stopwords
def clean_product_name(text):
    # Expand contractions
    contractions_dict = {
        "can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is",
        "'d": " would", "'ll": " will", "'ve": " have", "'m": " am"
    }
    for contraction, full_form in contractions_dict.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', full_form, text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove stopwords
    return remove_stopwords(text)

# Apply the cleaning function
df['cleaned_product_name'] = df['product_name'].apply(clean_product_name)

# Filter to show only rows where the cleaned name differs from the original
filtered_df = df[df['product_name'] != df['cleaned_product_name']]

# Display the cleaned product names for the filtered rows
print(filtered_df['cleaned_product_name'].head().to_string(index=False))


LA Facon Cotton Kalamkari Handblock Saree Blous...
     Sf Jeans Pantaloons Men Plain Slim fit TShirt
LOVISTA Cotton Gota Patti Tassel Traditional Pr...
             People Men Printed Regular fit TShirt
Forest Club Gym Wear Sports Shorts Shorts Men S...


Remove stopwords

In [13]:
from nltk.corpus import stopwords

# Define English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Path to the uploaded file
file_path = '/content/fashion_products_data.ldjson'  # Adjust the path if needed

# Read the LDJSON file and parse into a DataFrame
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# Remove special characters and stopwords from 'product_name'
df['cleaned_product_name'] = df['product_name'].apply(lambda x: remove_stopwords(re.sub(r'[^a-zA-Z0-9\s]', '', x)))

# Filter to show only rows where the cleaned name differs from the original
filtered_df = df[df['product_name'] != df['cleaned_product_name']]

# Display the cleaned product names for the filtered rows
print(filtered_df['cleaned_product_name'].head().to_string(index=False))


LA Facon Cotton Kalamkari Handblock Saree Blous...
    Sf Jeans Pantaloons Mens Plain Slim fit TShirt
LOVISTA Cotton Gota Patti Tassel Traditional Pr...
            People Mens Printed Regular fit TShirt
Forest Club Gym Wear Sports Shorts Shorts Men S...


Download stopword packages

In [15]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Stemming and lemmatization

In [16]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Define English stopwords
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define a dictionary of common contractions and their expansions
contractions_dict = {
    "can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is",
    "'d": " would", "'ll": " will", "'ve": " have", "'m": " am"
}

# Function to expand contractions
def expand_contractions(text):
    for contraction, full_form in contractions_dict.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', full_form, text)
    return text

# Function to remove stopwords, stem, and lemmatize
def process_text(text):
    # Expand contractions
    text = expand_contractions(text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove stopwords
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Apply stemming and lemmatization
    stemmed = [stemmer.stem(word) for word in filtered_words]
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]

    return ' '.join(lemmatized)

# Path to the uploaded file
file_path = '/content/fashion_products_data.ldjson'  # Modify path if different

# Read the LDJSON file and parse into a DataFrame
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# Apply the text processing function
df['cleaned_product_name'] = df['product_name'].apply(process_text)

# Filter to show only rows where the cleaned name differs from the original
filtered_df = df[df['product_name'] != df['cleaned_product_name']]

# Display the cleaned product names for the filtered rows
print(filtered_df['cleaned_product_name'].head().to_string(index=False))

la facon cotton kalamkari handblock sare blous ...
       sf jean pantaloon men plain slim fit tshirt
lovista cotton gota patti tassel tradit print k...
                peopl men print regular fit tshirt
mont carlo grey solid cotton blend polo collar ...


In [17]:
# prompt: print all  'product_name' from dataset

print(df['product_name'].to_string(index=False))

LA' Facon Cotton Kalamkari Handblock Saree Blou...
Sf Jeans By Pantaloons Men's Plain Slim fit T-S...
LOVISTA Cotton Gota Patti Tassel Traditional Pr...
          People Men's Printed Regular fit T-Shirt
Monte Carlo Grey Solid Cotton Blend Polo Collar...
Forest Club | Gym Wear | Sports Shorts| Shorts ...
PrintOctopus Graphic Printed T-Shirt for Men Ch...
   Pepe Jeans Men's Solid Regular fit Casual Shirt
Carahere Boys Handmade Pre-Tied Classic Polka D...
                        Peppermint Synthetic Dress
Toddler Little Boy Straight Outta Timeout Long ...
            Puma Unisex Tribal Regular Fit T-Shirt
Jevi Prints Women's Cotton Printed Straight Kur...
OLLI Men's Orange, Lex Purple, White Cotton Bri...
Pinkmint Women's Multi-Coloured Digital Print C...
Miss Chase Women's Solid Shoulder Cut-Out Half-...
              bebe Women Genuine Leather Slim Belt
                   Colt by Unlimited Men's T-Shirt
                  Pepe Jeans Women's Solid T-Shirt
                 C9 Women Turqu