In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
data = pd.read_json(r'C:\Users\ravin\Downloads\fashion_products_data.ldjson', lines=True)
 


In [6]:
# Select only the specified columns from the original data
filtered_data = data[['asin', 'product_url', 'product_name', 'sales_price', 'rating', 'meta_keywords', 'medium', 'brand']]

# Display the first few rows of the selected columns
print(filtered_data.head())


         asin                                        product_url  \
0  B07STS2W9T  https://www.amazon.in/Facon-Kalamkari-Handbloc...   
1  B07N6TD2WL  https://www.amazon.in/Sf-Jeans-Pantaloons-T-Sh...   
2  B07WJ6WPN1  https://www.amazon.in/LOVISTA-Traditional-Prin...   
3  B07PYSF4WZ  https://www.amazon.in/People-Printed-Regular-T...   
4  B082KXNM7X  https://www.amazon.in/Monte-Carlo-Cotton-Colla...   

                                        product_name  sales_price  rating  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...        200.0     5.0   
1  Sf Jeans By Pantaloons Men's Plain Slim fit T-...        265.0     3.6   
2  LOVISTA Cotton Gota Patti Tassel Traditional P...        660.0     3.5   
3           People Men's Printed Regular fit T-Shirt        195.0     3.0   
4  Monte Carlo Grey Solid Cotton Blend Polo Colla...       1914.0     5.0   

                                       meta_keywords  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...   
1  Sf Jeans By P

In [9]:
# Remove special characters from the 'product_name' column
filtered_data.loc[:, 'product_name'] = filtered_data['product_name'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Display the first few rows of the updated 'product_name' column
print(filtered_data[['asin', 'product_name']].head())


         asin                                       product_name
0  B07STS2W9T  LA Facon Cotton Kalamkari Handblock Saree Blou...
1  B07N6TD2WL  Sf Jeans By Pantaloons Mens Plain Slim fit TShirt
2  B07WJ6WPN1  LOVISTA Cotton Gota Patti Tassel Traditional P...
3  B07PYSF4WZ             People Mens Printed Regular fit TShirt
4  B082KXNM7X  Monte Carlo Grey Solid Cotton Blend Polo Colla...


In [13]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define stopwords list
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from product name
def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Apply the function to the 'product_name' column
filtered_data.loc[:, 'product_name'] = filtered_data['product_name'].apply(remove_stopwords)


# Display the first few rows of the updated 'product_name' column
print(filtered_data[['asin', 'product_name']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


         asin                                       product_name
0  B07STS2W9T  LA Facon Cotton Kalamkari Handblock Saree Blou...
1  B07N6TD2WL     Sf Jeans Pantaloons Mens Plain Slim fit TShirt
2  B07WJ6WPN1  LOVISTA Cotton Gota Patti Tassel Traditional P...
3  B07PYSF4WZ             People Mens Printed Regular fit TShirt
4  B082KXNM7X  Monte Carlo Grey Solid Cotton Blend Polo Colla...


In [15]:
import pandas as pd
import numpy as np

# 1. Convert Short Forms to Long Forms
short_form_dict = {
    'tv': 'television',
    'laptop': 'laptop computer',
    'pc': 'personal computer',
    'hd': 'high definition',
    'usb': 'universal serial bus'
    # Add more short form to long form mappings as needed
}

def replace_shortforms(text):
    for short, long in short_form_dict.items():
        text = text.replace(short, long)
    return text

# Ensure you are modifying the DataFrame with .loc to avoid warnings
filtered_data.loc[:, 'product_name'] = filtered_data['product_name'].apply(replace_shortforms)

# 2. Handle Null Values
# Option 1: Replace null values with 'Unknown'
filtered_data.loc[:, 'product_name'] = filtered_data['product_name'].fillna('Unknown')

# Option 2: Drop rows with null values (for specific columns)
# filtered_data.dropna(subset=['product_name'], inplace=True)

# 3. Remove Outliers in 'sales_price' using IQR method
Q1 = filtered_data['sales_price'].quantile(0.25)
Q3 = filtered_data['sales_price'].quantile(0.75)
IQR = Q3 - Q1

# Identify outliers
outliers_condition = (filtered_data['sales_price'] < (Q1 - 1.5 * IQR)) | (filtered_data['sales_price'] > (Q3 + 1.5 * IQR))
outliers = filtered_data[outliers_condition]

# Remove outliers
filtered_data_no_outliers = filtered_data[~outliers_condition]

# Display the first few rows to check the changes
print(filtered_data_no_outliers[['product_name', 'sales_price']].head())


                                        product_name  sales_price
0  LA Facon Cotton Kalamkari Handblock Saree Blou...        200.0
1     Sf Jeans Pantaloons Mens Plain Slim fit TShirt        265.0
2  LOVISTA Cotton Gota Patti Tassel Traditional P...        660.0
3             People Mens Printed Regular fit TShirt        195.0
5  Forest Club Gym Wear Sports Shorts Shorts Men ...        350.0


In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Download NLTK data (if not already downloaded)
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define the function for stemming
def apply_stemming(text):
    words = text.split()
    return ' '.join([stemmer.stem(word) for word in words])

# Define the function for lemmatization
def apply_lemmatization(text):
    words = text.split()
    return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])  # Using 'v' for verb

# Example of text cleaning (already done in your pipeline)
def clean_text(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply the text cleaning and text normalization (stemming/lemmatization) to 'product_name' column
filtered_data.loc[:, 'product_name'] = filtered_data['product_name'].apply(lambda x: clean_text(str(x)))
# Apply stemming and lemmatization
filtered_data.loc[:, 'product_name_stemmed'] = filtered_data['product_name'].apply(apply_stemming)
filtered_data.loc[:, 'product_name_lemmatized'] = filtered_data['product_name'].apply(apply_lemmatization)
# Display the updated dataframe
print(filtered_data[['product_name', 'product_name_stemmed', 'product_name_lemmatized']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ravin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                        product_name  \
0  LA Facon Cotton Kalamkari Handblock Saree Blou...   
1     Sf Jeans Pantaloons Mens Plain Slim fit TShirt   
2  LOVISTA Cotton Gota Patti Tassel Traditional P...   
3             People Mens Printed Regular fit TShirt   
4  Monte Carlo Grey Solid Cotton Blend Polo Colla...   

                                product_name_stemmed  \
0  la facon cotton kalamkari handblock sare blous...   
1        sf jean pantaloon men plain slim fit tshirt   
2  lovista cotton gota patti tassel tradit print ...   
3                 peopl men print regular fit tshirt   
4  mont carlo grey solid cotton blend polo collar...   

                             product_name_lemmatized  
0  LA Facon Cotton Kalamkari Handblock Saree Blou...  
1     Sf Jeans Pantaloons Mens Plain Slim fit TShirt  
2  LOVISTA Cotton Gota Patti Tassel Traditional P...  
3             People Mens Printed Regular fit TShirt  
4  Monte Carlo Grey Solid Cotton Blend Polo Colla..