In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json



In [9]:
data = []
with open("/fashion_products_data.json", encoding="utf-8") as f:
    for line in f:
        try:
            data.append(json.loads(line.strip()))
        except json.JSONDecodeError as e:
            print(f"Skipping line due to error: {e}")

df = pd.DataFrame(data)

Skipping line due to error: Unterminated string starting at: line 1 column 1139 (char 1138)


In [10]:
print(df.head())


                            uniq_id            crawl_timestamp        asin  \
0  26d41bdc1495de290bc8e6062d927729  2020-02-07 05:11:36 +0000  B07STS2W9T   
1  410c62298852e68f34c35560f2311e5a  2020-02-07 08:45:56 +0000  B07N6TD2WL   
2  52e31bb31680b0ec73de0d781a23cc0a  2020-02-06 11:09:38 +0000  B07WJ6WPN1   
3  25798d6dc43239c118452d1bee0fb088  2020-02-07 08:32:45 +0000  B07PYSF4WZ   
4  ad8a5a196d515ef09dfdaf082bdc37c4  2020-02-06 14:27:48 +0000  B082KXNM7X   

                                         product_url  \
0  https://www.amazon.in/Facon-Kalamkari-Handbloc...   
1  https://www.amazon.in/Sf-Jeans-Pantaloons-T-Sh...   
2  https://www.amazon.in/LOVISTA-Traditional-Prin...   
3  https://www.amazon.in/People-Printed-Regular-T...   
4  https://www.amazon.in/Monte-Carlo-Cotton-Colla...   

                                        product_name  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...   
1  Sf Jeans By Pantaloons Men's Plain Slim fit T-...   
2  LOVISTA Cotton Gota Pat

In [11]:
# Show the first 5 rows
print(df.head())

# Check data types and non-null count
print(df.info())

# Check for missing values
print(df.isnull().sum())


                            uniq_id            crawl_timestamp        asin  \
0  26d41bdc1495de290bc8e6062d927729  2020-02-07 05:11:36 +0000  B07STS2W9T   
1  410c62298852e68f34c35560f2311e5a  2020-02-07 08:45:56 +0000  B07N6TD2WL   
2  52e31bb31680b0ec73de0d781a23cc0a  2020-02-06 11:09:38 +0000  B07WJ6WPN1   
3  25798d6dc43239c118452d1bee0fb088  2020-02-07 08:32:45 +0000  B07PYSF4WZ   
4  ad8a5a196d515ef09dfdaf082bdc37c4  2020-02-06 14:27:48 +0000  B082KXNM7X   

                                         product_url  \
0  https://www.amazon.in/Facon-Kalamkari-Handbloc...   
1  https://www.amazon.in/Sf-Jeans-Pantaloons-T-Sh...   
2  https://www.amazon.in/LOVISTA-Traditional-Prin...   
3  https://www.amazon.in/People-Printed-Regular-T...   
4  https://www.amazon.in/Monte-Carlo-Cotton-Colla...   

                                        product_name  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...   
1  Sf Jeans By Pantaloons Men's Plain Slim fit T-...   
2  LOVISTA Cotton Gota Pat

In [13]:
# Selecting only the required columns
df_selected = df[['asin', 'product_url', 'product_name', 'sales_price', 'rating', 'meta_keywords', 'medium', 'brand']]


In [14]:
print(df_selected.head())


         asin                                        product_url  \
0  B07STS2W9T  https://www.amazon.in/Facon-Kalamkari-Handbloc...   
1  B07N6TD2WL  https://www.amazon.in/Sf-Jeans-Pantaloons-T-Sh...   
2  B07WJ6WPN1  https://www.amazon.in/LOVISTA-Traditional-Prin...   
3  B07PYSF4WZ  https://www.amazon.in/People-Printed-Regular-T...   
4  B082KXNM7X  https://www.amazon.in/Monte-Carlo-Cotton-Colla...   

                                        product_name sales_price rating  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...      200.00    5.0   
1  Sf Jeans By Pantaloons Men's Plain Slim fit T-...      265.00    3.6   
2  LOVISTA Cotton Gota Patti Tassel Traditional P...      660.00    3.5   
3           People Men's Printed Regular fit T-Shirt      195.00    3.0   
4  Monte Carlo Grey Solid Cotton Blend Polo Colla...     1914.00    5.0   

                                       meta_keywords  \
0  LA' Facon Cotton Kalamkari Handblock Saree Blo...   
1  Sf Jeans By Pantaloons Me

In [16]:
import re

# Removing special characters from the product_name column
df_selected['product_name'] = df_selected['product_name'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
print(df_selected[['product_name']].head())


                                        product_name
0  LA Facon Cotton Kalamkari Handblock Saree Blou...
1  Sf Jeans By Pantaloons Mens Plain Slim fit TShirt
2  LOVISTA Cotton Gota Patti Tassel Traditional P...
3             People Mens Printed Regular fit TShirt
4  Monte Carlo Grey Solid Cotton Blend Polo Colla...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['product_name'] = df_selected['product_name'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)


In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Define the list of stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from product_name
df_selected['product_name'] = df_selected['product_name'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
print(df_selected[['product_name']].head())


                                        product_name
0  LA Facon Cotton Kalamkari Handblock Saree Blou...
1     Sf Jeans Pantaloons Mens Plain Slim fit TShirt
2  LOVISTA Cotton Gota Patti Tassel Traditional P...
3             People Mens Printed Regular fit TShirt
4  Monte Carlo Grey Solid Cotton Blend Polo Colla...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['product_name'] = df_selected['product_name'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))


In [18]:
import nltk
nltk.download('words')
from nltk.corpus import words

# Define the set of English words
english_words = set(words.words())

# Remove non-English words from product_name
df_selected['product_name'] = df_selected['product_name'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() in english_words])
)
print(df_selected[['product_name']].head())



[nltk_data] Downloading package words to /root/nltk_data...


                                product_name
0  LA Cotton Blouse Fabric Black Base Cotton
1            Jeans Pantaloons Plain Slim fit
2     Cotton Tassel Traditional Printed Suit
3                 People Printed Regular fit
4  Monte Grey Solid Cotton Blend Polo Collar


[nltk_data]   Unzipping corpora/words.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['product_name'] = df_selected['product_name'].apply(


In [None]:
#convert short form to long form

In [20]:
import nltk
from nltk.stem import PorterStemmer

# Initialize the stemmer
stemmer = PorterStemmer()
from nltk.stem import LancasterStemmer

# Initialize the stemmer
stemmer = LancasterStemmer()

# Apply stemming to each word in the product_name column
df_selected['product_name'] = df_selected['product_name'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()])
)
print(df_selected[['product_name']].head())


                             product_name
0   la cotton blous fabr black bas cotton
1           jean pantaloon plain slim fit
2         cotton tassel tradit print suit
3                   peopl print regul fit
4  mont grey solid cotton blend polo coll


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['product_name'] = df_selected['product_name'].apply(
