<a href="https://colab.research.google.com/github/sjtae/data_science_project/blob/main/Amazon_Sales_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Sales Analysis


In [251]:
# Necessary libraries
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [252]:
df = pd.read_csv('amazon.csv')
df = pd.DataFrame(df)

## Data Preprocessing

### Entangle the category column

In [253]:
def split_category(category):
    categories = category.split('|')
    root_category = categories[0]
    sub_category = categories[-1] if len(categories) > 1 else None
    return root_category, sub_category

# Apply the function and create new columns
df[['category', 'sub-category']] = df['category'].apply(split_category).apply(pd.Series)

In [254]:
df[['category','sub-category']]

Unnamed: 0,category,sub-category
0,Computers&Accessories,USBCables
1,Computers&Accessories,USBCables
2,Computers&Accessories,USBCables
3,Computers&Accessories,USBCables
4,Computers&Accessories,USBCables
...,...,...
1460,Home&Kitchen,WaterPurifierAccessories
1461,Home&Kitchen,Rice&PastaCookers
1462,Home&Kitchen,HeatConvectors
1463,Home&Kitchen,ExhaustFans


### Clean the text of product name, product description and reviews


In [255]:
def clean_product_name(product_name):
    # Convert to lowercase
    product_name = product_name.lower()

    # Remove special characters and punctuation
    product_name = re.sub(r'[^a-zA-Z0-9\s]', '', product_name)

    # Remove extra whitespaces
    product_name = ' '.join(product_name.split())

    return product_name

df['product_name'] = df['product_name'].apply(clean_product_name)


In [256]:
def clean_product_description(about_product):
    # Remove special characters and symbols
    about_product = re.sub(r'[^a-zA-Z0-9\s]', '', about_product)

    # Replace multiple spaces with a single space
    about_product = re.sub(r'\s+', ' ', about_product)

    return about_product

df['about_product'] = df['about_product'].apply(clean_product_description)

In [257]:
def clean_review_text(review_content):
    # Remove HTML tags
    review_content = BeautifulSoup(str(review_content), "html.parser").get_text()

    # Convert text to lowercase
    review_content = review_content.lower()

    # Remove special characters, numbers, and punctuation
    review_content = re.sub(r'[^a-zA-Z\s]', '', review_content)

    # Remove extra whitespace and line breaks
    review_content = ' '.join(review_content.split())

    return review_content

df['review_content'] = df['review_content'].apply(clean_review_text)

  review_content = BeautifulSoup(str(review_content), "html.parser").get_text()


In [258]:
df.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link,sub-category
0,B07JW9H4J1,wayona nylon braided usb to lightning fast cha...,Computers&Accessories,₹399,"₹1,099",64%,4.2,24269,High Compatibility Compatible With iPhone 12 1...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",looks durable charging is fine toono complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,USBCables
1,B098NS6PVG,ambrane unbreakable 60w 3a fast charging 15m b...,Computers&Accessories,₹199,₹349,43%,4.0,43994,Compatible with all Type C enabled devices be ...,"AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",i ordered this cable to connect my phone to an...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...,USBCables
2,B096MSW6CT,sounce fast phone charging cable data sync usb...,Computers&Accessories,₹199,"₹1,899",90%,3.9,7928,Fast Charger Data SyncWith builtin safety pro...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Kunal,Himanshu,viswanath,sai niharka,saqib mal...","R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","Good speed for earlier versions,Good Product,W...",not quite durable and sturdyhttpsmmediaamazonc...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...,USBCables
3,B08HDJ86NZ,boat deuce usb 300 2 in 1 typec micro usb stre...,Computers&Accessories,₹329,₹699,53%,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Omkar dhale,JD,HEMALATHA,Ajwadh a.,amar singh ...","R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","Good product,Good one,Nice,Really nice product...",good productlong wirecharges goodnicei bought ...,https://m.media-amazon.com/images/I/41V5FtEWPk...,https://www.amazon.in/Deuce-300-Resistant-Tang...,USBCables
4,B08CF3B7N1,portronics konnect l 12m fast charging 3a 8 pi...,Computers&Accessories,₹154,₹399,61%,4.2,16905,CHARGE SYNC FUNCTION This cable comes with cha...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","rahuls6099,Swasat Borah,Ajay Wadke,Pranali,RVK...","R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","As good as original,Decent,Good one for second...",bought this instead of original apple does the...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Portronics-Konnect-POR-1...,USBCables


## Category Analysis

### Descriptive statistics analysis

In [259]:
# Descriptive statistics for sales across categories

def price_convert(price_str):
    # Remove currency symbols and commas
    price_str = price_str.replace('₹', '').replace(',', '')
    # Convert to float (assuming prices have decimal points)
    return float(price_str)

# Clean and convert the 'Price' column
df['discounted_price'] = df['discounted_price'].apply(price_convert)

# Descriptive statistics
grouped_statistics = df.groupby('category')['discounted_price'].describe()
print(grouped_statistics)

                       count         mean           std     min     25%  \
category                                                                  
Car&Motorbike            1.0  2339.000000           NaN  2339.0  2339.0   
Computers&Accessories  453.0   842.650375   2383.115593    39.0   199.0   
Electronics            526.0  5965.887833  10279.670967    79.0   399.0   
Health&PersonalCare      1.0   899.000000           NaN   899.0   899.0   
Home&Kitchen           448.0  2330.615647   3467.403842    79.0   632.5   
HomeImprovement          2.0   337.000000    124.450793   249.0   293.0   
MusicalInstruments       2.0   638.000000    226.274170   478.0   558.0   
OfficeProducts          31.0   301.580645    317.288699    50.0   117.0   
Toys&Games               1.0   150.000000           NaN   150.0   150.0   

                          50%      75%      max  
category                                         
Car&Motorbike          2339.0  2339.00   2339.0  
Computers&Accessories   

In [260]:
# Descriptive statistics for ratings across categories

# Clean the 'rating' column by removing non-numeric characters and empty string
df['rating'] = df['rating'].str.replace('[^0-9.]', '', regex=True)
df['rating'] = df['rating'].replace('', np.nan)

# Convert the 'rating' column to float
df['rating'] = df['rating'].astype(float)

# Calculate descriptive statistics
grouped_statistics = df.groupby('category')['rating'].describe()
print(grouped_statistics)

                       count      mean       std  min    25%   50%    75%  max
category                                                                      
Car&Motorbike            1.0  3.800000       NaN  3.8  3.800  3.80  3.800  3.8
Computers&Accessories  453.0  4.154967  0.261045  3.0  4.000  4.20  4.300  5.0
Electronics            526.0  4.081749  0.269620  2.8  3.900  4.10  4.300  4.7
Health&PersonalCare      1.0  4.000000       NaN  4.0  4.000  4.00  4.000  4.0
Home&Kitchen           447.0  4.040716  0.334687  2.0  3.900  4.10  4.200  4.8
HomeImprovement          2.0  4.250000  0.353553  4.0  4.125  4.25  4.375  4.5
MusicalInstruments       2.0  3.900000  0.141421  3.8  3.850  3.90  3.950  4.0
OfficeProducts          31.0  4.309677  0.149119  4.0  4.200  4.30  4.400  4.5
Toys&Games               1.0  4.300000       NaN  4.3  4.300  4.30  4.300  4.3


In [261]:
# Descriptive statistics for reviews across categories

grouped_statistics = df.groupby('category')['review_content'].describe()
print(grouped_statistics)

                      count unique  \
category                             
Car&Motorbike             1      1   
Computers&Accessories   453    331   
Electronics             526    391   
Health&PersonalCare       1      1   
Home&Kitchen            448    448   
HomeImprovement           2      2   
MusicalInstruments        2      2   
OfficeProducts           31     31   
Toys&Games                1      1   

                                                                     top freq  
category                                                                       
Car&Motorbike          as of now its working fine but dont expect mor...    1  
Computers&Accessories  good productlong wirecharges goodnicei bought ...    7  
Electronics            i am not big on camera usage personally i was ...    8  
Health&PersonalCare    this is aesthetically the most appealing digit...    1  
Home&Kitchen           quality of adhesive is very good i used it to ...    1  
HomeImprovement      

### Discount Analysis

In [262]:
# Analyze the number of discounts across categories

# Clean and convert the 'actual_price' column
df['actual_price'] = df['actual_price'].apply(price_convert)

# Aggregate sum, mean for discounts per category
df['discount'] = df['actual_price'] - df['discounted_price']
total_discount = df.groupby('category')['discount'].agg(["sum","mean"])
print(total_discount)

                              sum         mean
category                                      
Car&Motorbike             1661.00  1661.000000
Computers&Accessories   380960.66   840.972759
Electronics            2188909.00  4161.423954
Health&PersonalCare       1001.00  1001.000000
Home&Kitchen            820493.19  1831.458013
HomeImprovement            924.00   462.000000
MusicalInstruments        1418.00   709.000000
OfficeProducts            2964.00    95.612903
Toys&Games                   0.00     0.000000


In [263]:
# Analyze the average and median discount percentage per category

def discount_clean(discount_str):
    discount_str = discount_str.replace('%', '')
    return int(discount_str)

# Clean the 'discount_percentage' column
df['discount_percentage'] = df['discount_percentage'].apply(discount_clean)

# Calculate mean and median for discount percentage
discount_percent = df.groupby('category')['discount_percentage'].agg(['mean','median'])
print(discount_percent)

                            mean  median
category                                
Car&Motorbike          42.000000    42.0
Computers&Accessories  54.024283    58.0
Electronics            50.828897    54.0
Health&PersonalCare    53.000000    53.0
Home&Kitchen           40.120536    41.5
HomeImprovement        57.500000    57.5
MusicalInstruments     46.000000    46.0
OfficeProducts         12.354839     5.0
Toys&Games              0.000000     0.0


## Review Sentiment Analysis

In [264]:
# Download pre-trained sentiment analysis lexicon
nltk.download('vader_lexicon')

# Initialize the Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to assign sentiment labels
def get_sentiment(review):
    sentiment_score = sia.polarity_scores(review)
    if sentiment_score['compound'] >= 0.05:
        return 'positive'
    elif sentiment_score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the function to the 'review_content' column
df['sentiment'] = df['review_content'].apply(get_sentiment)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [265]:
# Encode 'sentiment' as binary
sentiment_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}
df['sentiment_encoded'] = df['sentiment'].map(sentiment_mapping)

# Handle missing values (NaN)
df.dropna(subset=['rating', 'sentiment_encoded'], inplace=True)

# Calculate the correlation matrix
correlation_matrix = df[['rating', 'sentiment_encoded']].corr()

# Print the correlation between 'rating' and 'sentiment_encoded'
print(correlation_matrix)

                     rating  sentiment_encoded
rating             1.000000           0.111888
sentiment_encoded  0.111888           1.000000


#### The analysis of the relationship between sentiment and product rating in our dataset reveals a weak positive correlation (correlation coefficient of 0.111). This means that, on average, higher product ratings tend to be associated with slightly more positive sentiment. However, the relationship is not strong, indicating that factors beyond rating significantly influence sentiment, and that rating alone is not a reliable predictor of sentiment in our dataset.