In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# I tried NLP sentiment analysis with VADER, followed by clustering and PCA, to find the segments.

In [None]:
import warnings
# Suppress every warning category from here on. Note that this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

# 0. Preprocessing

In [None]:
# Load the shoe-review CSV from the Kaggle input directory into `df`.
df=pd.read_csv('/kaggle/input/men-women-shoes-reviews/Shoes_Data.csv')

In [None]:
# Show column names, dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
def _to_number(column, tokens, dtype):
    """Strip unit text from a scraped column and cast it to a numeric dtype.

    Parameters
    ----------
    column : pandas.Series
        Raw values; each one is converted to ``str`` before cleaning.
    tokens : list of str
        Literal substrings removed in the given order (plain text, not regex).
    dtype : type
        Target dtype handed to ``Series.astype`` (``float`` or ``int`` here).

    Returns
    -------
    pandas.Series
        The cleaned column cast to ``dtype``.
    """
    text = column.astype(str)
    for token in tokens:
        text = text.str.replace(token, '', regex=False)
    # A thousands separator (e.g. '1,299') would make the numeric cast fail.
    return text.str.replace(',', '', regex=False).str.strip().astype(dtype)


# 'x out of 5 stars' -> float, '₹x' -> float, 'x ratings' / 'x rating' -> int.
# ' ratings' is removed before ' rating' so the plural never leaves a stray 's'.
df['rating'] = _to_number(df['rating'], [' out of 5 stars'], float)
df['price'] = _to_number(df['price'], ['₹'], float)
df['total_reviews'] = _to_number(df['total_reviews'], [' ratings', ' rating'], int)

# 1. Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Correlate numeric columns only: the frame still holds text columns (title,
# reviews, Shoe Type), and recent pandas versions raise instead of silently
# dropping them inside DataFrame.corr().
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
plt.figure(figsize=(10, 5))
# Overlay one price histogram per shoe type on the same axes.
for shoe_type in ("Men", "Women"):
    prices = df.loc[df["Shoe Type"] == shoe_type, "price"]
    sns.distplot(prices, kde=False, rug=False)

plt.legend(labels=['Men', 'Women'])
plt.show()

In [None]:
fig, ax = plt.subplots()
# Box plot (outliers hidden) with the individual prices drawn on top as points.
shared_args = dict(x='Shoe Type', y='price', data=df, ax=ax)
sns.boxplot(showfliers=False, **shared_args)
sns.stripplot(jitter=True, color='black', **shared_args)

plt.show()

In [None]:
# Rating vs. price, coloured by shoe type and sized by number of reviews.
fig, ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(x='rating', y='price', hue='Shoe Type', size='total_reviews',
                data=df, sizes=(50, 500), ax=ax)
ax.tick_params(labelsize=10)
plt.xticks(rotation=0)
# The font size is set on the legend that is actually kept: a legend created
# before plotting has no entries and is replaced by this call.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=10)

# 2. NLP: Sentiment Analysis with VADER

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string

list_stopwords = set(stopwords.words('english'))
# Build the punctuation-stripping table once instead of once per token.
_strip_punct = str.maketrans('', '', string.punctuation)


def _clean_tokens(text):
    """Tokenize one lower-cased review and return its cleaned word list.

    Steps, in this order: tokenize, drop English stopwords, strip punctuation
    characters from each remaining token, then drop tokens of length <= 1.
    """
    kept = (tok for tok in word_tokenize(text) if tok not in list_stopwords)
    stripped = (tok.translate(_strip_punct) for tok in kept)
    return [tok for tok in stripped if len(tok) > 1]


# One pass over the column instead of four separate apply() traversals.
df['reviews2'] = df['reviews'].str.lower().apply(_clean_tokens)

In [None]:
# One row per token, keeping the index of the review each token came from.
df1 = df['reviews2'].explode().to_frame()

In [None]:
pd.set_option('display.max_rows', 50)
# Count how often each token occurs and chart the 50 most frequent ones.
word_counts = df1.groupby('reviews2')['reviews2'].count()
top_words = word_counts.sort_values(ascending=False).head(50)
top_words.plot.bar(figsize=(10, 5))

# I split the rating into three classes, stored as `Rpolarity`: over 3 = 1, exactly 3 = 0, and under 3 = -1.

In [None]:
def rating_judge(ex):
    """Map a numeric rating to a polarity label.

    Returns 1 when ``ex`` is above 3, -1 when it is below 3, and 0 otherwise
    (exactly 3, or a value such as NaN that compares neither way).
    """
    if ex > 3:
        return 1
    if ex < 3:
        return -1
    return 0

In [None]:
# Label every row with its rating-based polarity (-1 / 0 / 1).
df['Rpolarity'] = df['rating'].apply(rating_judge)
df

In [None]:
# Number of rows (counted via non-null titles) in each rating polarity class.
titles_per_polarity = df.groupby('Rpolarity')['title'].count()
titles_per_polarity.plot.bar()

# Most ratings are 'Positive' (over 3).

# Next, I tried Vader for sentiment analysis.

In [None]:
# Use the explicit %pip magic so the package is installed into the running
# kernel's environment and the cell does not depend on IPython automagic.
%pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def rounder(num):
    """Round a score to the nearest integer polarity.

    For a VADER compound score in [-1, 1] the result is -1, 0 or 1. Because
    this is plain rounding (ties go to the even integer), any score with an
    absolute value of 0.5 or less maps to 0; it is not a sign function.

    Parameters
    ----------
    num : float
        Score to round.

    Returns
    -------
    int
        ``round(num)``.
    """
    return round(num)

In [None]:
# Create one VADER analyzer instance; the scoring loop below reuses it.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score every raw review with VADER and round the compound score to -1 / 0 / 1.
Vpol = [rounder(analyzer.polarity_scores(text)['compound']) for text in df['reviews']]
df['VPolarity'] = Vpol

# Split the rows by whether the rating-based and VADER-based polarities match.
polarity_match = df['Rpolarity'] == df['VPolarity']
vagree = df[polarity_match]
vnotagree = df[~polarity_match]

total = len(df)
agreed = len(vagree)
disagreed = len(vnotagree)
accuracy = agreed / total * 100

print(f"Overall length {total} ")
print(f"VADER agreements/disagreements {agreed}/{disagreed}")
print(f"Accuracy: {accuracy}% ")

In [None]:
# Preview the frame again, now including the Rpolarity and VPolarity columns.
df.head()

# 'Rpolarity' and 'VPolarity' agree about 70% of the time. It may be that some reviews with Rpolarity 1 still contain negative remarks.

# So I used both 'VPolarity' and 'Rpolarity' for clustering, together with 'price', 'rating' and 'total_reviews'.

# 3. Clustering and PCA with PyCaret

In [None]:
# Encode only the 'Shoe Type' column (Men -> 1, Women -> 0). A frame-wide
# replace would also rewrite any other cell that is exactly 'Men' or 'Women'.
# assign() returns a new frame, so `df` is rebound just as before.
df = df.assign(**{'Shoe Type': df['Shoe Type'].replace({'Men': 1, 'Women': 0})})

In [None]:
# Columns passed to clustering: numeric features plus both polarity labels.
feature_columns = ['price', 'rating', 'total_reviews', 'Shoe Type', 'Rpolarity', 'VPolarity']
df_ana = df.loc[:, feature_columns]

In [None]:
# Use the explicit %pip magic so the package is installed into the running
# kernel's environment and the cell does not depend on IPython automagic.
%pip install pycaret

In [None]:
from pycaret.clustering import *
import inspect

# `silent=True` skips the interactive dtype confirmation in PyCaret 2.x, but
# the parameter no longer exists in PyCaret 3.x and passing it raises a
# TypeError. Only pass it when the installed `setup` accepts it.
setup_options = {'normalize': True, 'session_id': 123}
if 'silent' in inspect.signature(setup).parameters:
    setup_options['silent'] = True
data_clust = setup(df_ana, **setup_options)

In [None]:
# Fit a k-means model with 4 clusters through PyCaret's create_model.
kmeans = create_model('kmeans',num_clusters = 4 )

In [None]:
# Attach the fitted model's cluster assignment to the data; the cells below
# group by the resulting 'Cluster' column.
kmean_results = assign_model(kmeans)
kmean_results.head()

In [None]:
# Number of non-null values per column in each cluster (i.e. cluster sizes).
kmean_results.groupby('Cluster').count()

In [None]:
# Per-cluster mean of every feature, used to characterise each segment.
kmean_results.groupby('Cluster').mean()

In [None]:
# Default PyCaret cluster plot; per the section heading this is presumably the
# PCA projection of the clusters — confirm against the PyCaret documentation.
plot_model(kmeans)

In [None]:
# Silhouette plot for the fitted k-means model.
plot_model(kmeans, plot = 'silhouette')

In [None]:
# Distribution plot of the 'price' feature for the fitted k-means model.
plot_model(kmeans, plot = 'distribution', feature = 'price')