# Adidas vs Nike Cluster Analysis

## 1.EDA
## 2.聚类(Clustering)
* 特征标准化----StandardScaler
* PCA降维-------PCA dimension reduction
* KMeans
* 关键字挖掘----cluster keyword digging

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


## 1.EDA

In [None]:
# Disable column-width truncation so long text cells are shown in full.
pd.options.display.max_colwidth = None

# Load the dataset and preview the first rows.
df = pd.read_csv('/kaggle/input/adidas-vs-nike/Adidas Vs Nike.csv')
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Distinct brand labels as they appear in the raw data.
df['Brand'].unique()

'Adidas Adidas ORIGINALS' 和 'Adidas ORIGINALS'出现重复，保留其一

In [None]:
# Strip the "Adidas" prefix and every space from the brand labels so that
# duplicated spellings of the same sub-brand collapse into one value.
df['Brand'] = df['Brand'].str.replace('Adidas', '').str.replace(' ', '')

# Append the brand to the description so that it takes part in the later
# word-frequency analysis.
df['Description'] = df['Description'].astype(str).str.cat(df['Brand'].astype(str), sep=' ')
df.head()

In [None]:
# Distinct brand labels after the cleanup.
df['Brand'].unique()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Keep only the columns from 'Listing Price' through 'Reviews'; the
# remaining columns are not needed for clustering.
# .copy() makes df1 an independent frame, so adding the 'Labels' column to
# it later does not write into a slice of df (SettingWithCopyWarning).
df1 = df.loc[:, 'Listing Price':'Reviews'].copy()
df1.head()

查看一下各品牌的商品数量（即数据集中各品牌的记录数）

In [None]:
# Horizontal bar chart of the number of rows per brand, with the top and
# right borders hidden.
axes = plt.gca()
for side in ('right', 'top'):
    axes.spines[side].set_color('none')

brand_counts = df1.Brand.value_counts()
brand_counts.plot(kind='barh')

# Write each count inside its bar, 100 units left of the bar end.
for row, count in enumerate(brand_counts.values):
    plt.text(count - 100, row, "%s" % count, c='white', weight='bold')

显然Adidas Core/Neo的商品数量是最多的，再来查看一下各品牌'Listing Price'、'Sale Price'、'Discount'、'Rating'、'Reviews'的均值

In [None]:
# One horizontal bar chart per metric, showing the mean value for each brand.
for metric in ['Listing Price', 'Sale Price', 'Discount', 'Rating', 'Reviews']:
    print(metric, ':')
    plt.figure(figsize=(10, 4), frameon=False)
    axes = plt.gca()
    for side in ('right', 'top'):
        axes.spines[side].set_color('none')

    brand_means = df1.groupby('Brand')[metric].mean()
    brand_means.plot(kind='barh')

    # Write each mean inside its bar, at 75% of the bar length.
    for row, mean_value in enumerate(brand_means.values):
        plt.text(0.75 * mean_value, row, "%.1f" % mean_value, c='white', weight='bold')
    plt.show()

* 在Adidas的子品牌中，Originals的订单均价最高， Core/Neo的订单均价最低
* 在Adidas的子品牌中，Originals的平均折扣力度最小，Core/Neo的平均折扣力度最大

Nike的平均折扣为0，并且Listing Price、Rating、Reviews都偏低，需要查看一下具体的数据

In [None]:
# All rows whose brand is Nike.
df.loc[df['Brand'] == 'Nike']

In [None]:
# All rows with a listing price of zero.
df.loc[df['Listing Price'] == 0]

In [None]:
# Brands that have at least one row with a listing price of zero.
df.loc[df['Listing Price'] == 0, 'Brand'].unique()

标价（Listing Price）为0的情况只存在于Nike品牌，且Nike约66%的记录标价为0，背后的原因无从得知

接下来我们查看一下各品牌中出现次数最多的产品是哪些：

In [None]:
# For every brand, chart the five product names that occur most often.
for brand in df['Brand'].unique():
    print(brand, ':')
    plt.figure(figsize=(10, 4), frameon=False)
    axes = plt.gca()
    for side in ('right', 'top'):
        axes.spines[side].set_color('none')

    # Rows per product name within this brand.
    per_product = df.loc[df['Brand'] == brand].groupby('Product Name')['Listing Price'].count()
    # Take the five largest, then sort ascending so the largest bar is on top.
    top_five = per_product.sort_values(ascending=False).head(5).sort_values()
    top_five.plot(kind='barh')

    # Write each count inside its bar, at 95% of the bar length.
    for row, count in enumerate(top_five.values):
        plt.text(0.95 * count, row, "%s" % count, c='white', weight='bold')
    plt.show()

查看各个特征之间的相关性：

In [None]:
plt.figure(figsize=(14, 10))
# Spearman rank correlation between the numeric columns. Non-numeric columns
# are excluded explicitly because DataFrame.corr in pandas >= 2.0 raises on
# them instead of silently dropping them.
corr = df.select_dtypes(include='number').corr(method='spearman')
# Hide weak correlations (r**2 < 0.04, i.e. |r| < 0.2). The mask is derived
# from the same Spearman matrix that is drawn, so the hidden cells are
# consistent with the plotted values.
sns.heatmap(corr, annot=True, mask=(corr ** 2 < 0.04))

值得一提的是，折扣力度与评论数之间存在较弱的正相关性

## 2.聚类
### 特征标准化

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# Numeric columns used as clustering features.
df_clu = df1.loc[:, ['Listing Price', 'Sale Price', 'Discount', 'Rating', 'Reviews']]

# Standardise the features with StandardScaler before PCA / KMeans.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_clu)

### PCA主成分分析降维

In [None]:
# Fit PCA with all components kept, to inspect the variance of each one.
pca = PCA().fit(df_scaled)

# Bar chart of the variance explained by each principal component; used to
# decide how many components to keep.
features = range(pca.n_components_)
plt.bar(x=features, height=pca.explained_variance_)
plt.xlabel('PCA features')
plt.ylabel('Variance')

可以看到特征4的方差较低，可以认为有4个主成分

In [None]:
# Re-fit PCA keeping 4 components and project the scaled features onto them.
pca = PCA(n_components=4)
df_4d = pca.fit_transform(df_scaled)

接下来通过elbow法和轮廓系数确定KMeans聚类的类别数

### KMeans

In [None]:
# Evaluate k = 2..10 with two criteria: inertia (for the elbow plot) and the
# silhouette score.
num_clusters = range(2, 11)
inertias = []
sil_scores = []
for k in num_clusters:
    # Fixed seed (the same value the final model uses) so the curves are
    # reproducible between runs and comparable with the model chosen from them.
    model = KMeans(n_clusters=k, random_state=2)
    model.fit(df_4d)
    inertias.append(model.inertia_)
    sil_scores.append(silhouette_score(df_4d, model.labels_))

# Elbow plot: inertia as a function of k.
plt.plot(num_clusters, inertias, '-o')

In [None]:
# Silhouette score as a function of the number of clusters k.
plt.plot(num_clusters,sil_scores,'-o')

结合KMeans惯性图和轮廓系数来看，将数据分为4类比较合适

In [None]:
from collections import Counter

# Final model: 4 clusters, fitted on the PCA-reduced data with a fixed seed.
kmeans = KMeans(n_clusters=4, random_state=2).fit(df_4d)

# Number of samples assigned to each cluster label.
Counter(kmeans.labels_)

经过降维的数据仍然有4个维度，可以通过t-SNE将数据再次降到2维后可视化

In [None]:
from sklearn.manifold import TSNE

# Embed the PCA-reduced data with t-SNE for visualisation only; the
# clustering itself was done on df_4d.
tsne = TSNE(learning_rate=700, random_state=2, perplexity=200)
tsne_features = tsne.fit_transform(df_4d)

# Scatter the first two embedding columns, coloured by KMeans label.
plt.figure(figsize=(14, 10))
xs, ys = tsne_features[:, 0], tsne_features[:, 1]
plt.scatter(xs, ys, c=kmeans.labels_)
plt.show()

In [None]:
# Add the cluster label computed by KMeans to the table as a new column.
df1['Labels'] = kmeans.labels_
df1

查看一下各类别分别包含哪些品牌

In [None]:
# One count plot per cluster label (two panels per row) showing how many
# rows of each brand fall into that cluster.
sns.set_palette('Paired', n_colors=4)
sns.catplot(
    data=df1,
    x='Brand',
    col='Labels',
    col_wrap=2,
    kind='count',
)

* 0类中没有 Adidas Core/Neo
* 3类的主要品牌是Nike
* 2类中Adidas（子品牌）数量明显高于其他三类，且2类中没有Nike

In [None]:
# For each metric, draw one strip plot per brand with the cluster label on
# the x axis and the metric value on the y axis.
for metric in ['Listing Price', 'Sale Price', 'Discount', 'Rating', 'Reviews']:
    sns.catplot(
        data=df1,
        kind='strip',
        x='Labels',
        y=metric,
        col='Brand',
        col_wrap=4,
        height=3,
        aspect=0.9,
        jitter=0.1,
    )

* 0类的订单价格最高，且不包含Adidas Core/Neo品牌
* 2类的折扣力度最大，折扣价最低
* Nike由于没有折扣，所以折扣价最高
* 3类的评分较低

### 关键字挖掘

In [None]:
from nltk import word_tokenize
import re

# Words that carry no useful signal for keyword mining: common English
# function words, generic footwear terms, and the brand names themselves.
# Order and contents (including the repeated 'in') are kept as-is.
stop_words = [
    'of', 'an', 'a', 'are', 'is', 'with', 'the', 'adidas',
    'on', 'in', 'this', 'by', 'to', 'and', 'as', 'for',
    'have', 'has', 'at', 'in', 'its', 'these', 'it', 'you',
    'your', 'that', 'look', 'shoe', 'shoes', 'outsole', 'midsole', 'feel',
    'feet', 'every', 'from', 'they', 'while', 'upper', 'style', 'foot',
    'provides', 'nike', 'originals', 'coreneo', 'sportperformance', 'comfortable', 'run', 'new',
]

def remove_noise(text, stop_words=stop_words):
    """Tokenize *text* and return its filtered, lower-cased words.

    Each token produced by ``word_tokenize`` has every non-word character
    removed. A token is kept only if more than one character remains and
    its lower-cased form is not in *stop_words*.

    Args:
        text: The string to tokenize.
        stop_words: Collection of lower-case words to drop. Defaults to the
            module-level ``stop_words`` list.

    Returns:
        A list of the kept tokens, lower-cased, in their original order.
    """
    cleaned_words = []
    for token in word_tokenize(text):
        # Raw string: '\W' in a plain literal is an invalid escape sequence,
        # which newer Python versions warn about.
        word = re.sub(r'\W', '', token)
        lowered = word.lower()
        if len(word) > 1 and lowered not in stop_words:
            cleaned_words.append(lowered)
    return cleaned_words

In [None]:
# TF-IDF vectorizer used to extract each cluster's keywords.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    tokenizer=remove_noise,   # custom tokenizer defined in this notebook
    ngram_range=(1, 2),       # single words and two-word phrases
    min_df=0.1,               # drop terms in fewer than 10% of documents
    max_df=0.9,               # drop terms in more than 90% of documents
    max_features=15,          # keep at most 15 terms
)

In [None]:
# Render the keywords of each cluster as a word cloud.
from wordcloud import WordCloud

w = WordCloud(scale=2.5, background_color='white', relative_scaling=0, max_words=15)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on releases that predate it.
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# Iterate over the labels actually present instead of hard-coding 0..3, so
# the cell keeps working if the number of clusters changes.
for i in sorted(df1['Labels'].unique()):
    # Learn the vocabulary from this cluster's descriptions only; the
    # transformed matrix itself is not needed.
    tfidf.fit(df1[df1.Labels == i]['Description'])
    # Fetch the terms once; str() keeps the printed list free of NumPy
    # scalar wrappers.
    keywords = [str(name) for name in get_names()]
    print('Label ', i, ': \n')
    print(keywords)
    # Same text as before: every keyword preceded by a single space.
    text = ''.join(' ' + keyword for keyword in keywords)
    w.generate(text)
    plt.figure(figsize=(14, 10))
    plt.imshow(w)
    plt.show()