# Step 2: Feature Engineering

模型的目标是根据用户的第一次购买信息来预测是否会再次购买。

在上一步中，我们发现各商品是有类别的，因此不同的用户关注的商品类别是不同的，因此首先我们把商品根据关键词分门别类。另外一个原因是，假如不分类，一共3000多种商品，根据用户的购买行为来预测下一种商品显然是无法做到精确预测的，然而分组后，我们只需要预测下一次购买在5类商品的哪一类就行，可靠性更强。

## 2.1 商品特征
### 2.1.1 商品分类

In [17]:
# 导入相关modules
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import datetime



In [18]:
# 导入数据
df = pd.read_csv('./temp/cleaned_data.csv')
keywords = np.load('./temp/keywords.npy')[:193]

这里一共有295个关键词

如何分类？

用独热编码并用稀疏矩阵记录下来。

$$
 X = \left(\begin{matrix}
   x_{11} & x_{12} & x_{13} & \dots & x_{1M}\\
   x_{21} & x_{22} & x_{23} & \dots & x_{2M} \\
   \vdots & \vdots & \vdots & \vdots & \vdots \\
   x_{N1} & x_{N2} & x_{N3} & \dots & x_{NM}
  \end{matrix}\right) 
$$

其中$x_{nm}$表示第n商品中含有第m个关键词

In [19]:
# 创建稀疏矩阵
df["Description"] = df["Description"].apply(lambda l: l.lower())
description_diff = df["Description"].unique()
X = pd.DataFrame()
for key in keywords:
    X.loc[:, key] = list(map(lambda x: int(key in x), description_diff))

In [20]:
display(X.head(4))
display(description_diff[:4])

Unnamed: 0,heart,vintage,set,bag,box,glass,christmas,design,candle,holder,...,plant,diner,house,square,lace,wallet,point,flag,circus,bin
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


array(['white hanging heart t-light holder', 'white metal lantern',
       'cream cupid hearts coat hanger',
       'knitted union flag hot water bottle'], dtype=object)

与此同时，由于价格也带着商品类别的信息，比如同类别的商品具有相近的价格。因此我们在特征矩阵$X$上再增加价格区间特征。

In [21]:
threshold = [0, 1, 2, 3, 5, 10]
label_col = []
list_products = df['Description'].unique()
for i in range(len(threshold)):
    if i == len(threshold)-1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i],threshold[i+1])
    label_col.append(col)
    X.loc[:, col] = 0

for i, prod in enumerate(list_products):
    prix = df[df['Description'] == prod]['UnitPrice'].mean()
    j = 0
    while prix > threshold[j]:
        j+=1
        if j == len(threshold): break
    X.loc[i, label_col[j-1]] = 1

KeyboardInterrupt: 

In [None]:
print("{:<8} {:<20} \n".format('区间', '商品数量') + 20 * '-')
for i in range(len(threshold)):
    if i == len(threshold)-1:
        col = '.>{}'.format(threshold[i])
    else:
        col = '{}<.<{}'.format(threshold[i],threshold[i+1])    
    print("{:<10}  {:<20}".format(col, X.loc[:, col].sum()))

接下来对不同商品之间特征距离的刻画并以此来聚类，这里我们采取k-means方法，距离用sklearn默认的欧氏距离。

In [None]:
matrix = X.values
for n_clusters in range(3,10):
    kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

可以看到当用K-Means算法分成5组时，在silhouette_score的意义下最好。因此我们选择分成5组，接下来看一下每组中含有词的情况

In [None]:
n_clusters = 5
silhouette_avg = 0
while silhouette_avg < 0.14:
    kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)
    print(silhouette_avg)
print(silhouette_avg)

In [None]:
pd.Series(clusters).value_counts()

In [None]:
list_des = pd.DataFrame(description_diff)

occurence = [dict() for _ in range(n_clusters)]

for i in range(n_clusters):
    list_cluster = list_des.loc[clusters == i]
    for word in keywords:
        if word in ['art', 'set', 'heart', 'pink', 'blue', 'tag']: continue
        occurence[i][word] = sum(list_cluster.loc[:, 0].str.contains(word))

In [None]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = int(360.0 * tone / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)

def make_wordcloud(liste, increment):
    ax1 = fig.add_subplot(4,2,increment)
    words = dict()
    trunc_occurences = liste[0:150]
    for s in trunc_occurences:
        words[s[0]] = s[1]
    wordcloud = WordCloud(width=1000,height=400, background_color='lightgrey', 
                          max_words=1628,relative_scaling=1,
                          color_func = random_color_func,
                          normalize_plurals=False)
    wordcloud.generate_from_frequencies(words)
    ax1.imshow(wordcloud, interpolation="bilinear")
    ax1.axis('off')
    plt.title('cluster nº{}'.format(increment-1))

fig = plt.figure(1, figsize=(14,14))
color = [0, 160, 130, 95, 280, 40, 330, 110, 25]
for i in range(n_clusters):
    list_cluster_occurences = occurence[i]

    tone = color[i] # define the color of the words
    liste = []
    for key, value in list_cluster_occurences.items():
        liste.append([key, value])
    liste.sort(key = lambda x:x[1], reverse = True)
    make_wordcloud(liste, i+1)       

Note: 通过K-means算法聚类后，这里我们可以看到一些合理性，例如一组跟礼物有关(keywords: Christmas, card, wood, gift...), 有一组跟首饰有关(keywords: necklace, lace, bracelet, silver, ...)。但是还是很多词同时出现在多个组中(keywords: vintage, glass, pot, bag)，这是一个后期可以优化的地方。

> 可以再可视化以下商品分类的结果！

### 2.1.2 订单特征

由于我们考虑的目标是只根据用户第一次购买的信息去预测接下来的购买行为。因此先把数据进行按人按订单的整理。

In [None]:
list_des = list_des.rename(columns={0:"Description"})
list_des["Cluster"] = clusters


In [None]:
desc_cluster = {}
for k, v in zip(description_diff, clusters):
    desc_cluster[k] = v
df["Cluster"] = df["Description"].map(desc_cluster)

In [None]:
df[["InvoiceNo", "StockCode", "Description", "Cluster"]]

In [None]:
# 对每行进行销售额统计
for i in range(n_clusters):
    col_name = "Cluster_{}".format(i)
    df_temp = df[df["Cluster"] == i]
    price_temp = df_temp["Quantity"] * df_temp["UnitPrice"]
    df[col_name] = price_temp
    df[col_name].fillna(0, inplace=True)
    
df["Total_price"] = df["Quantity"] * df["UnitPrice"]
df[["InvoiceNo", "Description", "Cluster", "Total_price", "Cluster_0", "Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4"]].head(10)

In [None]:
# 对每个订单进行销售额统计
df_cluster = df.groupby(["CustomerID", "InvoiceNo"], as_index=False).Total_price.sum()
for i in range(n_clusters):
    col = "Cluster_{}".format(i)
    df_cluster.loc[:, col] = df.groupby(by=["CustomerID", "InvoiceNo"], as_index=False)[col].sum()

In [None]:
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

In [None]:
# 添加日期信息
df['InvoiceDate_int'] = df['InvoiceDate'].astype('int64')
df_temp = df.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate_int'].mean()
df.drop('InvoiceDate_int', axis = 1, inplace = True)
df_cluster.loc[:, 'InvoiceDate'] = pd.to_datetime(df_temp['InvoiceDate_int'])

In [None]:
df_cluster.sort_values('CustomerID', ascending = True)[:5]

### 2.1.3 按时间分集
注意到这些订单的跨度为12个月，我们可以选择把前10个月的订单作为训练集，把后2个月的订单作为测试集

In [None]:
print(df_cluster['InvoiceDate'].min(), '->',  df_cluster['InvoiceDate'].max())

In [None]:
set_training = df_cluster[df_cluster['InvoiceDate'] < pd.to_datetime(datetime.date(2011,10,1))]
set_testing = df_cluster[df_cluster['InvoiceDate'] >= pd.to_datetime(datetime.date(2011,10,1))]
df_cluster = set_training.copy(deep = True)

## 2.2 用户特征

In [None]:
print("用户数量:", df.CustomerID.nunique())

由于用户数量达4300位，因此把用户行为相近的用户分类再进行接下来的预测是一个可取的选择。

### 2.2.1 用户特征整理

In [None]:
df_customer = df_cluster.groupby(by=["CustomerID"], as_index=False).InvoiceNo.count().rename(columns={"InvoiceNo":"count"})
df_customer.loc[:, ["Total_price", "Cluster_0", "Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4"]] = df_cluster.groupby(by=["CustomerID"], as_index=False)["Total_price", "Cluster_0", "Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4"].sum()

In [None]:
last_date = df_cluster['InvoiceDate'].max().date()

first_registration = pd.DataFrame(df_cluster.groupby(by=['CustomerID'])['InvoiceDate'].min())
last_purchase      = pd.DataFrame(df_cluster.groupby(by=['CustomerID'])['InvoiceDate'].max())

test  = first_registration.applymap(lambda x:(last_date - x.date()).days)
test2 = last_purchase.applymap(lambda x:(last_date - x.date()).days)

df_customer.loc[:, 'LastPurchase'] = test2.reset_index(drop = False)['InvoiceDate']
df_customer.loc[:, 'FirstPurchase'] = test.reset_index(drop = False)['InvoiceDate']

df_customer[:5]

再添加一些用户特征, 如每个用户的最小消费额，最大消费额，平均消费额

In [None]:
df_customer.loc[:, ["min", "max", "mean"]] = df_cluster.groupby(by=["CustomerID"], as_index=False).Total_price.agg(["min","max","mean"]).reset_index(drop=False)
df_customer.head(5)

In [None]:
n1 = df_customer[df_customer['count'] == 1].shape[0]
n2 = df_customer.shape[0]
print("只够买过一次的用户占总用户的比例: {:<2}/{:<5} ({:<2.2f}%)".format(n1,n2,n1/n2*100))

可以看出只够买一次的用户非常多，占比40%，运营过程中一个目标就是如何留住这些客户。

### 2.2.2 对用户进行分类
选取有意义的特征，对特征进行标准化，再通过PCA降维，再通过k-Means算法对用户进行无监督学习

In [None]:
customer_col = ["count", "min", "max", "mean", "Total_price", "Cluster_0", "Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4"]

In [None]:
matrix = df_customer[customer_col].copy(deep=True).values

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
from matplotlib.font_manager import FontProperties
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=12)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime, nltk, warnings
import matplotlib.cm as cm
import itertools
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from wordcloud import WordCloud, STOPWORDS
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
%matplotlib inline

In [None]:
scaler = StandardScaler()
scaler.fit(matrix)
print('variables mean values: \n' + 90*'-' + '\n' , scaler.mean_)
scaled_matrix = scaler.transform(matrix)

In [None]:
pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)

In [None]:
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid')
sns.barplot(np.arange(1,matrix.shape[1]+1), pca.explained_variance_ratio_, alpha=0.5, color = 'g')
plt.xlim(0, 10)

ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])

plt.ylabel('解释方差', fontsize = 14, fontproperties=font_set)
plt.xlabel('主成分', fontsize = 14, fontproperties=font_set)
plt.legend(loc='best', fontsize = 13);

In [None]:
pca.explained_variance_ratio_.cumsum()

由此我们可以看到当主成分为6个时，累计解释方差已经大于0.97，因此我选取前6大主成分代替10个原特征进行接下来的k-means分组。

In [None]:
pca_selected = pca_samples[:, :5]
np.shape(pca_selected)

In [None]:
for i in range(5,14):
    n_clusters = i
    kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=100)
    kmeans.fit(scaled_matrix)
    clusters_clients = kmeans.predict(scaled_matrix)
    silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
    print('silhouette score: {:<.3f}'.format(silhouette_avg))

这里经过调试发现，在组数为15时，各组人数相对分散。

In [None]:
n_clusters = 15
kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=300)
kmeans.fit(scaled_matrix)
clusters_clients = kmeans.predict(scaled_matrix)
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
print('silhouette score: {:<.3f}'.format(silhouette_avg))

In [None]:
pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns = ["客户人数"]).T

In [None]:
group_num = list(pd.Series(clusters_clients).value_counts()[pd.Series(clusters_clients).value_counts()>50].index)

In [None]:
mat = pd.DataFrame(pca_selected)
mat['cluster'] = pd.Series(clusters_clients)
mat50 = mat[mat['cluster'].apply(lambda l: l in group_num)]
mat50

In [None]:
import matplotlib.patches as mpatches

sns.set_style("white")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})

LABEL_COLOR_MAP = {0:'r', 1:'tan', 2:'b', 3:'k', 4:'c', 5:'g', 6:'deeppink', 7:'skyblue', 8:'darkcyan', 9:'orange',
                   10:'yellow', 11:'tomato', 12:'seagreen', 13:"#9b59b6", 14:"#3498db"}
label_color = [LABEL_COLOR_MAP[l] for l in mat50['cluster']]

fig = plt.figure(figsize = (12,10))
increment = 0
for ix in range(6):
    for iy in range(ix+1, 6):   
        increment += 1
        ax = fig.add_subplot(4,3,increment)
        ax.scatter(mat50[ix], mat50[iy], c= label_color, alpha=0.5) 
        plt.ylabel('PCA {}'.format(iy+1), fontsize = 12)
        plt.xlabel('PCA {}'.format(ix+1), fontsize = 12)
        ax.yaxis.grid(color='lightgray', linestyle=':')
        ax.xaxis.grid(color='lightgray', linestyle=':')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        
        if increment == 12: break
    if increment == 12: break
        
#_______________________________________________
# I set the legend: abreviation -> airline name
comp_handler = []
for i in range(n_clusters):
    comp_handler.append(mpatches.Patch(color = LABEL_COLOR_MAP[i], label = i))

plt.legend(handles=comp_handler, bbox_to_anchor=(1.1, 0.9), 
           title='Cluster', facecolor = 'lightgrey',
           shadow = True, frameon = True, framealpha = 1,
           fontsize = 13, bbox_transform = plt.gcf().transFigure)

plt.tight_layout()