In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import networkx as nx 
import matplotlib.pyplot as plt


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


# Any results you write to the current directory are saved as output.

# 数据预处理阶段

## 导入数据
在开始阶段，我们认为可能会用到的数据有`Teams.csv` \ `TeamMemberships.csv` \ `Competitions.csv` \ `Users.csv`，于是我们将这些数据先导入。

In [None]:
TeamData = pd.read_csv('../input/Teams.csv',index_col='Id')
TeamMembershipData = pd.read_csv('../input/TeamMemberships.csv',index_col = 'Id')
CompetitionData = pd.read_csv('../input/Competitions.csv',index_col = 'Id')
UserData = pd.read_csv('../input/Users.csv',index_col='Id')

查看数据结构

In [None]:
TeamMembershipData.head(5)

我们可以看到TeamMembership有1,341,819行数据，其中RequestDate列的非空数据少于另外两列，可能是因为缺失值的存在。

In [None]:
TeamMembershipData.info()

In [None]:
TeamData.info()

## TeamMemberShip特征工程
在TeamMembership的特征工程中，我们首先通过查看哪一列有缺失值，再去考虑如何处理缺失数据

In [None]:
TeamMembershipData.isnull().any()

查看缺失值的比重，确定处理方式。

In [None]:
TeamMembershipData.RequestDate.isna().value_counts()

In [None]:
TeamMembershipData.RequestDate.notna().sum()/len(TeamMembershipData)

我们对比了在`TeamMembership`中的数据，发现缺失的数据只占全部数据的很少一部分（0.5%）；在此我们想到两种处理方法：
- 直接去掉，因为所占比重不多，不会破坏很多数据。
- 用前后值来进行填充。

后来我们发现缺失值有个特点——大部分时间的缺失其ID都较小，在此我们认为是由于统计关系创建时间这一功能是在后期加入的功能，在前期没有收集这个数据，也就导致了部分数据的缺失。
那么有一个队伍创立了很长时间但在清洗中被删掉是不合理的，所以我们将时间中出现的第一个数据作为填补值进行填补。

In [None]:
# Back-fill every missing value with the next valid value further down the same column.
# DataFrame.bfill replaces the deprecated fillna(method='bfill') spelling.
TeamMembershipData.bfill(axis=0, inplace=True)

### TeamData特征工程

之后我们对`TeamData`做一下特征工程

In [None]:
TeamData.isnull().any()

TeamLeaderId列表示的是队伍的领导者，而在此仍有很多缺失，我们认为这可能是由于队伍更换了队名等原因，队伍仍然存在于数据库中，但却没有了团队的领导者。

In [None]:
TeamData.TeamLeaderId.isnull().value_counts()

In [None]:
TeamData[TeamData.TeamLeaderId.isnull()].head(10)

通过观察整体数据，大部分队伍连排名都没有，我们认为这样的数据没有太多价值，于是将没有Leader的队伍全部删掉。

In [None]:
missval_index_1 = TeamData[TeamData.TeamLeaderId.isnull()].index
TeamData.drop(missval_index_1,inplace=True)

#### TeamData Medal 生成探究

以下部分的代码是为了确定Teams数据的生成过程以及缺失值产生的过程，在此我选用了比赛Id为2439的数据（与下方代码一致），发现当授予日期有值的时候一共有四种可能。

In [None]:
test = pd.read_csv('../input/Teams.csv')
sit1 = pd.concat([test.CompetitionId,test.MedalAwardDate,test.Medal],axis = 1)
data2 = sit1[sit1.CompetitionId==2439]
data2.MedalAwardDate.value_counts()

在此我们推测，可能在比赛的过程中一二三等奖是分批发送的，可能主办方也会根据其需求来分不同时间来发布一二三等奖。

而最后奖项授予时间也有不同的表现，诸如有的`Medal`列为空，但`MedalAwardDate`列仍有值的存在，在此我推测原因可能是有些人完成了比赛却没有获得奖章，但他仍有获奖列这一属性。

所以没有完成比赛的`Medal`和`MedalAwardDate`都为空。


In [None]:
data2.MedalAwardDate.isnull().value_counts()

In [None]:
data2.Medal.value_counts()

此外，TeamData数据中，大部分队伍都没有获得过奖牌，通过查看各种该资料我们了解到，在kaggle比赛中，大多数队伍都无法获奖；即使有提交代码或者比赛结果。

但在我们进行特征工程时，数据大片的缺失需要我们进行填制处理。我们的注意到:在没有获奖的情况中，参加比赛并提交参赛结果与参加比赛但没有提交参赛结果在ScoreFirstSubmittedDate上的表现是不同的（提交了参赛结果，该列属性不为空）。

由此看来，我们将奖牌的空值填补为4和5（奖牌一共有三等，4是完成了比赛但没有奖项的意思，5是参与但未完成比赛的意思）。

In [None]:
# Teams without a first-submission date never submitted anything.
index_1_val = TeamData.ScoreFirstSubmittedDate.isnull()

# Encoding used for missing medals: 5 = joined but never submitted, 4 = submitted but won no medal.
# The assignment has to go through a single .loc call: the previous chained form
# `TeamData[index_1_val].Medal = ...` wrote to a temporary copy, so no row ever received 5.
TeamData.loc[index_1_val & TeamData.Medal.isnull(), 'Medal'] = 5
# Column-level reassignment instead of chained fillna(inplace=True), which is not guaranteed
# to reach TeamData on recent pandas versions.
TeamData['MedalAwardDate'] = TeamData['MedalAwardDate'].fillna('1/1/2199')
TeamData['Medal'] = TeamData['Medal'].fillna(4)

TeamData.PublicLeaderboardRank.isnull().value_counts()

TeamData.PrivateLeaderboardRank.isnull().value_counts()

看了一下Public和Private中空值所占的比重（大概都在80%左右浮动），我们认为直接删去这部分数据太可惜了，我的想法有以下两种：
1. 用一个极大值来代替，诸如`1/1/2199`
2. 用（该行）最大值+1来代替

此外还有`MedalAwardDate	`（授予日期）这一个属性，我认为要可视化做得好的话就先不要删除，也赋予一个极大值`1/1/2199`

同理 提交日期我们也设置为`1/1/2199`（未完成比赛）

In [None]:
# Sentinel values for teams that have no rank / no submission.
# The rank sentinel is a number (not the string '99999') so the rank columns keep a numeric dtype.
TeamData['PublicLeaderboardRank'] = TeamData['PublicLeaderboardRank'].fillna(99999)
TeamData['PrivateLeaderboardRank'] = TeamData['PrivateLeaderboardRank'].fillna(99999)
# Dates use a far-future placeholder, matching the one already used for MedalAwardDate.
TeamData['ScoreFirstSubmittedDate'] = TeamData['ScoreFirstSubmittedDate'].fillna('1/1/2199')
TeamData['LastSubmissionDate'] = TeamData['LastSubmissionDate'].fillna('1/1/2199')

因为提交Id是一个随机值，对统计来说每个值都不一样而且无意义，在此我们去掉提交Id信息

In [None]:
cols = ['PublicLeaderboardSubmissionId','PrivateLeaderboardSubmissionId']
TeamData.drop(cols,axis = 1,inplace = True)

TeamData.head(10)

## 拼接数据

In [None]:
TeamData.index.name = 'TeamId'

MergeValue = pd.merge(TeamData,TeamMembershipData,on = 'TeamId',how='right')
MergeValue.drop('TeamName',inplace=True,axis =1)

MergeValue.info()

In [None]:
MergeValue.to_csv('TeamRelation.csv')

In [None]:
MergeValue.isnull().any()

In [None]:
# One group per (member, team leader) pair.
groupresult =MergeValue.groupby(['UserId','TeamLeaderId'])
# Keep the smallest Medal code seen for each pair; note that MergeValue is rebound here
# from the merged team table to this much smaller summary frame.
MergeValue = pd.DataFrame(groupresult.Medal.min()).reset_index()
# PerformanceTier of every user, re-labelled so the user id can be joined against TeamLeaderId.
TierMerge = pd.DataFrame(UserData['PerformanceTier'])
TierMerge.index.name= 'TeamLeaderId'

# Right join: every (member, leader) pair is kept and receives the leader's tier when the leader is found.
result = pd.merge(TierMerge,MergeValue,on='TeamLeaderId',how='right').rename(columns={'PerformanceTier':'LeaderTier'})

# The leader id itself is no longer needed once the tier has been attached.
result.drop('TeamLeaderId',axis = 1,inplace = True)

result.LeaderTier.isnull().value_counts()

# Drop the rows that still contain a missing value (e.g. no tier found for the leader).
result.dropna(inplace=True)

## 保存数据

In [None]:
result.to_csv('TeamLeaderTier.csv')

# 为什么要对比赛做聚类？

比赛是Kaggle 的核心，不同比赛对参赛者的能力考察也各有侧重（诸如NLP、图像识别等领域的比赛对参赛者的要求也都不相同）；我们希望在最后进行预测的数据中能够考虑到比赛的信息。

但如何在最后的信息中包含比赛信息呢？

如果针对每个比赛都在最后的表中添加一列信息（参与该比赛的用户表示为1，未参与的表示为0），则在最后的表中会用很多无用信息。

我们原本寄希望于按照CompetitionData中的Tags信息对比赛进行分类，但是发现该信息是人为自由设置的，而且大部分Tags信息是缺失的，所以按照Tags对比赛进行分类是不可行的。

而最后经过考虑我们认为可以对比赛进行聚类，按照比赛的数学特征来对比赛进行分类，此举是因为选手选择比赛也不一定完全受比赛领域的影响，甚至还有更多的影响因素，诸如参与人数（热门比赛的从众心理）、有无Kernel（可以查看高分选手的kernel）。

于是我们认为可以通过统计学的方法进行度量，在数学上对比赛进行聚类，得出比赛分类，这也是为什么我们对比赛进行特征工程并聚类的原因。

# 导入数据

In [None]:
competitionData = pd.read_csv('../input/Competitions.csv')
competitionData.head(5)

In [None]:
competitionData.info()

In [None]:
competitionData.columns

In [None]:
competitionData.HostSegmentTitle.value_counts()

# 比赛信息特征工程

## 查看列属性

Competitions表中有极多列，但大多数列都是没什么用处的，而鉴别有用没用则需要我们人为去判断。

In [None]:
competitionData.columns

## 查看缺失列的信息

In [None]:
competitionData.isnull().any()

### ForumId

In [None]:
competitionData.ForumId.isnull().value_counts()

In [None]:
# Missing forum ids become 0. Assign the filled column back instead of calling
# fillna(inplace=True) through attribute access, which recent pandas treats as a chained
# operation that may not reach competitionData.
competitionData['ForumId'] = competitionData['ForumId'].fillna(0)

在ForumId（论坛Id）中，有8个缺失值，我们用0来填补缺失值。

In [None]:
competitionData.Subtitle.isnull().value_counts()

In [None]:
competitionData.Subtitle.value_counts()

In [None]:
competitionData.drop('Subtitle',axis =1 ,inplace = True)

因Subtitle基本为长句子，目前不具备分析的能力，在特征工程中我们将其去掉。

### OrganizationId

In [None]:
competitionData.OrganizationId.isnull().value_counts()

In [None]:
# 0 stands for "no organisation". The column is reassigned rather than filled in place through
# attribute access, which is not guaranteed to modify competitionData on recent pandas.
competitionData['OrganizationId'] = competitionData['OrganizationId'].fillna(0)

在组织Id中，有115缺失，我们用0来填补缺失值，表示无组织。

### HostName

In [None]:
competitionData.HostName.value_counts()

In [None]:
competitionData.HostName.isnull().value_counts()

In [None]:
# The literal string 'None' marks competitions without a host name. Reassigning the column
# avoids the chained fillna(inplace=True) pattern, which may act on a temporary object.
competitionData['HostName'] = competitionData['HostName'].fillna('None')

在HostName（主办者名称）中，有346个缺失值，我们用'None'字符来填补缺失值，表示没有主办者。

### LeaderboardDisplayFormat

In [None]:
competitionData.LeaderboardDisplayFormat.value_counts()

该属性无缺失值，我们不做处理

### EvaluationAlgorithmName 

In [None]:
competitionData.EvaluationAlgorithmName.value_counts()

In [None]:
competitionData.EvaluationAlgorithmName.isnull().value_counts()

In [None]:
# The literal string 'None' marks competitions without an evaluation algorithm name.
# Reassign the column instead of relying on a chained fillna(inplace=True).
competitionData['EvaluationAlgorithmName'] = competitionData['EvaluationAlgorithmName'].fillna('None')

该列有41缺失，我们填补'None'字符，表示无的意思

### ValidationSetName&setValue

In [None]:
competitionData.ValidationSetName.value_counts()

In [None]:
competitionData.ValidationSetName.isnull().value_counts()

In [None]:
# The literal string 'None' marks competitions without a validation set name.
# Reassign the column instead of relying on a chained fillna(inplace=True).
competitionData['ValidationSetName'] = competitionData['ValidationSetName'].fillna('None')

该列有305缺失，我们填补'None'字符，表示无的意思

### MaxTeamsize

In [None]:
competitionData.MaxTeamSize.value_counts()

In [None]:
competitionData.MaxTeamSize.isnull().value_counts()

In [None]:
# -1 marks an unknown maximum team size.
# Reassign the column instead of relying on a chained fillna(inplace=True).
competitionData['MaxTeamSize'] = competitionData['MaxTeamSize'].fillna(-1)

我们将此缺失值表示为-1

### RewardType

In [None]:
competitionData.RewardType.value_counts()

In [None]:
competitionData.RewardType.isnull().value_counts()

In [None]:
# The literal string 'None' marks competitions without a reward type.
# Reassign the column instead of relying on a chained fillna(inplace=True).
competitionData['RewardType'] = competitionData['RewardType'].fillna('None')

该列有267缺失，我们填补'None'字符，表示无的意思

### RewardQuantity

In [None]:
competitionData.RewardQuantity.isnull().value_counts()

In [None]:
competitionData.RewardQuantity.value_counts()

In [None]:
# -1 marks an unknown reward quantity.
# Reassign the column instead of relying on a chained fillna(inplace=True).
competitionData['RewardQuantity'] = competitionData['RewardQuantity'].fillna(-1)

该列有119缺失，我们填补-1

### EnableDate & DeadLineDate

这一操作是对时间变量进行处理，将`30/10/2018`的日期型数据转化为`20184`其中4代表的是第四季度，将时间颗粒度由日期变为季度。并转化为str型数据

In [None]:
# Reduce both timestamps to a "<year><quarter>" string such as '20184'.
# NOTE(review): assumes the raw values are month-first ('MM/DD/YYYY HH:MM:SS'), so that after
# splitting on '/' and whitespace column 0 is the month and column 2 the year — confirm against
# Competitions.csv.
enableTimeData = competitionData.EnabledDate.str.split(r'/|\s',expand=True)[[0,2]]
# Month -> quarter: 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4 (month//4+1 put October in quarter 3).
enableTimeData[0] = (enableTimeData[0].astype('int') - 1)//3 + 1
competitionData.EnabledDate = enableTimeData[2].astype('str')+enableTimeData[0].astype('str')

deadLineTimeData = competitionData.DeadlineDate.str.split(r'/|\s',expand=True)[[0,2]]
# The deadline quarter must come from the deadline's own month, not from enableTimeData.
deadLineTimeData[0] = (deadLineTimeData[0].astype('int') - 1)//3 + 1
competitionData.DeadlineDate = deadLineTimeData[2].astype('str')+deadLineTimeData[0].astype('str')

competitionData.head(5)

## 去掉没有意义的列

到此，我们填补了必要的缺失值，对于仍有缺失的列，我们经过讨论无太多意义，将其删除。

In [None]:
dropCols = competitionData.columns[competitionData.isnull().any()]

In [None]:
competitionData.drop(dropCols,inplace=True,axis =1)

In [None]:
competitionData.isnull().any()

## 查看Object列

In [None]:
competitionData.head(5)

因为Slug与Title在所有数据中都不相同，无统计意义，所以删掉这两列值

In [None]:
competitionData.dtypes

In [None]:
competitionData.HostSegmentTitle.value_counts()

In [None]:
# Identifier-like and free-text columns that are not used as clustering features.
uninformative_cols = ['ForumId','OrganizationId','Slug','Title','EvaluationAlgorithmAbbreviation','EvaluationAlgorithmName','HostName']
competitionDataResult = competitionData.drop(columns=uninformative_cols)

# The year+quarter strings become plain integers.
quarter_cols = ['EnabledDate','DeadlineDate']
competitionDataResult[quarter_cols] = competitionDataResult[quarter_cols].astype('int64')

# Show the columns that are still stored as object dtype.
is_object_col = (competitionDataResult.dtypes=='object').values
ObjectColumns = competitionDataResult.columns[is_object_col]
competitionDataResult[ObjectColumns]

# 离散数据编码

为使模型能够处理离散型数据，我们用热编码对数据进行处理

## 数据整理

In [None]:
competitionDataResult.info()

查看不同类型的数据列，此举是为了将那些有数值意义但却被归为str型数据的数据挑选出来。

## 查看int型数据

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='int64']].head(5)

## 查看float数据

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='float64']].head(5)

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='float64']].UserRankMultiplier.value_counts()

## 查看bool型数据

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='bool']].head(5)

In [None]:
competitionDataResult.drop('CompetitionTypeId',axis = 1,inplace=True)

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='bool']] = competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='bool']].astype('int')

## 查看object行数据

In [None]:
competitionDataResult[competitionDataResult.columns[competitionDataResult.dtypes=='object']]

## HotXCode

In [None]:
competitionDataResult.set_index('Id',inplace=True)

In [None]:
competitionDataResult

In [None]:
HotCodeData = pd.get_dummies(competitionDataResult)

In [None]:
HotCodeData

# Save Competition Data

In [None]:
HotCodeData.to_csv('CompetitionData_hotCode.csv')

# 比赛信息聚类思路

在比赛信息的聚类中，我们按照以下流程对热编码后的数据进行处理
1. 选择数据
2. 归一化
3. PCA
4. 模型选择

    4.1 KMeans
    
    4.2 层次聚类
    
    4.3 DBSCAN

## 不足之处

在本次聚类操作中，我们导入了热编码后的数据，一共有515个特征，但是经过特征选择之后保留了38列数据，之后在经过PCA，只剩下了23列数据，说实话维度还是很高。

之后我打算做一组对照组，减少特征数量。

# 导入需要的包

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import datetime as dt

# 导入数据

In [None]:
CompetitionData_HotCode = pd.read_csv('CompetitionData_hotCode.csv',index_col=0)
CompetitionData_HotCode.head(5)

In [None]:
CompetitionData_HotCode.info()

查看开始导入的数据，我们发现数据有514列,肯定要在开始时对特征进行选择。

# Select Feature

我们用方差阈值 $p(1-p)$（取 $p=0.9$）做特征选择：对于0/1特征，如果同一取值出现在超过90%的样本中（即另一取值的出现比例不足0.1），我们认为该特征对整体来说是无意义的，在进行特征选择的时候就将其去掉

In [None]:
from sklearn.feature_selection import VarianceThreshold

# 0.9 * (1 - 0.9) is the variance of a 0/1 column whose majority value covers 90% of the rows;
# every column whose variance does not exceed that threshold is removed.
sel = VarianceThreshold(threshold=(0.90*(1-0.90)))
CompetitionData_selFeature = pd.DataFrame(
    sel.fit_transform(CompetitionData_HotCode),
    index=CompetitionData_HotCode.index,
    # keep the names of the surviving features so the selection can be inspected
    columns=CompetitionData_HotCode.columns[sel.get_support()],
)

CompetitionData_selFeature.head(5)

# Normalize

## 对数据进行Normalize

In [None]:
from sklearn.preprocessing import scale

CompetitionData_Scale = pd.DataFrame(scale(CompetitionData_selFeature.values),index=CompetitionData_selFeature.index)

CompetitionData_Scale

# PCA

## 保留0.9特征的降维

我们对归一化后的数据进行PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# A float n_components asks PCA to keep as many components as are needed to explain
# that share (here 90%) of the variance.
pca = PCA(n_components=0.90)
pca.fit(CompetitionData_Scale)

# n_components_ is the number of components actually kept after fitting;
# n_components would only echo the 0.9 passed to the constructor.
print(pca.n_components_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

pca.explained_variance_.shape

CompetitionData_Scale_PCA = pd.DataFrame(pca.transform(CompetitionData_Scale),index=CompetitionData_Scale.index)

CompetitionData_Scale_PCA.head(5)

查看目前数据的大小

In [None]:
CompetitionData_Scale_PCA.shape

## 三维可视化

### 将数据降到三维

In [None]:
pca = PCA(n_components=3)
pca.fit(CompetitionData_Scale)

In [None]:
print(pca.n_components)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

In [None]:
CompetitionData_PCA_3D = pd.DataFrame(pca.transform(CompetitionData_Scale),index=CompetitionData_Scale.index)

In [None]:
CompetitionData_PCA_3D.head(5)

In [None]:
CompetitionData_PCA_3D.shape

### 可视化表示

In [None]:
from mpl_toolkits.mplot3d import Axes3D 

In [None]:
fig = plt.figure(figsize=(7,5))
# Axes3D(fig) is no longer attached to the figure automatically on recent matplotlib releases
# (the plot comes out empty); request a 3D subplot from the figure instead.
ax = fig.add_subplot(projection='3d')
x = CompetitionData_PCA_3D[0]
y = CompetitionData_PCA_3D[1]
z = CompetitionData_PCA_3D[2]
# Plot in x, y, z order so each component sits on the axis that carries its label.
ax.scatter(x,y,z)
ax.set_zlabel('Z', fontdict={'size': 15, 'color': 'red'})
ax.set_ylabel('Y', fontdict={'size': 15, 'color': 'red'})
ax.set_xlabel('X', fontdict={'size': 15, 'color': 'red'})
plt.show()


# KMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans
from scipy.spatial.distance import cdist

## K-Means K与Compactness的可视化函数

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

def Visual_K_and_Meandisortions(X_value,start,end,n_repeats=40):
    """Plot the mean distortion of MiniBatchKMeans for every k in ``range(start, end)``.

    For each k the model is refitted ``n_repeats`` times; the distortion of one fit is the mean
    Euclidean distance between every sample and its closest cluster centre, and the value
    recorded for k is the average over the refits.

    Parameters
    ----------
    X_value : array-like or DataFrame exposing ``.shape``
        Samples to cluster, one row per sample.
    start, end : int
        Candidate cluster counts are ``range(start, end)`` (``end`` excluded).
    n_repeats : int, default 40
        Number of refits averaged per k.

    Returns
    -------
    tuple
        ``(last fitted MiniBatchKMeans or None when the range is empty, the pyplot module,
        list with one mean distortion per k)``.
    """
    start_time = dt.datetime.now()

    K = range(start,end)
    meandisortions  = []
    Minkmeams = None  # stays None when K is empty, so the return below cannot hit an unbound name
    for k in K:
        costFunction = 0
        for _ in range(n_repeats):
            Minkmeams = MiniBatchKMeans(n_clusters=k)
            Minkmeams.fit(X_value)
            # distance of every sample to its nearest centre, averaged over the samples
            nearest = np.min(cdist(X_value,Minkmeams.cluster_centers_,'euclidean'),axis = 1)
            costFunction += nearest.sum()/X_value.shape[0]
        costFunction/=n_repeats
        meandisortions.append(costFunction)
        print('%d=='%k,end='=')

    end_time = dt.datetime.now()
    # total_seconds() also counts whole days; .seconds alone would wrap around after 24 hours
    print(int((end_time - start_time).total_seconds()))
    plt.plot(K,meandisortions,'gx-')
    plt.xlabel('k')
    plt.ylabel('Mean Cost Function')
    plt.show()
    return Minkmeams,plt,meandisortions

## 迭代出最优K值

在该步骤中，我们希望通过改变k值大小进行迭代，迭代出函数内部分离性最小的k值

### Meandisortions

In [None]:
Visual_K_and_Meandisortions(CompetitionData_Scale_PCA,1,40)

## Silhouette Coefficient

在评价模型的好坏时，我们选择轮廓系数作为评价指标，其公式如下：

$s(i)=\frac{b(i)-a(i)}{\max\{a(i),b(i)\}}$

其中 $a(i)$ 是样本 $i$ 到同簇其他样本的平均距离，$b(i)$ 是样本 $i$ 到最近的其他簇中样本的平均距离。轮廓系数介于[-1,1]之间，轮廓系数越接近1表示分类越合理，越接近-1表示分类越不合理

### K-Means与Silhouette Coefficient

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

def Visual_K_and_silhouette(X_value,start,end,n_repeats=70):
    """Plot the mean silhouette score of MiniBatchKMeans for every k in ``range(start, end)``.

    For each k the model is refitted ``n_repeats`` times and the silhouette scores of the
    refits are averaged.

    Parameters
    ----------
    X_value : array-like or DataFrame
        Samples to cluster, one row per sample.
    start, end : int
        Candidate cluster counts are ``range(start, end)`` (``end`` excluded). The callers in
        this notebook start at 2.
    n_repeats : int, default 70
        Number of refits averaged per k.

    Returns
    -------
    tuple
        ``(last fitted MiniBatchKMeans or None when the range is empty, the pyplot module,
        list with one mean silhouette score per k)``.
    """
    start_time = dt.datetime.now()

    K = range(start,end)
    KMeans_silhouette_re  = []
    Minkmeams = None  # stays None when K is empty, so the return below cannot hit an unbound name
    for k in K:
        KMeans_silhouette = 0
        for _ in range(n_repeats):
            Minkmeams = MiniBatchKMeans(n_clusters=k)
            Minkmeams.fit(X_value)
            KMeans_silhouette += silhouette_score(X_value,Minkmeams.labels_)
        KMeans_silhouette /= n_repeats
        KMeans_silhouette_re.append(KMeans_silhouette)
        print('%d=='%k,end='=')

    end_time = dt.datetime.now()
    # total_seconds() also counts whole days; .seconds alone would wrap around after 24 hours
    print(int((end_time - start_time).total_seconds()))
    plt.plot(K,KMeans_silhouette_re,'gx-')
    plt.xlabel('k')
    plt.ylabel('silhouette_score')
    plt.show()
    return Minkmeams,plt,KMeans_silhouette_re

In [None]:
Mimi,_,KMeans_silhouette_re = Visual_K_and_silhouette(CompetitionData_Scale_PCA,2,40)

In [None]:
Mimi,_,KMeans_silhouette_re = Visual_K_and_silhouette(CompetitionData_Scale_PCA,2,40)

# 层次聚类

## Agglomerative Cluster & Slihoutte Conficient 可视化

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def Visual_K_and_AggCluster_silhouette(X_value,start,end):
    """Plot the silhouette score of average-linkage agglomerative clustering per cluster count.

    One model is fitted for every k in ``range(start, end)`` and scored with the Euclidean
    silhouette coefficient.

    Returns
    -------
    tuple
        ``(silhouette score of the last k, the pyplot module, list with one score per k)``.
    """
    started_at = dt.datetime.now()

    candidate_ks = range(start,end)
    scores = []
    for n_clusters in candidate_ks:
        model = AgglomerativeClustering(linkage='average',n_clusters=n_clusters)
        model.fit(X_value)
        AgglomerativeCluster_silhouette = silhouette_score(
            X_value,
            model.labels_,
            metric='euclidean',
            random_state=0,
        )
        scores.append(AgglomerativeCluster_silhouette)
        print('%d=='%n_clusters,end='=')

    elapsed = dt.datetime.now() - started_at
    print(elapsed.seconds)

    plt.plot(candidate_ks,scores,'gx-')
    plt.xlabel('k')
    plt.ylabel('silhouette_score')
    plt.show()
    return AgglomerativeCluster_silhouette,plt,scores

## 通过迭代选出簇的数目

In [None]:
_,_,AgglomerativeCluster_silhouette_re = Visual_K_and_AggCluster_silhouette(CompetitionData_Scale_PCA,2,40)

### 对二者可视化表示

In [None]:
x= range(2,40)
y1 = KMeans_silhouette_re
y2 = AgglomerativeCluster_silhouette_re

plt.plot(x,y1,'ro-')
plt.plot(x,y2,'bx-',label = 'AgglomerativeCluster')

plt.legend((u'KMeans', u'Agglomerative Cluster'),loc='best')# sets our legend for our graph.


plt.xlabel('Cluster number')
plt.ylabel('Silhouette')
plt.show()

# AP聚类

In [None]:
from sklearn.cluster import AffinityPropagation

AP = AffinityPropagation()
AP.fit(CompetitionData_Scale_PCA)

In [None]:
AP.cluster_centers_indices_ 

In [None]:
AP.labels_

In [None]:
import seaborn as sns 
sns.distplot(AP.labels_)

AP聚类聚出了99个类，虽然效果很强，但不符合我们的要求。

# BIRCH聚类算法（层次聚类+树）

In [None]:
from sklearn.cluster import Birch
bir = Birch(branching_factor=10,n_clusters=10)
bir.fit(CompetitionData_Scale_PCA)

In [None]:
bir.predict(CompetitionData_Scale_PCA)


## 可视化表示层次聚类

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

plt.figure(figsize=(15,7))

# Linkage matrix built from the pairwise Euclidean distances between the samples
row_clusters = linkage(pdist(CompetitionData_Scale_PCA,metric='euclidean'),method='complete')# pdist returns the condensed distance matrix that linkage expects
# Draw the hierarchical clustering tree, labelling the leaves with the competition index
row_dendr = dendrogram(row_clusters,labels=CompetitionData_Scale_PCA.index)

plt.ylabel('Euclidean distance')

In [None]:
AgglomerativeCluster = AgglomerativeClustering(linkage='average',n_clusters=8)
AgglomerativeCluster.fit(CompetitionData_Scale_PCA)

# DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
dbscan = DBSCAN(eps=0.3,min_samples=6)
dbscan.fit(CompetitionData_Scale_PCA)
print(dbscan)
print(dbscan.labels_)

## DBSCAN可视化表示

### 将数据降到三维

In [None]:
VisualData_PCA_Model = PCA(n_components=3)
VisualData_PCA_Model.fit(CompetitionData_Scale)
VisualData = VisualData_PCA_Model.transform(CompetitionData_Scale)
VisualData_3D = pd.DataFrame(VisualData,index=CompetitionData_Scale.index)
VisualData_3D.head(5)

In [None]:
def DBSCAN_Result_Visual(dbscan,VisualData,CompetitionData_DBSCAN_To_Visual):
    """Draw a 3D scatter of ``VisualData`` coloured by the labels of a fitted DBSCAN model.

    Parameters
    ----------
    dbscan : fitted estimator exposing ``labels_``
        One label per row of ``CompetitionData_DBSCAN_To_Visual``.
    VisualData : DataFrame
        Columns 0, 1 and 2 hold the three plotted coordinates, rows in the same order as the
        data the model was fitted on.
    CompetitionData_DBSCAN_To_Visual : DataFrame
        The data the model was fitted on; only its length is used, as a consistency check.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of rows of the fitted data.
    """
    labels = np.asarray(dbscan.labels_)
    if len(labels) != len(CompetitionData_DBSCAN_To_Visual):
        raise ValueError('dbscan.labels_ and the clustered data differ in length')

    colors = np.array(['#DC143C','#FF69B4','#DA70D6','#EE82EE','#8B008B','#9400D3','#8A2BE2','#6A5ACD','#F8F8FF','#191970','#4169E1',
                       '#778899','#F0F8FF','#87CEEB','#B0E0E6','#E1FFFF','#00FFFF','#008B8B','#20B2AA','#00FA9A','#3CB371','#90EE90',
                       '#32CD32','#008000','#7CFC00','#F5F5DC','#FFFFE0','#BDB76B','#F0E68C','#DAA520','#F5DEB3','#FFEFD5','#FAEBD7',
                       '#FFE4C4','#CD853F','#D2691E','#A0522D','#FF4500','#FFE4E1','#F08080','#FF0000','#8B0000','#F5F5F5','#C0C0C0',
                       '#696969',
    ])
    # Wrap around the palette: label -1 still maps to the last colour, exactly as plain negative
    # indexing did, and labels beyond the palette size no longer raise IndexError.
    c = colors[labels % len(colors)]

    # Plotting. Axes3D(fig) is not attached to the figure automatically on recent matplotlib,
    # so the 3D axes are requested from the figure.
    fig = plt.figure(figsize=(10,7))
    ax = fig.add_subplot(projection='3d')
    x = VisualData[0]
    y = VisualData[1]
    z = VisualData[2]
    ax.scatter(z,x,y,c=c)
    plt.show()


In [None]:
DBSCAN_Result_Visual(dbscan,VisualData_3D,CompetitionData_Scale_PCA)

### 测试Slihoutte随参数的改变

制作参数集

In [None]:
# Parameter grid for the DBSCAN sweep: every eps in 0.1 .. 1.5 (step 0.1) crossed with every
# min_samples in 1 .. 14. Column 0 holds eps, column 1 holds min_samples (as float); rows are
# ordered by min_samples first, then eps. Built in one shot rather than by concatenating a
# frame per loop iteration.
eps_candidates = np.arange(0.1,1.6,0.1)
min_samples_candidates = np.arange(1,15)
result = pd.DataFrame({
    0: np.tile(eps_candidates,len(min_samples_candidates)),
    1: np.repeat(min_samples_candidates,len(eps_candidates)).astype('float64'),
})
result.index.name = 'Id'

声明函数

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D 

def Silhoutte_DBSCAN(list,values):
    """Run DBSCAN for every row of a parameter grid and plot the silhouette score per setting.

    Parameters
    ----------
    list : DataFrame
        Parameter grid with a 0..n-1 index named 'Id'; column 0 holds eps and column 1 holds
        min_samples. (The name shadows the builtin; it is kept so existing calls keep working.)
    values : array-like or DataFrame
        Samples to cluster.

    Returns
    -------
    DataFrame
        The grid with an extra 'Slihoutte' column; settings whose labelling cannot be scored
        get 0.
    """
    result = []
    for i in range(0,len(list)):
        # min_samples is stored as a float in the grid; DBSCAN expects an integer count
        dbscan = DBSCAN(eps = list[0][i],min_samples = int(list[1][i]))
        dbscan.fit(values)
        try:
            result.append(silhouette_score(values,dbscan.labels_,metric='euclidean'))
        except ValueError:
            # silhouette_score rejects labelings it cannot score (e.g. a single label);
            # record 0 for those settings, but let every other error surface
            result.append(0)
    MergeValue = pd.DataFrame(result,index=list.index)
    MergeValue.index.name='Id'
    MergeValue.rename(columns={0:'Slihoutte'},inplace=True)
    VisualValue = pd.merge(list,MergeValue,on='Id')

    fig = plt.figure(figsize=(7,5))
    # Axes3D(fig) is not attached to the figure automatically on recent matplotlib
    ax = fig.add_subplot(projection='3d')
    x = VisualValue[0]
    y = VisualValue[1]
    z = VisualValue.Slihoutte
    ax.scatter3D(x,y,z)
    return VisualValue


In [None]:
TDVisualData = Silhoutte_DBSCAN(result,CompetitionData_Scale_PCA)

# 模型选择

## 模型及参数的选择

从Silhouette参数看我们选择层次聚类作为聚类方法，k= 4

In [None]:
AgglomerativeCluster = AgglomerativeClustering(linkage='average',n_clusters=4)
AgglomerativeCluster.fit(CompetitionData_Scale_PCA)
AgglomerativeCluster_silhouette = silhouette_score(CompetitionData_Scale_PCA,AgglomerativeCluster.labels_,random_state=0)

pd.DataFrame(AgglomerativeCluster.labels_)[0].value_counts()

In [None]:
CompetitionData_Scale_PCA.iloc[10:20]

In [None]:
list(range(6))

In [None]:
# Feed the data to Birch in chunks and average the silhouette score of the chunks.
birchCluster2 = Birch(n_clusters=4)
CHUNK_SIZE = 1000
chunk_scores = []
# range(0, n, CHUNK_SIZE) also yields the final, shorter chunk that len(...)//1000 used to skip
for i, chunk_start in enumerate(range(0,len(CompetitionData_Scale_PCA),CHUNK_SIZE)):
    temp = CompetitionData_Scale_PCA.iloc[chunk_start:chunk_start+CHUNK_SIZE]
    birchCluster2.partial_fit(temp)
    # as in the original cell, labels_ is paired with the chunk that was just passed in
    chunk_labels = birchCluster2.labels_
    # the silhouette score needs at least 2 and fewer than n_samples distinct labels
    if 1 < len(set(chunk_labels)) < len(temp):
        chunk_scores.append(silhouette_score(temp,chunk_labels,metric='euclidean'))
    print(i,end='-')
# Divide by the number of chunks that were actually scored (the old divisor was one too large).
silhouette = sum(chunk_scores)/len(chunk_scores) if chunk_scores else float('nan')
silhouette

In [None]:
birchCluster1 = Birch(n_clusters=4)
birchCluster1.fit(CompetitionData_Scale_PCA)
silhouette_score(CompetitionData_Scale_PCA,birchCluster1.labels_,metric='euclidean')

In [None]:
birchCluster = Birch(n_clusters=4)
birchCluster.partial_fit(CompetitionData_Scale_PCA.iloc[21:30])

In [None]:
birchCluster.labels_

~~但是层次聚类并没有讲类分离开，不符合我们现在的状况。~~

我们注意到一个点成了一类，那么是不是样本中有异常值点？

对此我们打算在去异常值之后再做一次聚类。

In [None]:
from sklearn.ensemble import IsolationForest
# contamination=0.005: the share of samples the forest is asked to flag as outliers (0.5%)
iTree = IsolationForest(contamination=0.005)
iTree.fit(CompetitionData_Scale_PCA)
# One prediction per competition, aligned on the competition index
DropVlaue = pd.DataFrame(iTree.predict(CompetitionData_Scale_PCA),index = CompetitionData_Scale_PCA.index)
# Despite its name, this frame keeps the rows predicted as 1 (IsolationForest.predict returns 1 for
# inliers and -1 for outliers), i.e. the data with the flagged outliers removed.
CompetitionData_Scale_PCA_OutLiners = CompetitionData_Scale_PCA[DropVlaue[0]==1]

In [None]:
CompetitionData_HotCode[DropVlaue[0]==-1]

查看了一下异常值，实话我不认为该把他们去掉

之后我们再看一下聚类的情况

In [None]:
# Re-run both sweeps on the same, outlier-free data so the two curves plotted below are comparable
# (the KMeans sweep previously still used the unfiltered CompetitionData_Scale_PCA).
_,_,AgglomerativeCluster_silhouette_re = Visual_K_and_AggCluster_silhouette(CompetitionData_Scale_PCA_OutLiners,2,40)
_,_,KMeans_silhouette_re = Visual_K_and_silhouette(CompetitionData_Scale_PCA_OutLiners,2,40)

In [None]:
x= range(2,40)
y1 = KMeans_silhouette_re
y2 = AgglomerativeCluster_silhouette_re

plt.plot(x,y1,'ro-')
plt.plot(x,y2,'bx-',label = 'AgglomerativeCluster')

plt.legend((u'KMeans', u'Agglomerative Cluster'),loc='best')# sets our legend for our graph.


plt.xlabel('Cluster number')
plt.ylabel('Silhouette')
plt.show()

In [None]:
AgglomerativeCluster = AgglomerativeClustering(linkage='average',n_clusters=4)
AgglomerativeCluster.fit(CompetitionData_Scale_PCA_OutLiners)
AgglomerativeCluster_silhouette = silhouette_score(CompetitionData_Scale_PCA_OutLiners,AgglomerativeCluster.labels_,random_state=0)

pd.DataFrame(AgglomerativeCluster.labels_)[0].value_counts()

最后综合轮廓系数以及我们分类的目的来看，我们选择层次聚类，K=4

In [None]:
AgglomerativeCluster = AgglomerativeClustering(linkage='average',n_clusters=4)
AgglomerativeCluster.fit(CompetitionData_Scale_PCA)

In [None]:
MergeValue = pd.DataFrame(AgglomerativeCluster.labels_,index = CompetitionData_Scale_PCA.index).rename(columns = {0:'Labels'})
ClusterResult_Save = pd.merge(CompetitionData_Scale_PCA,MergeValue,on = 'Id')

In [None]:
ClusterResult_Save.Labels.value_counts()

In [None]:
AgglomerativeCluster.children_

In [None]:
CompetitionData_HotCode[ClusterResult_Save.Labels==2]

## 聚类结果可视化

In [None]:
ClusterResult_Save

In [None]:
colors = np.array(['#DC143C','#FF69B4','#DA70D6','#EE82EE','#8B008B','#9400D3','#8A2BE2','#6A5ACD','#F8F8FF','#191970','#4169E1',
                   '#778899','#F0F8FF','#87CEEB','#B0E0E6','#E1FFFF','#00FFFF','#008B8B','#20B2AA','#00FA9A','#3CB371','#90EE90',
                   '#32CD32','#008000','#7CFC00','#F5F5DC','#FFFFE0','#BDB76B','#F0E68C','#DAA520','#F5DEB3','#FFEFD5','#FAEBD7',
                   '#FFE4C4','#CD853F','#D2691E','#A0522D','#FF4500','#FFE4E1','#F08080','#FF0000','#8B0000','#F5F5F5','#C0C0C0',
                   '#696969',
])

In [None]:
# Keep Labels numeric: ClusterResult_Save is written to CompetitionClusterResult.csv further down
# and the label is reused there as the competition type, so it must stay a cluster id rather than
# a hex colour. The colours go into their own Series, which also makes this cell safe to re-run.
ClusterResult_Colors = pd.Series(colors[ClusterResult_Save.Labels.to_numpy()],index=ClusterResult_Save.index,name='Color')

In [None]:
fig = plt.figure(figsize=(7,5))
# Axes3D(fig) is no longer attached to the figure automatically on recent matplotlib releases
# (the plot comes out empty); request a 3D subplot from the figure instead.
ax = fig.add_subplot(projection='3d')
x = ClusterResult_Save[0]
y = ClusterResult_Save[1]
z = ClusterResult_Save[2]
cluster_labels = ClusterResult_Save.Labels
# Labels holds either numeric cluster ids or already-mapped hex colours, depending on what the
# colour-mapping cell above did; numeric ids are translated through the palette here.
if pd.api.types.is_numeric_dtype(cluster_labels):
    c = colors[cluster_labels.to_numpy()]
else:
    c = cluster_labels
ax.scatter(z,x,y,c=c)
plt.show()


## Save 

In [None]:
ClusterResult_Save.to_csv('CompetitionClusterResult.csv')

# 导入需要的包 

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
ls

# 导入数据

In [None]:
UserData = pd.read_csv('../input/Users.csv',index_col=0)
UserFollow = pd.read_csv('../input/UserFollowers.csv',index_col=0)
UserOrganization = pd.read_csv('../input/UserOrganizations.csv',index_col=0)
UserAchievements = pd.read_csv('../input/UserAchievements.csv',index_col=0)

## UserData information

In [None]:
UserData.info()

In [None]:
UserData.head(5)

In [None]:
UserFollow.info()

因为UserData的Id为用户Id，直接作为index会影响拼接，所以我们重置一下，将用户的index重置，重新生成一个索引。

In [None]:
UserData.reset_index(inplace=True)

## User Origanization information

In [None]:
UserOrganization.head(5)

In [None]:
UserOrganization.info()

## User followers information

In [None]:
UserFollow.head(5)

In [None]:
UserFollow.info()

## User achievements information

In [None]:
UserAchievements.head(5)

In [None]:
UserAchievements.info()

# Data Cleaning
对各类数据进行Clean

## UserData

查看有缺失值的列

In [None]:
UserData.isnull().any()

查看有缺失值的行

In [None]:
UserData[UserData.UserName.isnull()]

In [None]:
# 查看了前5行的数据
UserData[UserData.DisplayName.isnull()].head(5) 

In [None]:
UserData[UserData.DisplayName.isnull()].shape

综合数据特点来看，我们决定删除`UserName`的缺失行，对`DisplayName`的缺失行填补`None`

In [None]:
UserData.DisplayName.fillna('None',inplace=True)

In [None]:
UserData.dropna(inplace=True)

After Clean

In [None]:
UserData.isnull().any()

In [None]:
UserData.head(5)

### ReguisterNumber 时间的可视化表示

In [None]:
TimeAndNumber = pd.DataFrame(UserData['RegisterDate'])

## register Date

将注册时间分解为`年` `月` `日`

In [None]:
regTimeVal = UserData.RegisterDate.str.split('/',expand=True)
regTimeVal.head(5)

In [None]:
UserData.head(5)

In [None]:
# Month -> quarter: months 1-3 map to 1, 4-6 to 2, 7-9 to 3 and 10-12 to 4
# (month//4+1 put April-July in quarter 2 and only December in quarter 4).
# NOTE(review): assumes RegisterDate is month-first ('MM/DD/YYYY'), so column 0 is the month — confirm.
regTimeVal[0] = ((regTimeVal[0].astype('int') - 1)//3 + 1).astype('object')

In [None]:
regTimeVal.shape

In [None]:
UserData.RegisterDate = regTimeVal[2].astype('str')+regTimeVal[0].astype('str')

## UserFollowers

In [None]:
UserFollow.head(5)

因为我们最后要对`UserData`进行分析，所以在此处我们清洗数据时先按`UserId`进行分组（groupby），之后得出User的关注者数目，之后拼接回原表。

In [None]:
# Count the rows of UserFollow per UserId; the count becomes the per-user follow figure.
# NOTE(review): the result is labelled "followers" downstream, but whether UserId is the follower
# or the followed account depends on the UserFollowers.csv schema, which is not visible here — confirm.
userFollowGroupResult = UserFollow.groupby('UserId')
userFollowNumber = pd.DataFrame(userFollowGroupResult.size())

`userFollowNumber`就是我们要的关注数据

In [None]:
# 进行一些微调，比如Index和列名
userFollowNumber.reset_index(inplace=True)
userFollowNumber.rename(columns={'UserId':'Id',0:'FollowersNumber'},inplace=True)

userFollowNumber.head(5)

因为UserData的Id为用户Id，直接作为index会影响拼接，所以我们重置一下，将用户的index重置，重新生成一个索引。

In [None]:
UserData_add_followerNum = pd.merge(UserData,userFollowNumber,on='Id',how='left')

In [None]:
UserData_add_followerNum.head(5)

In [None]:
UserData_add_followerNum.isnull().any()

In [None]:
UserData_add_followerNum.FollowersNumber.fillna(0,inplace=True)
UserData_add_followerNum.shape

## UserAchievements

In [None]:
UserAchievements.head(5)

In [None]:
UserAchievements[UserAchievements.UserId==368].head(5)

In [None]:
UserAchievements[UserAchievements.AchievementType=='Scripts'].head(5)

In [None]:
UserAchievements.UserId.value_counts().sort_values(ascending=True).head(5)

通过以上数据我们可以看到`UserAchievements`数据由以下特点：
- 每个UserId在UserAchievements中都有三个对应值，这三个对应值表示用户在Scripts、Competitions、Discussion三个方面的成就。
- 只有Competitions的Achievements有排名，别的都没有
- Achievements呈现了用户取得对应Tier的时间。

因为我们最后需要的是用户个人的数据，所以我们对数据进行如下处理：

In [None]:
cols = ['UserId','CurrentRanking','HighestRanking','TotalGold','TotalSilver','TotalBronze']
UserAchievements_to_process = UserAchievements[cols]
UserAchievements_to_process.head(5)

In [None]:
# Collapse the rows of each user into one by summing every selected column, then expose the
# user id as an 'Id' column so the frame can be joined with the user table.
UserAchievements_complete = (
    UserAchievements_to_process
    .groupby('UserId')
    .sum()
    .reset_index()
    .rename(columns={'UserId':'Id'})
)
UserAchievements_complete.head(5)

In [None]:
UserData_Follow_Achievements = pd.merge(UserAchievements_complete,UserData_add_followerNum,on = 'Id',how='left')
UserData_Follow_Achievements.head(5)

In [None]:
UserData_Follow_Achievements.isnull().any()

In [None]:
UserData_Follow_Achievements[UserData_Follow_Achievements.UserName.isnull()]

In [None]:
UserData_Follow_Achievements.dropna(inplace=True)

In [None]:
UserData_Follow_Achievements.isnull().any()

## User Origanization

In [None]:
UserOrganization.head(5)

In [None]:
UserOrganization.rename(columns={'UserId':'Id'},inplace=True)
UserOrganization.index.name = 'Index'

In [None]:
UserData_Follow_Organiz_Ach = pd.merge(UserData_Follow_Achievements,UserOrganization,on='Id',how='left')

In [None]:
UserData_Follow_Organiz_Ach_dropDup = UserData_Follow_Organiz_Ach.drop_duplicates(subset=['Id'],keep='first')

In [None]:
UserData_Follow_Organiz_Ach_dropDup.shape

In [None]:
UserData_Follow_Organiz_Ach_dropDup[UserData_Follow_Organiz_Ach_dropDup.OrganizationId.isnull()].head(5)

In [None]:
UserData_Follow_Organiz_Ach_dropDup[UserData_Follow_Organiz_Ach_dropDup.JoinDate.isnull()].shape

In [None]:
# Users without an organisation get OrganizationId 0 and a placeholder JoinDate.
# The frame is reassigned rather than filled column by column with inplace=True: it is the result
# of drop_duplicates(), and chained in-place fills are not guaranteed to reach it on recent pandas.
UserData_Follow_Organiz_Ach_dropDup = UserData_Follow_Organiz_Ach_dropDup.fillna({'OrganizationId':0,'JoinDate':'1/1/2000'})

In [None]:
UserData_Follow_Organiz_Ach_dropDup.shape

In [None]:
# UserData_Follow_Organiz_Ach_dropDup.to_csv('userDataAfterEngineering.csv')

##  Competition

In [None]:
CompetitionInf = pd.read_csv('CompetitionClusterResult.csv',index_col=0)
RelationInf = pd.read_csv('TeamRelation.csv',index_col=0)

In [None]:
CompetitionInf.head(5)

In [None]:
CompetitionInf.index.name='CompetitionId'
CompetitionInf.rename(columns={'Labels':'CompetitionType'},inplace=True)
ComMergeValue = pd.DataFrame(CompetitionInf.CompetitionType).reset_index()

In [None]:
ComMergeValue

In [None]:
RelationInf.head(5)

In [None]:
RelationInf.UserId.value_counts()

In [None]:
UserFEResult = pd.merge(ComMergeValue,RelationInf,on='CompetitionId',how='right')

In [None]:
UserFEResult

In [None]:
UserFEResult.isnull().any()

In [None]:
UserFEGroup = UserFEResult.groupby(['UserId','CompetitionType'])

UserFEGroup_DF = pd.DataFrame(UserFEGroup.size()).reset_index()

UserFEGroup_DF.head(5)

In [None]:
# One row per user, one column per competition type, each cell the number of participations.
# values=0 selects the count column produced by size(); without it pivot returns two-level
# columns (0, <type>), which cannot be merged cleanly with the flat user table below.
Merge_To_User_Value = (
    UserFEGroup_DF
    .pivot(index='UserId',columns='CompetitionType',values=0)
    .fillna(0)
    # readable, string-typed column names in the merged table and the exported CSV
    .add_prefix('CompetitionType_')
)
Merge_To_User_Value.head(5)

In [None]:
Merge_To_User_Value.shape

### 拼接User和Competition

In [None]:
Merge_To_User_Value.index.name='Id'

In [None]:
MergeResult = pd.merge(UserData_Follow_Organiz_Ach_dropDup,Merge_To_User_Value,how= 'left',on = 'Id')

In [None]:
MergeResult.dropna(inplace=True)

In [None]:
MergeResult.shape

## Relation

In [None]:
TeamTier = pd.read_csv('TeamLeaderTier.csv',index_col=0)
TeamTier = TeamTier[['LeaderTier','UserId']]
TeamTier.rename(columns={'UserId':'Id'},inplace=True)

In [None]:
TeamTier

In [None]:
LastResult = pd.merge(MergeResult,TeamTier,how = 'left',on = 'Id')

In [None]:
LastResult.isnull().any()

In [None]:
LastResult.fillna(-1,inplace=True)

In [None]:
LastResult.to_csv('User_Relation_Competition.csv')