<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Intégration-des-données" data-toc-modified-id="Intégration-des-données-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Intégration des données</a></span></li><li><span><a href="#Data-encoding-of-object-columns" data-toc-modified-id="Data-encoding-of-object-columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data encoding of object columns</a></span></li><li><span><a href="#Missing-values" data-toc-modified-id="Missing-values-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Missing values</a></span></li><li><span><a href="#ACP" data-toc-modified-id="ACP-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>ACP</a></span></li><li><span><a href="#Création-de-cluster" data-toc-modified-id="Création-de-cluster-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Création de cluster</a></span><ul class="toc-item"><li><span><a href="#KMeans-non-normalisé" data-toc-modified-id="KMeans-non-normalisé-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>KMeans non normalisé</a></span></li><li><span><a href="#KMeans-normalisé" data-toc-modified-id="KMeans-normalisé-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>KMeans normalisé</a></span></li><li><span><a href="#Clustering-en-prenant-les-features-de-l-'ACP" data-toc-modified-id="Clustering-en-prenant-les-features-de-l-'ACP-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Clustering en prenant les features de l 'ACP</a></span></li></ul></li><li><span><a href="#Creation-de-la-colonne-cluster" data-toc-modified-id="Creation-de-la-colonne-cluster-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Creation de la colonne cluster</a></span></li><li><span><a href="#Analyse-des-clusters" data-toc-modified-id="Analyse-des-clusters-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Analyse des clusters</a></span><ul class="toc-item"><li><span><a href="#Musique" data-toc-modified-id="Musique-7.1"><span 
class="toc-item-num">7.1&nbsp;&nbsp;</span>Musique</a></span></li><li><span><a href="#MOVIE" data-toc-modified-id="MOVIE-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>MOVIE</a></span></li><li><span><a href="#HOBBIES" data-toc-modified-id="HOBBIES-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>HOBBIES</a></span></li><li><span><a href="#PHOBIAS-&amp;-HEALTH-HABITS" data-toc-modified-id="PHOBIAS-&amp;-HEALTH-HABITS-7.4"><span class="toc-item-num">7.4&nbsp;&nbsp;</span>PHOBIAS &amp; HEALTH HABITS</a></span></li><li><span><a href="#PERSONALITY" data-toc-modified-id="PERSONALITY-7.5"><span class="toc-item-num">7.5&nbsp;&nbsp;</span>PERSONALITY</a></span></li><li><span><a href="#SPENDING-HABITS" data-toc-modified-id="SPENDING-HABITS-7.6"><span class="toc-item-num">7.6&nbsp;&nbsp;</span>SPENDING HABITS</a></span></li><li><span><a href="#DEMOGRAPHICS" data-toc-modified-id="DEMOGRAPHICS-7.7"><span class="toc-item-num">7.7&nbsp;&nbsp;</span>DEMOGRAPHICS</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Apply the seaborn theme first, then override individual matplotlib defaults.
sns.set(rc={'figure.figsize': (35, 15)})
matplotlib.rcParams.update({
    'figure.figsize': (15, 7),  # supersedes the size passed to sns.set above
    'xtick.labelsize': 17,
    'ytick.labelsize': 17,
    'axes.titlesize': 17,
})
from mpl_toolkits import mplot3d

In [2]:
import copy  
def encoding(data):
    """Return a copy of ``data`` with every object-dtype column integer-encoded.

    For each object column the distinct non-missing values are sorted (by
    their string form) and numbered 0, 1, 2, ... in that order, so the same
    input always produces the same codes. Missing values are left as NaN
    instead of receiving a code of their own, which keeps them visible to a
    later ``dropna()`` / ``fillna()`` step.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column order. Non-object
        columns are untouched.
    """
    df = data.copy()
    for col in df.select_dtypes(include=['object']).columns:
        # Sorting fixes the value -> code mapping; iterating a set of strings
        # would give an order that can change from one interpreter run to the next.
        categories = sorted(df[col].dropna().unique(), key=str)
        codes = {value: code for code, value in enumerate(categories)}
        # Values that are not keys of ``codes`` (the missing ones) map to NaN.
        df[col] = df[col].map(codes)
    return df

def plot_acp(data,pca,i):
    """Plot individual and cumulative explained-variance ratios of a fitted PCA.

    Parameters
    ----------
    data : array-like
        The data ``pca`` was fitted on. Kept for backward compatibility; the
        number of components is read from ``pca`` itself.
    pca : fitted estimator exposing ``explained_variance_ratio_``
    i : number
        Zoom factor: the x axis spans ``[0, n_components / i]``.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure.
    """
    ratios = np.asarray(pca.explained_variance_ratio_)
    n_components = len(ratios)
    positions = np.arange(n_components)    # bar positions on the categorical axis
    component_numbers = positions + 1      # 1-based component labels

    fig, ax = plt.subplots(figsize=(14, 5))
    # NOTE: sns.set applies the seaborn theme globally, not only to this figure.
    sns.set(font_scale=1)
    ax.step(positions, ratios.cumsum(), where='mid',
            label='cumulative explained variance')
    # x / y are passed by keyword: recent seaborn releases reject positional data.
    sns.barplot(x=component_numbers, y=ratios, alpha=0.5, color='g',
                label='individual explained variance', ax=ax)

    # Label only every fifth component. The labels are built from the component
    # numbers rather than parsed back from the tick text.
    ax.set_xticks(positions)
    ax.set_xticklabels([str(num) if num % 5 == 0 else '' for num in component_numbers],
                       rotation=90)
    # Set the limits last: setting ticks may widen the view to include all ticks.
    ax.set_xlim(0, n_components / i)

    ax.set_ylabel('Explained variance', fontsize=14)
    ax.set_xlabel('Principal components', fontsize=14)
    ax.legend(loc='best', fontsize=13)

# Intégration des données

In [3]:
# Load the companion file columns.csv (presumably the description of the survey columns - TODO confirm).
column = pd.read_csv('../input/columns.csv')

In [4]:
# Preview the first rows of the columns file.
column.head()

In [5]:
# (rows, columns) of the columns file.
column.shape

In [6]:
# Load responses.csv: the frame that is encoded, reduced and clustered below.
data = pd.read_csv('../input/responses.csv')

In [7]:
# Preview the first rows of the raw responses.
data.head()

In [8]:
# (rows, columns) of the raw responses, before encoding and row removal.
data.shape

# Data encoding of object columns

In [9]:
# Names of the object-dtype columns, i.e. the ones encoding() converts to integer codes.
data.select_dtypes(include=['object']).columns

In [10]:
# Replace the raw frame by its encoded copy (object columns become numeric codes).
data = encoding(data)

In [11]:
# Preview the frame after encoding.
data.head()

# Missing values

In [12]:
# Drop every row that still contains a missing value
# (filling with 0 was the alternative tried before).
data = data.dropna()

In [13]:
# (rows, columns) left after removing incomplete rows.
data.shape

# ACP

In [14]:
from sklearn.decomposition import PCA

# Fit a PCA (default settings) on the encoded, unscaled data, then project
# the same rows onto the principal axes.
pca = PCA().fit(data)
pca_samples = pca.transform(data)

In [15]:
# Explained variance of the unscaled PCA; the last argument (4) limits the
# x axis to one quarter of the number of columns.
plot_acp(data,pca,4)

In [16]:
# Samples projected on the first two principal components (unscaled PCA).
pc1, pc2 = pca_samples[:, 0], pca_samples[:, 1]
plt.scatter(pc1, pc2)
plt.title('Distribution des deux premières features de l ACP')
plt.show()

In [17]:
# Samples projected on the first three principal components (unscaled PCA).
ax = plt.axes(projection='3d')
pc1, pc2, pc3 = (pca_samples[:, k] for k in range(3))
ax.scatter3D(pc1, pc2, pc3)
ax.set_title('Distribution des trois premières features de l ACP')
plt.show()

**Normalisation**

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the encoded data; the result is a NumPy array.
data_scale = StandardScaler().fit(data).transform(data)

In [19]:
from sklearn.decomposition import PCA

# Same PCA as before, this time fitted on the standardised data.
pcaS = PCA().fit(data_scale)
pca_samples_scale = pcaS.transform(data_scale)

In [20]:
# Explained variance of the PCA fitted on standardised data; a factor of 1
# keeps the full range of components on the x axis.
plot_acp(data_scale,pcaS,1)

In [21]:
# Samples projected on the first two principal components (standardised PCA).
pc1_s, pc2_s = pca_samples_scale[:, 0], pca_samples_scale[:, 1]
plt.scatter(pc1_s, pc2_s)
plt.title('Distribution des deux premières features de l ACP')
plt.show()

In [22]:
# Samples projected on the first three principal components (standardised PCA).
ax = plt.axes(projection='3d')
pc1_s, pc2_s, pc3_s = (pca_samples_scale[:, k] for k in range(3))
ax.scatter3D(pc1_s, pc2_s, pc3_s)
ax.set_title('Distribution des trois premières features de l ACP')
plt.show()

# Création de cluster

## KMeans non normalisé

In [23]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Average silhouette score of K-Means on the unscaled data for k = 2..9.
# random_state pins the centroid initialisation so that the scores are the
# same on every run of the notebook.
for n_clusters in range(2,10):
    kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30, random_state=42)
    kmeans.fit(data)
    clusters = kmeans.predict(data)
    silhouette_avg = silhouette_score(data, clusters)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

In [24]:
# 3-cluster K-Means on the unscaled data; random_state makes the labels
# (and therefore the cluster sizes shown below) reproducible.
kmeans = KMeans(init='k-means++', n_clusters = 3, n_init=30, random_state=42)
kmeans.fit(data)
clusters_ok = kmeans.predict(data)
# Number of rows assigned to each cluster.
pd.Series(clusters_ok).value_counts()

## KMeans normalisé

In [25]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Average silhouette score of K-Means on the standardised data for k = 2..9.
# random_state pins the centroid initialisation so that the scores are the
# same on every run of the notebook.
for n_clusters in range(2,10):
    kmeans = KMeans(init='k-means++', n_clusters = n_clusters, n_init=30, random_state=42)
    kmeans.fit(data_scale)
    clusters = kmeans.predict(data_scale)
    silhouette_avg = silhouette_score(data_scale, clusters)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

## Clustering en prenant les features de l 'ACP

In [26]:
# NOTE(review): DBSCAN is imported but not used in this cell.
from sklearn.cluster import DBSCAN

# Silhouette score of a 3-cluster K-Means fitted on the first i principal
# components of the unscaled PCA, for i = 1..29. list_score[i - 1] holds the
# score obtained with i components.
list_score=[]
for i in range(1,30):
    projected = pca_samples[:, 0:i]
    # random_state keeps the scores identical from one run to the next.
    kmeans = KMeans(init='k-means++', n_clusters = 3, n_init=30, random_state=42)
    kmeans.fit(projected)
    clusters = kmeans.predict(projected)
    list_score.append(silhouette_score(projected, clusters))

In [27]:
# The x values are derived from list_score, so the plot stays aligned with
# the scoring loop even if the number of tested components changes.
component_counts = np.arange(1, len(list_score) + 1)
best_score = max(list_score)

fig, ax = plt.subplots()
ax.plot(component_counts, list_score)
ax.set_xticks(component_counts)
ax.set_xlabel('Nombre de composantes')
ax.set_ylabel('Silhouette score')
ax.set_title('Silhouette score en fonction du nombre de composantes choisies')
plt.show()
# list_score[k] is the score for k + 1 components, hence the + 1.
print('MAX :',round(best_score,2),'de silhouette score pour les',
      list_score.index(best_score)+1,'premières composantes.')

In [28]:
# Selected clusters: built with KMeans (3 clusters) on the first principal component

In [29]:
# Final clustering: 3-cluster K-Means on the first principal component only
# (kept 2-D with the 0:1 slice, as scikit-learn expects a 2-D array).
# random_state makes the cluster labels stable across runs, so the per-cluster
# analysis that follows refers to the same groups every time.
first_component = pca_samples[:, 0:1]
kmeans = KMeans(init='k-means++', n_clusters = 3, n_init=30, random_state=42)
kmeans.fit(first_component)
clusters_ok = kmeans.predict(first_component)
# Number of rows assigned to each cluster.
pd.Series(clusters_ok).value_counts()

# Creation de la colonne cluster

In [30]:
# Attach the cluster label of each row as a new 'cluster' column. assign()
# builds a new frame instead of writing into the existing one in place.
data = data.assign(cluster=clusters_ok)
data.head()

In [31]:
# Mean of every column within each cluster (one row per cluster label).
data_cluster = data.groupby('cluster').mean()
data_cluster

# Analyse des clusters

## Musique

In [32]:
# Per-cluster means of columns 0-18 (shown under the "Musique" heading):
# one group of bars per column, one bar per cluster.
ax = data_cluster.iloc[:, 0:19].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## MOVIE

In [33]:
# Per-cluster means of columns 19-30 (shown under the "MOVIE" heading).
ax = data_cluster.iloc[:, 19:31].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## HOBBIES

In [34]:
# Per-cluster means of columns 31-46 (first chart under the "HOBBIES" heading).
ax = data_cluster.iloc[:, 31:47].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

In [35]:
# Per-cluster means of columns 47-62 (second chart under the "HOBBIES" heading).
ax = data_cluster.iloc[:, 47:63].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## PHOBIAS & HEALTH HABITS

In [36]:
# Per-cluster means of columns 63-75 (shown under the "PHOBIAS & HEALTH HABITS" heading).
ax = data_cluster.iloc[:, 63:76].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## PERSONALITY

In [37]:
# Per-cluster means of columns 76-89 (first chart under the "PERSONALITY" heading).
ax = data_cluster.iloc[:, 76:90].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

In [38]:
# Per-cluster means of columns 90-104 (second chart under the "PERSONALITY" heading).
ax = data_cluster.iloc[:, 90:105].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

In [39]:
# Per-cluster means of columns 105-118 (third chart under the "PERSONALITY" heading).
ax = data_cluster.iloc[:, 105:119].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

In [40]:
# Per-cluster means of columns 119-132 (fourth chart under the "PERSONALITY" heading).
ax = data_cluster.iloc[:, 119:133].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## SPENDING HABITS

In [41]:
# Per-cluster means of columns 133-139 (shown under the "SPENDING HABITS" heading).
ax = data_cluster.iloc[:, 133:140].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

## DEMOGRAPHICS

In [42]:
# Per-cluster means of columns 140-142 (first chart under the "DEMOGRAPHICS" heading).
ax = data_cluster.iloc[:, 140:143].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()

In [43]:
# Per-cluster means of the remaining columns, from position 143 onwards
# (second chart under the "DEMOGRAPHICS" heading).
ax = data_cluster.iloc[:, 143:].transpose().plot.bar()
ax.tick_params(labelsize=15)
plt.show()