In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file under the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from scipy.stats import kruskal
from sklearn.cluster import SpectralClustering, AffinityPropagation
from sklearn import metrics
from ipywidgets import interact

import os
# Print the full path of each file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

## Import, clean and explore the data

In [None]:
# Read the survey table from the Kaggle input directory and preview its first rows.
csv_path = '/kaggle/input/depression-in-married-couples/Dataset 1.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()

The data set is complete: there are no missing values.

Let's drop the `No` column, because it is just an identification number.

In [None]:
# Drop the "No" identifier column.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because "No" has already been removed from `data`.
data = data.drop(columns="No", errors="ignore")

In [None]:
# Standardize every column with StandardScaler, keeping the original row index
# and column labels.
scaled_values = StandardScaler().fit_transform(data)
data_std = pd.DataFrame(scaled_values, index=data.index, columns=data.columns.values.tolist())

# Heatmap of the pairwise correlations between all columns, on a fixed [-1, 1]
# color scale so that the sign and strength of each correlation are comparable.
sns.set_style(style='dark')
fig, ax = plt.subplots(figsize=(16, 9))
heatmap = sns.heatmap(data_std.corr(), vmin=-1, vmax=1, annot=False, cmap='viridis', ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)
plt.show()

We can observe two regions of correlation that are uncorrelated with each other:
* The one with gender, working status, etc., which is poorly correlated
* The one of B1, B2, ..., B21, which is strongly correlated

Let's run a Kruskal test to check this. Then we can apply clustering to see whether we are able to distinguish some regions.

In [None]:
# Two column groups used in the rest of the analysis:
#   Bi        - the item columns "B1" ... "B21"
#   col_names - the first four columns of `data`
#               (presumably respondent attributes such as gender and working
#               status - TODO confirm against the dataset description)
Bi = ["B{}".format(i) for i in range(1, 22)]
col_names = data.columns.values.tolist()[:4]

In [None]:
# Kruskal-Wallis H-test between the first four columns and blocks of B items.
#
# * scipy documents the samples as one-dimensional, so every group of columns is
#   flattened into a single 1-D sample before testing. Passing 2-D frames is
#   handled differently across scipy versions and does not reliably yield the
#   single (statistic, pvalue) pair that the summary below expects.
# * The B items are split into consecutive, non-overlapping blocks so that each
#   of B1..B21 enters exactly one test (no item skipped, none used twice).
attr_sample = data[col_names].values.ravel()
bi_blocks = [Bi[0:4], Bi[4:8], Bi[8:12], Bi[12:16], Bi[16:21]]
K1, K2, K3, K4, K5 = [
    kruskal(attr_sample, data[block].values.ravel()) for block in bi_blocks
]

# Mean statistic / p-value over the five tests. This is only a descriptive
# summary; an average of p-values is not itself a valid combined p-value.
kruskal_results = [K1, K2, K3, K4, K5]
mean_statistic = np.mean([res[0] for res in kruskal_results])
mean_pvalue = np.mean([res[1] for res in kruskal_results])
print(K1,"\n",K2,"\n",K3,"\n",K4,'\n',K5,"\n",
      "The Kruskal test mean gives:\n Kurskal result(statistic={}, pvalue={})".format(mean_statistic, mean_pvalue))

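We observe that there is obviously a relationship between those two groups, with a p-value of 0.0. Note that the Kruskal–Wallis test checks whether the groups share the same distribution, so strictly speaking it shows that the two groups differ rather than that they are correlated.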

## Clustering

In [None]:
# Fit affinity propagation on the data. The fitted model is kept in `af`; the
# following cell reads the number of clusters it found.
ap_model = AffinityPropagation(preference=-50, random_state=42)
af = ap_model.fit(data)

In [None]:
# Read the fitted affinity-propagation results: the indices of the cluster
# centers, the cluster label of every row, and the number of clusters found.
cluster_centers_ID = af.cluster_centers_indices_
clust_labels = af.labels_
n_clust = len(cluster_centers_ID)

# Print the three pieces of information, one per line.
print(
    " Cluster Centers ID: {}".format(cluster_centers_ID), "\n",
    "Clust labels: ", clust_labels,
    "\nNumber of clusters: {}".format(n_clust),
)

In [None]:
# Spectral clustering with as many clusters as affinity propagation found.
# The count is taken from `n_clust` rather than repeated as a hard-coded
# literal, so the two clustering steps cannot silently drift apart.
# random_state is fixed for reproducibility.
clust_esp = SpectralClustering(n_clusters=n_clust, random_state=42).fit_predict(data)

# Number of rows assigned to each cluster label.
(unique, counts) = np.unique(clust_esp, return_counts=True)
print(" Clust label along its number of elements in it:\n",
      list(zip(unique, counts)))

Almost all values are in the first cluster

In [None]:
def clust_scatter(column1,column2):
    """Scatter two columns of `data` against each other, colored by cluster.

    column1 -- name of the column drawn on the x axis
    column2 -- name of the column drawn on the y axis

    Reads the notebook-level `data` and `clust_esp`; shows the figure and
    returns None.
    """
    with plt.style.context("ggplot"):
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(data[column1], data[column2], c=clust_esp, s=100, cmap="viridis")
        ax.set_xlabel(column1)
        ax.set_ylabel(column2)
        ax.set_title('Clustering visualization for ' + " and ".join((column1, column2)))
        plt.show()

In [None]:
# Interactive view: choose any two columns from the dropdowns to see how the
# clusters are spread over that pair of variables.
column_choices = data.columns.values.tolist()
interact(clust_scatter, column1=column_choices, column2=column_choices)

We found 15 clusters, which represent 15 depression scores.

In [None]:
# 3D scatter: x = row-wise sum of the `col_names` columns, y = row-wise sum of
# the B items, z and color = spectral cluster label.
# NOTE(review): the z axis is labeled "Depression score" but holds the cluster
# label, which is an arbitrary identifier - confirm this is the intended reading.
attr_total = data[col_names].sum(axis=1)
bi_total = data[Bi].sum(axis=1)

with plt.style.context("ggplot"):
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(projection="3d")
    ax.scatter3D(attr_total, bi_total, clust_esp, c=clust_esp, s=100, cmap="viridis")
    # Look at the cloud side-on (no elevation), rotated 45 degrees.
    ax.view_init(elev=0, azim=45)
    ax.set_xlabel("Attributes")
    ax.set_ylabel("Bi")
    ax.set_zlabel("Depression score")
    ax.set_title('Depression Score based on attributes and Bi')

plt.show()

In [None]:
# The most depressed individuals.
# Cluster labels are arbitrary identifiers, so the highest label is not
# necessarily the most depressed group. Instead, rank the clusters by their mean
# total over the B items and show the members of the top-ranked cluster.
# NOTE(review): assumes B1..B21 are depression-inventory items where a higher
# score means more severe symptoms - confirm against the dataset description.
bi_score_total = data[Bi].sum(axis=1)
cluster_mean_score = bi_score_total.groupby(clust_esp).mean()
most_depressed_cluster = cluster_mean_score.idxmax()
data[clust_esp == most_depressed_cluster]

We can see that they are females with a Bachelor's degree and a flirt marriage, with some differences in demographic effects, working status and status of having a child.