In [1]:
from numpy import genfromtxt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

In [2]:
path = "../data/clustering_data.tsv"

In [3]:
df = pd.read_csv(path, delimiter="\t")

In [4]:
df.columns

Index(['energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
       'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
       'vitamin-c_100g', 'calcium_100g', 'iron_100g'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,energy_100g,fat_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,vitamin-a_100g,vitamin-c_100g,calcium_100g,iron_100g
count,264103.0,246947.0,232582.0,143310.0,144097.0,246656.0,247992.0,202664.0,262908.0,258532.0,258485.0,137570.0,140891.0,141086.0,140485.0
mean,1141.734,12.744683,5.128742,0.073474,0.02007,32.016909,15.964292,2.863981,7.079194,2.018999,0.795024,0.000397,0.023367,0.125177,0.003741
std,6411.181,17.586484,8.016221,1.540159,0.358053,29.705222,22.280895,12.821718,8.401575,127.518382,50.208649,0.073274,2.23626,3.317841,0.216165
min,0.0,0.0,0.0,-3.57,0.0,0.0,-17.86,-6.7,-800.0,0.0,0.0,-0.00034,-0.0021,0.0,-0.00026
25%,377.0,0.0,0.0,0.0,0.0,5.91,1.3,0.0,0.7,0.0635,0.025,0.0,0.0,0.0,0.0
50%,1100.0,5.0,1.79,0.0,0.0,20.41,5.7,1.5,4.76,0.58,0.228346,0.0,0.0,0.035,0.00101
75%,1674.0,20.0,7.14,0.0,0.02,58.18,24.0,3.6,10.0,1.3716,0.54,0.000107,0.0037,0.106,0.0024
max,3251373.0,714.29,550.0,369.0,95.238,2916.67,3520.0,5380.0,430.0,64312.8,25320.0,26.7,716.9811,694.737,50.0


In [6]:
# Number of all values
df.size

3984285

In [7]:
df.shape

(265619, 15)

In [8]:
df.isnull().values.any()

True

In [9]:
# NaN values
# 22.989745964457864
df.isnull().sum().sum()

915977

In [10]:
# Number of non NAN values
df.notnull().sum().sum()

3068308

In [11]:
# Number of outliers
df[df > 100].notnull().sum().sum()

246170

In [12]:
# Imputation of missing values fill in missing values with median
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imputer = imp.fit_transform(df.as_matrix())

In [13]:
# Scaling data with outliers
robust_scale = preprocessing.robust_scale(imputer, axis=0)

In [14]:
robust_scale

array([[  0.88399072,   1.26380697,   4.22397476, ...,   0.0214    ,
         -2.91666667,   1.03703704],
       [  0.65042537,   0.68954424,  -0.28233438, ...,   0.        ,
          3.        ,   1.03703704],
       [  1.1136891 ,   2.79571046,   0.56309148, ...,   0.        ,
          9.        ,  15.2962963 ],
       ..., 
       [ -0.83449343,  -0.25737265,  -0.25078864, ...,   0.        ,
          0.        ,   0.        ],
       [ -0.85073473,  -0.26809651,  -0.28233438, ...,   0.        ,
         -2.91666667,  -3.74074074],
       [  0.76720804,  -0.26809651,   0.        , ...,   0.        ,
          0.        ,   0.        ]])

In [15]:
# Convert NumpyArray to Pandas DataFrame
columns = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
       'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
       'vitamin-c_100g', 'calcium_100g', 'iron_100g']
clean_data = pd.DataFrame(data=robust_scale,columns=columns)

In [16]:
clean_data.describe()

Unnamed: 0,energy_100g,fat_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,vitamin-a_100g,vitamin-c_100g,calcium_100g,iron_100g
count,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0
mean,0.032092,0.386073,0.461116,0.039642,0.010888,0.216583,0.47701,0.3469,0.24683,1.097481,1.097557,0.000205,0.012394,3.991515,5.34967
std,4.944207,0.915404,1.195843,1.131881,0.26391,0.578334,1.079142,3.738222,0.899123,98.578534,98.57853,0.052733,1.628714,201.539948,582.268342
min,-0.850735,-0.268097,-0.282334,-3.57,0.0,-0.410128,-1.172723,-2.733333,-86.533333,-0.454474,-0.454474,-0.00034,-0.0021,-2.916667,-4.703704
25%,-0.556071,-0.262735,-0.282334,0.0,0.0,-0.276098,-0.203584,-0.5,-0.436559,-0.399624,-0.399624,0.0,0.0,-0.333333,-0.444444
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.443929,0.737265,0.717666,0.0,0.0,0.723902,0.796416,0.5,0.563441,0.600376,0.600376,0.0,0.0,0.666667,0.555556
max,2513.745553,38.031635,86.468454,369.0,95.238,58.198734,174.927825,1792.833333,45.724731,50393.52766,50393.52766,26.7,716.9811,57891.833333,185181.444444


In [17]:
# Scaling sparse data with maxabscale
maxabs_scaled = preprocessing.maxabs_scale(imputer, axis=0)

In [18]:
# Convert NumpyArray to Pandas DataFrame
columns = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g',
       'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'vitamin-a_100g',
       'vitamin-c_100g', 'calcium_100g', 'iron_100g']
maxabs_data = pd.DataFrame(data=maxabs_scaled,columns=columns)

In [19]:
maxabs_data.describe()

Unnamed: 0,energy_100g,fat_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,vitamin-a_100g,vitamin-c_100g,calcium_100g,iron_100g
count,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0,265619.0
mean,0.000351,0.01708,0.00857,0.000107,0.000114,0.010693,0.004342,0.000472,0.008819,3.1e-05,3.1e-05,8e-06,1.7e-05,0.000119,4.9e-05
std,0.001966,0.023901,0.013785,0.003067,0.002771,0.009868,0.006159,0.002085,0.010452,0.001956,0.001956,0.001975,0.002272,0.003481,0.003144
min,0.0,0.0,0.0,-0.009675,0.0,0.0,-0.005074,-0.001245,-1.0,0.0,0.0,-1.3e-05,-3e-06,0.0,-5e-06
25%,0.000117,0.00014,0.0,0.0,0.0,0.002287,0.000457,0.0,0.000875,1e-06,1e-06,0.0,0.0,4.5e-05,1.8e-05
50%,0.000338,0.007,0.003255,0.0,0.0,0.006998,0.001619,0.000279,0.00595,9e-06,9e-06,0.0,0.0,5e-05,2e-05
75%,0.000515,0.02625,0.011527,0.0,0.0,0.019349,0.006165,0.000558,0.0125,2.1e-05,2.1e-05,0.0,0.0,6.2e-05,2.3e-05
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5375,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Clustering with Kmeans
k_means = KMeans(n_clusters=8).fit(maxabs_data)

In [21]:
centers = k_means.cluster_centers_

In [22]:
for c in centers:
    print(c)

[  1.31031559e-04   3.44570101e-03   2.31460649e-03   4.91840392e-05
   2.63860312e-05   4.49239290e-03   2.31339075e-03   2.91491689e-04
   3.18472818e-03   3.45257972e-05   3.45288206e-05   5.28392564e-06
   1.88821216e-05   6.85667502e-05   3.19206128e-05]
[  5.13805406e-04   2.91262613e-02   1.30871456e-02   2.85371457e-04
   1.60014188e-04   1.58404620e-02   5.89109279e-03   5.39954858e-04
   8.21710062e-03   1.91410603e-05   1.91413503e-05   2.28990251e-06
   4.52031993e-06   8.03637589e-05   5.22063140e-05]
[  1.01184578e-03   1.24643361e-01   2.16206085e-02   3.84272389e-04
   7.37643680e-05   8.72392590e-04   9.95388693e-04   3.19922305e-04
   1.52077983e-03   6.80807790e-06   6.80803962e-06   1.51491595e-06
   1.65845359e-07   4.99832707e-05   2.36795753e-05]
[  4.32928418e-04   3.77822221e-03   1.72703815e-03   6.23824377e-05
   1.73802228e-05   2.51890966e-02   9.44724683e-03   6.28993613e-04
   6.96221841e-03   4.20117171e-05   4.20122007e-05   3.26644621e-06
   4.43721892

In [23]:
from sklearn.decomposition import PCA

In [24]:
pca = PCA(n_components=2)

In [25]:
reduced_data = PCA(n_components=2).fit_transform(maxabs_data)

In [26]:
k_pca = KMeans(n_clusters=8).fit(reduced_data)

In [27]:
k_pca

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [28]:
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [29]:
# Obtain labels for each point in mesh. Use last trained model.
Z = k_pca.predict(np.c_[xx.ravel(), yy.ravel()])

In [30]:
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

<matplotlib.image.AxesImage at 0x102f839e8>

In [31]:
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = k_pca.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')

<matplotlib.text.Text at 0x102f667b8>

In [32]:
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

([], <a list of 0 Text yticklabel objects>)

In [33]:
plt.show()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte