In [1]:
from __future__ import print_function
from datahandling import access_db, get_equip_names, get_dtype_names
from tinydb import Query
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.decomposition import PCA
from time import time
from pandas import DataFrame
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt

In [2]:
# Database used to cache the assembled feature matrix: it is written with
# db_X.insert(...) and read back with db_X.all() further down.
# NOTE(review): the meaning of the arguments (3, True) is defined by
# datahandling.access_db and is not visible here - confirm.
db_X = access_db(3, True)

In [3]:
# Source database that is searched for the per-sample ingredient values and
# measurement values below.
# NOTE(review): the meaning of the arguments (0, True) is defined by
# datahandling.access_db and is not visible here - confirm.
sv_db = access_db(0, True)

In [4]:
# NOTE(review): this empties db_X on every run, so the `if not db_X.all()`
# guard in the populate cell never finds cached data and X is rebuilt each
# time (about 95 s according to the recorded output). Skip this cell when the
# cached matrix should be reused.
db_X.purge()

In [5]:
# Query object used to build the search conditions below
# (e.g. Q.sample_number == ..., Q.ingredient == ...).
Q = Query()

In [6]:
# Ingredient names looked up for every sample (Q.ingredient == ing); each one
# becomes a 'MassFrac <name>' column of X, in this order.
ingredients = ['PVC', 'filler', 'FR', 'stabiliser', 'DINP', 'LDH', 'spherical_filler']

In [7]:
# Equipment names present in the source database; the populate cell iterates
# over them to collect the measured data types per equipment.
equip_names = get_equip_names(sv_db)
equip_names

[u'colour', u'LOI', u'MCC', u'thermomat', u'tensile', u'rheomix', u'ConeCal']

In [8]:
# Remove all records of sample 35 from the source database; the populate loop
# skips the same sample number.
# NOTE(review): the reason for excluding sample 35 is not recorded here, and
# if sv_db is persisted this deletion is permanent - confirm this is intended.
sv_db.remove((Q.sample_number == 35))

[]

Populate X from the single-value database and store it in a new database (`db_X`)

In [9]:
# Assemble the feature matrix X (one row per sample, one column per ingredient
# mass fraction or measured data type) from the single-value database and
# cache it in db_X together with the column descriptions.
X = []
tm = time()

# Data types that are left out of X.
d_types_skip = ['epsilon_break_%',
                'epsilon_max_%',
                'sigma_break_MPa',
                'sigma_max_MPa',
                'E_t_MPa',
                'int_of_abs_err'
               ]

# Only rebuild when nothing is cached in db_X yet.
if not db_X.all():
    # The measurement columns do not depend on the sample, so resolve them once
    # instead of calling get_dtype_names for every sample/equipment pair. This
    # also guarantees that every row uses the same column order as
    # d_type_descr.
    measurement_cols = []
    for en in equip_names:
        for dtn in get_dtype_names(sv_db, en):
            if dtn not in d_types_skip:
                measurement_cols.append((en, dtn))

    # Column descriptions: ingredient mass fractions first, then measurements.
    d_type_descr = (['MassFrac ' + ing for ing in ingredients] +
                    [en + ' ' + dtn for en, dtn in measurement_cols])

    # Sample numbers run from 1 to 53.
    for sample_no in range(1, 54):
        # Sample 35 is removed from sv_db in an earlier cell, so skip it here.
        if sample_no == 35:
            continue

        data_p_sample = []

        # Every sample is expected to have a value for every ingredient;
        # a missing entry raises IndexError here.
        for ing in ingredients:
            entry = sv_db.search((Q.sample_number == sample_no) &
                                 (Q.ingredient == ing))
            data_p_sample.append(entry[0]['value'])

        # Measurements may be missing for a sample; record None in that case so
        # the gap can be imputed or the row dropped later.
        for en, dtn in measurement_cols:
            entry = sv_db.search((Q.sample_number == sample_no) &
                                 (Q.equipment_name == en) &
                                 (Q.data_type == dtn))
            data_p_sample.append(entry[0]['value'] if entry else None)

        X.append(data_p_sample)

    entry = {'X': X,
             'd_type_descr': d_type_descr}
    db_X.insert(entry)

# Report how long the (re)build took, in seconds.
req_time = time() - tm
print(req_time)

95.1270000935


The database has missing values. These can either be replaced by the column mean, or the incomplete rows can be excluded from X.

In [10]:
# Reload the cached matrix so this cell always starts from the stored data.
X = db_X.all()[0]['X']

# Switch between the two ways of handling missing values:
# True fills the gaps with the column mean, False drops incomplete samples.
impute = False

if not impute:
    # Keep only the samples that have a value in every column.
    X = [sample for sample in X if None not in sample]
else:
    # Replace each missing value by the mean of its column (axis=0).
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X = imp.fit(X).transform(X)

Standardise the data and perform PCA

In [11]:
# Standardise every column of X (StandardScaler with its default settings)
# before the PCA, so that columns on different scales are comparable.
X_std = StandardScaler().fit_transform(X)

In [12]:
# Number of samples (rows) that go into the PCA.
len(X_std)

17

In [13]:
# PCA model restricted to five principal components.
my_pca = PCA(n_components=5)

In [14]:
# Fit the PCA on the standardised data.
my_pca.fit(X_std)

PCA(copy=True, n_components=5, whiten=False)

In [15]:
# Loadings of the fitted components: one row per principal component, one
# entry per column of X_std.
comp = my_pca.components_

Analyse the PCA results by looking at the principal components with the highest explained variance

In [16]:
# Fraction of the total variance explained by each of the fitted components.
my_pca.explained_variance_ratio_

array([ 0.25281568,  0.18875688,  0.14802325,  0.11070184,  0.07258498])

In [17]:
# Column descriptions stored alongside X in db_X ('MassFrac <ingredient>' and
# '<equipment> <data type>' strings), in the same order as the columns of X.
descr = db_X.all()[0]['d_type_descr']

In [18]:
# Convert each component row to a plain list so the rows can be concatenated
# with the list of descriptions in the next cell.
comp = [list(c) for c in comp]

In [19]:
# Rows for the results table: the descriptions first, then one row of
# loadings per principal component.
d = [descr] + comp

In [20]:
# After transposing, each row is one variable: column 0 holds its description
# and column k (k >= 1) its loading on the k-th fitted principal component.
df = DataFrame(data=d).T

In [21]:
# Rank the variables by the absolute size of their loading on one principal
# component and keep only the strongest few.
prin_comp = 1  # column of df to rank by (column 0 holds the descriptions)
top = 6        # number of variables to keep

abs_loadings = df[prin_comp].abs()
ranked_index = abs_loadings.sort_values(ascending=False).index
sort_df = df.reindex(ranked_index)

# Show the description next to the loading for the top-ranked variables.
clipped_df = sort_df[[0, prin_comp]].head(top)
clipped_df

Unnamed: 0,0,1
16,MCC t_HR_peak_1_kJpg,-0.296511
4,MassFrac DINP,-0.289211
23,tensile epsilon_break_%_mean,-0.281056
26,tensile epsilon_max_%_mean,-0.278876
8,LOI Final,0.260714
9,MCC t_HR_kJpg,-0.246594


In [22]:
# Descriptions of the top-ranked variables, in ranking order.
names = clipped_df[0].tolist()

In [23]:
# Display the selected variable names.
names

[u'MCC t_HR_peak_1_kJpg',
 u'MassFrac DINP',
 u'tensile epsilon_break_%_mean',
 u'tensile epsilon_max_%_mean',
 u'LOI Final',
 u'MCC t_HR_kJpg']

In [24]:
# Standardised data restricted to the top-ranked variables; columns are
# labelled with the descriptions and selected in ranking order.
df_2 = DataFrame(data=X_std, columns=descr)[names]

In [27]:
# Scatter matrix of the standardised variables selected above, with the axis
# labels made readable for the long variable names.
Axes = scatter_matrix(df_2)
size = 7  # font size for tick labels and axis names

# One pass over every subplot; Axes is a 2-D array of axes, ravel() flattens it.
for ax in Axes.ravel():
    # Write the y-axis name horizontally ...
    ax.yaxis.label.set_rotation(0)
    # ... and push it far enough left to sit outside the plot area.
    ax.yaxis.labelpad = 80

    # Shrink the tick labels and axis names so they fit the small subplots.
    plt.setp(ax.yaxis.get_majorticklabels(), 'size', size)
    plt.setp(ax.xaxis.get_majorticklabels(), 'size', size)
    plt.setp(ax.xaxis.get_label(), 'size', size)
    plt.setp(ax.yaxis.get_label(), 'size', size)

plt.show()