In [1]:
from __future__ import print_function
from datahandling import access_db, get_equip_names, get_dtype_names
from tinydb import Query
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.decomposition import PCA
from time import time
from pandas import DataFrame, concat
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from numpy import cumsum
from data_processing_pipeline import do_not_score_list
# % matplotlib inline



In [2]:
sv_db = access_db(0, True)

In [3]:
Q = Query()

In [4]:
sv_db.remove((Q.sample_number == 35))

[]

Extract data from db using pandas to construct X

In [5]:
compositions = DataFrame(sv_db.search(Q.ingredient.exists()))
compositions['name'] = compositions.data_type + ' ' + compositions.ingredient
compositions = compositions[['name', 'sample_number', 'value']].pivot(index='sample_number', columns='name', values='value')

In [6]:
measurements = DataFrame(sv_db.search(Q.equipment_name.exists() & Q.data_type.exists()))
measurements['name'] = measurements.equipment_name + ' ' + measurements.data_type
# This will automatically average the different measurements which repeat
measurements = measurements.pivot_table(index='sample_number', columns='name', values='value')

In [7]:
measurements = measurements.drop([u'tensile E_t_MPa_mean', 
                                  u'tensile epsilon_break_%_mean', 
                                  u'tensile epsilon_max_%_mean',
                                  u'tensile sigma_break_MPa_mean',
                                  u'tensile sigma_max_MPa_mean',
                                  u'thermomat int_of_abs_err'
                                 ], axis=1)

In [8]:
alldata = concat([compositions, measurements], axis=1)

alldata.shape

(52, 41)

In [9]:
exclude_inp = True

if exclude_inp:
    use = measurements
else:
    use = alldata
    
X = use.values.tolist()

Database has missing values, missing values can either be replaced by mean or the incomplete rows are excluded from X

In [10]:
impute = False
if impute:
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    X = imp.transform(X)
else:
    # Removing all incomplete samples
    X = [sample for sample in X if 'nan' not in str(sample)]

Do PCA

In [11]:
X_std = StandardScaler().fit_transform(X)

In [12]:
my_pca = PCA(n_components=len(X_std[0]))

In [13]:
my_pca.fit(X_std)

PCA(copy=True, n_components=34, whiten=False)

Analyse PCA results by looking at principal components with highest explained variance

In [14]:
comp = my_pca.components_

In [15]:
exp_var = my_pca.explained_variance_ratio_
plt.bar([i + 1 for i in range(len(exp_var))], exp_var, 0.8, align='center')
plt.plot([i + 1 for i in range(len(exp_var))], cumsum(exp_var))
plt.title('Skree Plot')
plt.xlabel('Principal Components/Dimensions')
plt.ylabel('Explained Varience')

<matplotlib.text.Text at 0xafe6e10>

In [23]:
len(comp)

17

In [17]:
df = DataFrame(data=comp, columns=use.columns).T

Is this the decomposition matrix?

In [18]:
prin_comp = 0
top = 5
sort_df = df.reindex(df[prin_comp].abs().sort_values(ascending=False).index)
clipped_df = sort_df.head(n=top)[[prin_comp]]
clipped_df

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
MCC t_HR_peak_1_kJpg,-0.304089
LOI Final,0.281189
tensile epsilon_break_%,-0.280393
tensile epsilon_max_%,-0.276674
ConeCal MARHE_kW_m2,-0.259337


In [19]:
names = clipped_df.index.tolist()

In [20]:
df_plot = use[names]

In [21]:
Axes = scatter_matrix(df_plot)#, diagonal='kde')
n = len(df_plot.columns)
for x in range(n):
    for y in range(n):
        # to get the axis of subplots
        ax = Axes[x, y]
        # to make x axis name vertical  
#         ax.xaxis.label.set_rotation(90)
        # to make y axis name horizontal 
        ax.yaxis.label.set_rotation(0)
        # to make sure y axis names are outside the plot area
        ax.yaxis.labelpad = 50

size = 7
[plt.setp(item.yaxis.get_majorticklabels(), 'size', size) for item in Axes.ravel()]
[plt.setp(item.xaxis.get_majorticklabels(), 'size', size) for item in Axes.ravel()]
[plt.setp(item.xaxis.get_label(), 'size', size) for item in Axes.ravel()]
[plt.setp(item.yaxis.get_label(), 'size', size) for item in Axes.ravel()]

plt.show()