In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import pandas as pd
import xlrd
import numpy as np
from ipywidgets import *
from ipywidgets import interact,fixed
import pdb
import matplotlib.pyplot as plt

import plotly as py
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf

# Notebook-wide setup: enable plotly's notebook rendering and let pandas
# display up to 150 rows and 150 columns before truncating.
init_notebook_mode(connected=True)
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

In [None]:
def plot_pearson_correlation(data=None,lookup=None, target_col = 'Dicke'):
    """Draw a horizontal bar chart of Pearson correlations against ``target_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate columns and ``target_col``.
    lookup : container of column names, optional
        Only columns of ``data`` that are found in ``lookup`` are correlated.
        ``None`` selects every column of ``data``.
    target_col : str
        Column every selected column is correlated with.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    ``np.corrcoef`` is applied to the raw values, so a NaN in either column
    yields a NaN coefficient; pass a NaN-free frame (e.g. ``data.dropna()``).
    If ``target_col`` itself is among the selected columns it is plotted too
    (with coefficient 1).
    """
    if lookup is None:
        x_cols = list(data.columns)
    else:
        x_cols = [col for col in data.columns if col in lookup]

    target = data[target_col].values
    values = [np.corrcoef(data[col].values, target)[0, 1] for col in x_cols]

    ind = np.arange(len(x_cols))
    fig, ax = plt.subplots(figsize=(5,5))
    ax.grid(True)
    # Bars are centred on `ind`, so the ticks go on `ind` as well; the former
    # half-bar-width offset pushed every label away from its bar.
    ax.barh(ind, np.array(values), align='center', color='y')
    ax.set_yticks(ind)
    ax.set_yticklabels(x_cols, rotation='horizontal')
    ax.set_xlabel("Correlation coefficient")
    ax.set_title("Correlation coefficient")
    plt.show()
    

def plot_timeseries(data=None,Sorte=None,lookup=None,clip=False,file=None,cluster=None,width = 2200, height = 650):
    """Plot selected columns of ``data`` with cufflinks' ``iplot(kind='scatter')``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is never modified; all work happens on a copy.
    Sorte : optional
        When given, only rows whose ``Sorte`` column equals this value are kept.
    lookup : container of column names, optional
        Only columns found in ``lookup`` are plotted. ``None`` plots every column.
    clip : bool
        When true, each plotted column is clipped to mean +/- 5 standard
        deviations (computed after the ``Sorte`` filter) and its standard
        deviation is printed.
    file : optional
        Forwarded to ``iplot`` as ``filename``.
    cluster : optional
        When given, only rows whose ``Kmeans_cluster`` column equals this value
        are kept, in addition to the ``Sorte`` filter.
    width, height : int
        Forwarded to ``iplot`` as ``dimensions``.

    Returns
    -------
    None
    """
    df = data if Sorte is None else data[data['Sorte'] == Sorte]

    if lookup is None:
        x_cols = list(df.columns)
    else:
        x_cols = [col for col in df.columns if col in lookup]

    # Build the cluster mask before narrowing to x_cols, because
    # 'Kmeans_cluster' is usually not one of the plotted columns.
    cluster_rows = None if cluster is None else (df['Kmeans_cluster'] == cluster)

    # Explicit copy: clipping below must not write into the caller's frame.
    df = df[x_cols].copy()

    if clip:
        for col in x_cols:
            y = df[col]
            mean = y.mean()
            sd = y.std()
            print(col + ' SD:' + str(sd))
            df[col] = y.clip(mean - 5 * sd, mean + 5 * sd)

    # Apply the cluster filter on top of the Sorte filter and the clipping.
    # Previously this step restarted from `data` and silently discarded both.
    if cluster_rows is not None:
        df = df[cluster_rows]

    cf.set_config_file(offline=True, world_readable=True)
    df.iplot(kind='scatter',dimensions=(width, height),filename=file, margin=(30,30,30,30))
    
    
def get_data(data=None,Sorte=None,lookup=None):
    """Return the rows of one ``Sorte`` restricted to the columns named in ``lookup``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    Sorte : optional
        When given, only rows whose ``Sorte`` column equals this value are kept.
    lookup : container of column names, optional
        Only columns of ``data`` found in ``lookup`` are returned, in the
        column order of ``data``. ``None`` returns every column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected rows and columns; the index of ``data``
        is preserved.
    """
    df = data if Sorte is None else data[data['Sorte'] == Sorte]
    if lookup is None:
        return df.copy()
    x_cols = [col for col in df.columns if col in lookup]
    # Selecting with a list of labels already yields a new frame.
    return df[x_cols]

In [None]:
# Load the measurement frame and the parameter lookup sheet.
# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files from a trusted source.
df = pd.read_pickle('Faserstoff_final.pkl')
data_final = df  # same object as `df`, not a copy

lookup = pd.read_excel('Daten_f_viesha.xlsx',sheet_name='lookup')
# Forward-fill the category so every parameter row carries one
# (presumably the sheet names the category only on the first row of each
# group; confirm against the workbook).
lookup['Kategorie'] = lookup['Kategorie'].ffill()

# Parameters that are left out of all later column selections.
exclude_cols = ['Feuchte','IGT','PPS','Spaltfestigkeit','Sampler status','Tambour','Tambournummer']
lookup = lookup[~lookup['Parameter'].isin(exclude_cols)]

In [None]:
# Number of rows per Sorte, shown as a horizontal bar chart.
sorte_group = pd.DataFrame(data_final.groupby(['Sorte'])['Sorte'].count())
plt.figure(num=None, figsize=(18, 10))
plt.barh(sorte_group.index,sorte_group.Sorte)
plt.grid(True)

# Rank once instead of re-sorting for every line. zip() stops early when
# fewer than three Sorten exist, and str() keeps the concatenation valid for
# non-string labels.
ranked_sorten = sorte_group.sort_values(by='Sorte',ascending =False).index
rank_prefixes = ('Most produced: ', '2nd most produced: ', '3rd most produced: ')
for prefix, sorte in zip(rank_prefixes, ranked_sorten):
    print(prefix + str(sorte))

In [None]:
# Cluster all rows into three groups and store the label on data_final.
# 'Kmeans_cluster' is dropped from the features so that re-running this cell
# does not feed the previous labels back into the clustering.
kmeans_data = (
    data_final
    .drop(columns=['Sorte', 'Kmeans_cluster'], errors='ignore')
    .ffill()
)
# `n_jobs` is no longer accepted by KMeans in current scikit-learn releases,
# so it is not passed. random_state keeps the result reproducible.
kmeans = KMeans(n_clusters=3, n_init = 100, verbose = 5, random_state = 2018)
kmeans.fit(kmeans_data)
data_final['Kmeans_cluster'] = kmeans.labels_
data_final.describe()

# goodness = 0.1*'Steifigkeit längs' + 'Steifigkeit quer' - 'L/Q-Verhältnis'

In [None]:
# Select the 'Faserdaten' parameters whose name contains 'fraction 4',
# excluding batch ids and anything containing 'Shiv'.
include = pd.DataFrame(lookup.Parameter[lookup.Kategorie=='Faserdaten'])
exclude = ['BatchId','Shive batchid']
include = include[~include.Parameter.isin(exclude)]
include = include[include.Parameter.str.contains('fraction 4', na=False)]
include = include[~include.Parameter.str.contains('Shiv', na=False)]

# Add goodness and the stiffness columns in one step. DataFrame.append no
# longer exists in pandas 2.x, so the extra rows are concatenated instead.
extra_parameters = pd.DataFrame({
    'Parameter': ['goodness', 'L/Q-Verhältnis', 'Steifigkeit längs', 'Steifigkeit quer']
})
include = pd.concat([include, extra_parameters], ignore_index=True)

plot_timeseries(data=data_final,Sorte='41-1-2-100-300', lookup = include.values, clip=False, file="Maschine", width = 1200,height = 600)

In [None]:
# One plot_timeseries_plt call per k-means cluster (0, 1, 2) for the same Sorte.
# NOTE(review): plot_timeseries_plt is not defined in the visible part of this
# notebook; confirm it exists before relying on Restart & Run All.
c0, c1, c2 = [
    plot_timeseries_plt(data=data_final,Sorte='41-1-2-100-300', lookup = include.values, clip=False, cluster=cluster_id)
    for cluster_id in (0, 1, 2)
]


# Highest sorte plot

In [None]:
# Frame for a single Sorte: move the index into columns, then drop the
# resulting 'Datum' column (presumably the original index; confirm).
sortenrein = (
    get_data(data=data_final, Sorte='41-1-2-100-300', lookup=include.values)
    .reset_index()
    .drop(columns=['Datum'])
)

plot_timeseries(data=sortenrein)

# Correlation coefficients with Steifigkeit quer

In [None]:
# Pearson correlation of every selected column with 'Steifigkeit quer',
# one chart per Sorte. The shared helper replaces the inline copy of its
# body, and plain calls replace interact(), which had only fixed() arguments.
target_col = 'Steifigkeit quer'

# Sorte 41-1-2-100-300: reuse the frame built in the previous cell.
df = sortenrein.dropna()
plot_pearson_correlation(data=df, lookup=df.columns, target_col=target_col)

sortenrein2 = get_data(data=data_final,Sorte='41-1-2-100-275', lookup = include.values).reset_index().drop(['Datum'],axis = 1)
plot_pearson_correlation(data=sortenrein2.dropna(), lookup=sortenrein2.columns, target_col=target_col)

# The third chart previously plotted sortenrein2 again; it now uses sortenrein3.
sortenrein3 = get_data(data=data_final,Sorte='41-1-2-100-250', lookup = include.values).reset_index().drop(['Datum'],axis = 1)
plot_pearson_correlation(data=sortenrein3.dropna(), lookup=sortenrein3.columns, target_col=target_col)

# Highest sorte rolling mean plot

In [None]:
# Drop 'goodness' and 'L/Q-Verhältnis' from the selection, smooth the
# remaining columns with a 10-row rolling mean, and rescale three of them
# before plotting (the factors are hand-picked constants).
include = include[~include['Parameter'].isin(['goodness', 'L/Q-Verhältnis'])]
sortenrein = (
    get_data(data=data_final, Sorte='41-1-2-100-300', lookup=include.values)
    .reset_index()
    .drop(columns=['Datum'])
    .rolling(10)
    .mean()
)
sortenrein['Steifigkeit quer'] *= 2
sortenrein['Form fraction 4'] *= 3.5
sortenrein['Len fraction 4'] *= 8
plot_timeseries(data=sortenrein)