# Model with steps 10,11,12 and mean_combinations

# Chamber A with 2k recipe

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('./dfA_2k_45.csv')

In [3]:
df.shape

(152, 45)

In [None]:
for col in df.columns:
    print(col)

---

In [4]:
from sklearn.preprocessing import StandardScaler
# object= StandardScaler()

In [5]:
df=pd.get_dummies(df)

In [6]:
target_column = ['metro'] 
predictors = list(set(list(df.columns))-set(target_column))
#dfA_2k[predictors] = dfA_2k[predictors]/dfA_2k[predictors].max()
#dfA.describe()

In [7]:
X = df[predictors]
y = df[target_column]

In [8]:
model_chosen=Ridge
model_params_dict={'alpha':0.1}

# Combinatorial_functions

In [9]:
from pylab import rcParams
from itertools import combinations
from functools import reduce
import operator as op

In [10]:
def rSubset(arr, r): 
    """
    Returns all possbile combinations of the elements of the array considering r elements at a time 
    
    Input Variables:
    ------
    arr               - list consisting of all elements
    r                 - Number of elements considered in the combination at a time
    
    Output Variable:
    ------
    Returns            - list of all possible combinations considering r elements at a time
    """
    return list(combinations(arr, r))

In [11]:
def param_to_error(model,model_params_dict,combination_list,val_percentage):
    """
    Validates the top parameter sets on the validation set and finds train and test error scores for them.
    
    Input Variables:
    ------
    model                   - Model considered
    model_params_dict       - model paramteres passed as a dictionary
    combination_list        - The parameter combination list
    test_percentage         - Percentage of data kept for test set (Unseen data-not used in this function)
    val_percentage          - Percentage of data kept for validation (based on which best model is selected)
    
    Output Variable:
    ------
    Returns  
    
    Returns dataframe with new feature list
    """        
    X=dataframe[combination_list].copy()
    X=np.array(X)
    y=dataframe["metro"] # Target column name
    y=np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=val_percentage,shuffle=False)
    scaler=StandardScaler()
    scaler.fit(X_train) 
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)
    model_=model(**model_params_dict)
    model_.fit(X_train,y_train)
    y_pred=model_.predict(X_test)
    return r2_score(y_test,y_pred),stddev_error_deviation(y_test,y_pred)

In [12]:
def stddev_error_deviation(y_true, y_pred):
    """  
    Calculates standard error deviation of inputs
    
    Input Variables:
    ------
    y_true               - actual value of target
    y_pred               - predicted value
    
    Output Variable:
    ------
    returns standard error deviation stddev((y_true - y_pred) / y_true) of the inputs
    """
    return np.std((y_true - y_pred) / y_true)

In [13]:
def ncr(n, r):
    """
    Calculates the number of combinations possible for a given n and r
    Input Variables:
    ------
    n               - Total number of elements
    r               - Number of elements in the combination
    
    Output Variable:
    ------
    Returns calculation of nCr     
    """
    r = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return numer // denom  # or / in Python 2


In [14]:
def max_iterations_calc(min_param_combination,max_param_combination,len_param_list):
    """
    Parsing input files from Yield3/Snowflake
    
    Input Variables:
    ------
    dataframe               - pandas dataframe file
    paramtere_list          - list containing all paramter names
    
    Output Variable:
    ------
    Graph                   - Sensor values vs Time plots for all paramters
    """
    sum_=0
    for i in np.arange(min_param_combination,max_param_combination+1):
        sum_=sum_+ncr(len_param_list,i)
    return sum_

In [15]:
param_list=[]
for col in X.columns:
    param_list.append(col)

In [16]:
len(param_list)

45

In [17]:
def combinatorial(data,model,model_params,param_list,max_models=500,val_percentage=0.3,min_param_combination=2,max_param_combination=2):
    """
    Validates the top parameter sets on the validation set and finds train and test error scores for them.
    
    Input Variables:
    ------
    dataframe                   - Dataframe considered 
    parameter_combination_list  - The top parameter combinations list arranged in correspondance to the scores list
    scores_list                 - The scores corrsponding to the ordered parameter combination list
    metrology_column            - The column of dataframe to be considered for correlation
    
    Output Variable:
    ------
    Prints model performance for each parameter set starting from best model order 
    
    Returns dataframe with new feature list
    """    
#     global r2
    global dataframe
    dataframe=data # To be used inside function param_to_error
    #Settings
    pd.set_option('display.max_colwidth', -1)
    plt.rcdefaults()
    rcParams['figure.figsize'] = 6,4

    # Memory initializations
    r2=[]
    error_scores=[]
    stddev_error_scores=[]
    top_comb_sets=[]

    #Function calls
    max_iterations=max_iterations_calc(min_param_combination,max_param_combination,len(param_list))

    iteration_count=0
    print("Number of parameters:",len(param_list))
    print("Number of combinations being tried:",max_iterations,"\n"
          "consisting of", np.arange(min_param_combination,max_param_combination+1), " elements")

    #__________________________________________________________________________________________________________
    #__________________________________________________________________________________________________________

    # Algorithm Starts
    count=0
    for i in np.arange(min_param_combination,max_param_combination+1): # Start from 1 to 15 param combinations
        subset_list=rSubset(param_list, i) # generate nCr combination sets with i elements in it.
        if(iteration_count > 1):
            print("\nBest Subset till ",i-1,"element combination is: ",np.argmax(error_scores),":",\
                  top_comb_sets[np.argmax(error_scores)])
            print("Best R square:",np.max(error_scores).round(5))
        print("\nNext Subset consists of :",i," element combination")
        for subset in subset_list:
            iteration_count=iteration_count+1
            subset=list(subset)
            subset=subset
            rmspe_,stddev_=param_to_error(model,model_params,subset,val_percentage)
            r2.append(rmspe_)
            
#             count+=1
#             print(count,r2)
#             print(len(r2))
            #print(len(top_comb_sets))
            
            if(len(error_scores))<max_models: 
                #add new pamam combination
                top_comb_sets.append(subset)
                error_scores.append(np.round(rmspe_,5))
                stddev_error_scores.append(stddev_)

            elif (rmspe_ > np.min(error_scores)): #why?

                #Remove maximum score param combination
                top_comb_sets.pop(np.argmin(error_scores))
                stddev_error_scores.pop(np.argmin(error_scores))
                pop_=np.argmin(error_scores)
                error_scores.pop(pop_)

                #add new param combination
                top_comb_sets.append(subset)
                error_scores.append(np.round(rmspe_,5))
                stddev_error_scores.append(stddev_)

            if(iteration_count % 5000==0): #why?
                    print("Completed : ",int(((iteration_count/max_iterations)*100)),"%")
                    print("Best R square:",np.max(error_scores).round(5))
                    print("Best combo:",top_comb_sets[np.argmax(error_scores)],"\n")


    print("Done")
    print("Completed : ",int(((iteration_count/max_iterations)*100)),"%")
    print("Best % Deviation:",np.max(error_scores).round(5),":",top_comb_sets[np.argmax(error_scores)+1],"\n")
    #print("Best % Deviation:",np.max(error_scores).round(5),":",top_comb_sets[0:10],"\n")

    return top_comb_sets,error_scores,stddev_error_scores,r2

In [18]:
top_comb_sets,error_scores,stddev_error_scores,r2 = combinatorial(data=df,model=model_chosen,
model_params=model_params_dict,param_list=param_list,max_models=990,val_percentage=0.3,min_param_combination=2,max_param_combination=2)

  pd.set_option('display.max_colwidth', -1)


Number of parameters: 45
Number of combinations being tried: 990 
consisting of [2]  elements

Next Subset consists of : 2  element combination
Done
Completed :  100 %
Best % Deviation: -0.12123 : ['TEOSFlow_mean_12', 'OzoneBackPressure_mean_10'] 



In [19]:
# df1 = pd.DataFrame((top_comb_sets,r2,error_scores),columns =['Sensors', 'R2 value','Error Score'])
# data = pd.DataFrame(sorted_list,r2)
df1 = pd.DataFrame(list(zip(top_comb_sets,r2,error_scores)),columns =['Sensors', 'R2 value','Error Score'])
df1

Unnamed: 0,Sensors,R2 value,Error Score
0,"[ShowerheadHeaterSpacing_mean_10, N2GasFlow_mean_10]",-2.299287,-2.29929
1,"[ShowerheadHeaterSpacing_mean_10, HeaterOutputPowerPercent_mean_12]",-0.245482,-0.24548
2,"[ShowerheadHeaterSpacing_mean_10, CarrierGasLinePressureTEOS2_mean_10]",-5.644238,-5.64424
3,"[ShowerheadHeaterSpacing_mean_10, HeatExchangerTemp1_mean_12]",-1.108283,-1.10828
4,"[ShowerheadHeaterSpacing_mean_10, N2TEOSGasFlow_mean_12]",-0.573894,-0.57389
...,...,...,...
985,"[OzoneBackPressure_mean_10, ThrottleValveAngle_mean_10]",-1.587532,-1.58753
986,"[OzoneBackPressure_mean_10, tool_name_BPPR4D01A2]",-1.369357,-1.36936
987,"[ShowerheadHeaterSpacingOtherSide_mean_10, ThrottleValveAngle_mean_10]",-1.361614,-1.36161
988,"[ShowerheadHeaterSpacingOtherSide_mean_10, tool_name_BPPR4D01A2]",-0.771600,-0.77160


In [20]:
# Sorting by column 'R2 value'
df1=df1.sort_values(by=['R2 value'], ascending=False)
df1

Unnamed: 0,Sensors,R2 value,Error Score
754,"[TEOSFlow_mean_12, NF3GasFlow_mean_10]",-0.121228,-0.12123
424,"[N2TEOSGasFlow_mean_10, NF3GasFlow_mean_10]",-0.123205,-0.12320
748,"[TEOSFlow_mean_12, NF3GasFlow_mean_12]",-0.124332,-0.12433
418,"[N2TEOSGasFlow_mean_10, NF3GasFlow_mean_12]",-0.124510,-0.12451
244,"[N2TEOSGasFlow_mean_12, NF3GasFlow_mean_10]",-0.126737,-0.12674
...,...,...,...
803,"[HeatExchangerTemp2_mean_12, ChamberPressure_mean_10]",-2463.551346,-2463.55135
811,"[HeatExchangerTemp2_mean_12, CarrierGasLinePressureTEOS2_mean_12]",-3995.950483,-3995.95048
535,"[CarrierGasLinePressureTEOS_mean_12, HeatExchangerTemp2_mean_12]",-4072.196772,-4072.19677
760,"[CarrierGasLinePressureTEOS_mean_10, HeatExchangerTemp2_mean_12]",-4183.736051,-4183.73605


In [None]:
data = pd.DataFrame(sorted_list[:100],columns =['Sensor A', 'Sensor B'])
print(len(data))
inference=pd.DataFrame(list(zip(data['Sensor A'].value_counts(),data['Sensor B'].value_counts())),columns =['Sensor A count', 'Sensor B count'])
inference

In [21]:
def parameter_counter(parameter_names_list,top_comb_sets):
    """
    Plots the signal to noise model performance graph for the top n models considered.
    
    Input Variables:
    ------
    parameter_name_list  - Consists of all the parameter names in the model
    top_comb_sets                - list of parameter combination for top n models.

    Output Variable:
    ------
    prints the count of each parameter in the signal-to-noise model performance graph
    """    
    
    # Initialize a pandas dataframe to hold the paramter counter values
    counter=pd.DataFrame(data=np.transpose([parameter_names_list,np.zeros(len(parameter_names_list))]),columns=(["params","count"]))
    counter.index=counter["params"]
    counter["count"]=0
    
    # Count each param in all model
    for i in range(len(top_comb_sets)):
        for j in range(len(top_comb_sets[i])):
            var=top_comb_sets[i][j]
            counter.loc[var,"count"]+=1
    counter.sort_values(by="count",ascending=False,inplace=True)
    counter.index=np.arange(len(counter.index))
    print(counter)
    

In [22]:
# df2=[]
# df2=df1['Sensors'][0:100]
def sort_list(list1, list2):
#     c=0
    zipped_pairs = zip(list2, list1)
 
    z = [x for _, x in sorted(zipped_pairs,reverse=True)]
#     c+=1
#     print(c)
    return z

sorted_list=[]
sorted_list=sort_list(top_comb_sets,r2)

In [23]:
# Print paramter count for the generated models

parameter_names_list=param_list
parameter_counter(parameter_names_list=parameter_names_list,top_comb_sets=list(df1['Sensors'][0:100]))

                                      params  count
0   NF3GasFlow_mean_10                        28   
1   NF3GasFlow_mean_12                        28   
2   TEBFlow_mean_10                           23   
3   HeaterOutputPowerPercent_mean_12          17   
4   HeaterOutputPowerPercent_mean_10          12   
5   TEOSFlow_mean_10                          7    
6   N2TEOSGasFlow_mean_12                     5    
7   N2TEOSGasFlow_mean_10                     5    
8   TEPOFlow_mean_10                          5    
9   O3GasFlow_mean_12                         5    
10  TEOSFlow_mean_12                          5    
11  StepElapsedTime_mean_12                   4    
12  HeatExchangerTemp3_mean_12                4    
13  ThrottleValveAngle_mean_10                4    
14  N2TEBGasFlow_mean_12                      4    
15  ThrottleValveAngle_mean_12                4    
16  HeatExchangerResistivity_mean_10          4    
17  N2TEBGasFlow_mean_10                      4    
18  Showerhe