In [None]:
#import files that are required for reading the data. 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
#plt.figure(figsize=(16,5))

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
import os
#print(os.listdir('../input'))

In [None]:
# Read the PMSM measurement CSV into a DataFrame and preview the first rows.

df = pd.read_csv(
    '../input/electric-motor-temperature/pmsm_temperature_data.csv'
)
df.head()


# Context
The dataset comprises sensor data collected from a permanent magnet synchronous motor (PMSM) deployed on a test bench. The PMSM represents a German OEM's prototype model. Test bench measurements were collected by the LEA department at Paderborn University. This dataset is mildly anonymized.
Content
All recordings are sampled at 2 Hz. The dataset consists of multiple measurement sessions, which can be distinguished from each other by column "profile_id". A measurement session can be between one and six hours long.
The motor is excited by hand-designed driving cycles denoting a reference motor speed and a reference torque. Currents in d/q-coordinates (columns "i_d" and "i_q") and voltages in d/q-coordinates (columns "u_d" and "u_q") are a result of a standard control strategy trying to follow the reference speed and torque. Columns "motor_speed" and "torque" are the resulting quantities achieved by that strategy, derived from set currents and voltages.
Most driving cycles denote random walks in the speed-torque-plane in order to imitate real world driving cycles to a more accurate degree than constant excitations and ramp-ups and -downs would.
Acknowledgements
Several publications leveraged the setup of the PMSM in the Paderborn University Lab:


Inspiration
The most interesting target features are rotor temperature ("pm"), stator temperatures ("stator_*") and torque. Especially rotor temperature and torque are not reliably and economically measurable in a commercial vehicle.
Being able to have strong estimators for the rotor temperature helps the automotive industry to manufacture motors with less material and enables control strategies to utilize the motor to its maximum capability. A precise torque estimate leads to more accurate and adequate control of the motor, reducing power losses and eventually heat build-up.

(https://www.kaggle.com/wkirgsn/electric-motor-temperature)

## Classification Analysis Objective

1. Using the training data, can an algorithm classify samples by their profile id?
2. Identify the critical hyperparameters for KNN; a similar analysis can be conducted for the other algorithms.
3. Compare the algorithms using the confusion matrix, the time taken to run the analysis, accuracy, and balanced accuracy.
4. Recommend an algorithm for the classification analysis.

## Data Exploration and cleaning

In [None]:
# Print the column dtypes, non-null counts and memory usage of the raw frame.
df.info()

In [None]:
# function to filter dataframe based on profile id. 

def profile_id_df(dataframe, prof_id):
    '''
    Select the rows of one measurement session.

    Input:
    dataframe = Pandas dataframe containing a 'profile_id' column
    prof_id = profile id to keep, one of dataframe['profile_id'].unique()

    Output:
    dataframe restricted to the rows whose 'profile_id' equals prof_id
    (original index labels are preserved)
    '''
    # Build the boolean row mask first, then select with it.
    is_requested_profile = dataframe['profile_id'].eq(prof_id)
    return dataframe.loc[is_requested_profile]

In [None]:
# Profile ids in order of first appearance in the data.
profile_list = df['profile_id'].unique()

# Number of rows recorded for each profile id (used to estimate the cost of
# the later analysis). A single value_counts() pass replaces re-filtering the
# whole frame once per profile; reindex() restores the order of profile_list.
rows = (
    df['profile_id']
    .value_counts()
    .reindex(profile_list, fill_value=0)
    .tolist()
)

# Pair every profile id with its row count.
d_dict = {'profile_id': profile_list, 'rows': rows}

df_shape = pd.DataFrame(d_dict)

df_shape.head()

In [None]:
# Keep only the profiles that have at least MIN_ROWS rows. The cut-off is an
# arbitrary number, picked just to reduce the computing power needed.
MIN_ROWS = 30000

# Chain reset_index instead of resetting in place on a .loc slice, which
# avoids pandas' chained-assignment warning and keeps the cell re-runnable.
df_short = df_shape.loc[df_shape['rows'] >= MIN_ROWS].reset_index(drop=True)
df_short.head()

In [None]:
# Profile ids of the (up to) ten profiles with the most rows, largest first.
(
    df_short
    .nlargest(10, 'rows')['profile_id']
    .to_numpy()
)

In [None]:
# Profiles that will be used as the classes for classification: every profile
# that passed the row-count filter applied to df_shape.
# NOTE(review): this is a "top 10" only if exactly ten profiles pass that filter - confirm.
df_short

In [None]:
# Restrict the measurements to the profile ids selected in df_short.
prof_id = [pid for pid in df_short['profile_id'].values]

in_selected_profiles = df['profile_id'].isin(prof_id)
df_prof = df.loc[in_selected_profiles]
print(df_prof.shape)


In [None]:
# Data prep: every column except the session id is a feature; the session id
# ('profile_id') is the class label to predict.

target_col = 'profile_id'
X_filt = df_prof.drop(columns=[target_col])
y_filt = df_prof[target_col]

In [None]:
## Classification analysis

In [None]:
#import files
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score,precision_score
from sklearn.metrics import recall_score,balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
# NOTE(review): rows are 2 Hz samples of continuous sessions, so a shuffled
# row-level split puts near-identical neighbouring samples in both train and
# test - scores may be optimistic. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_filt,
    y_filt,
    test_size=0.3,
    random_state=0,
)

In [None]:
## Functions to run evaluation analysis.

In [None]:
# function to create input dataframe and compare against various classification models. This serves as input. 

def labels_list(y_test):
    '''
    Input:- 
    y_test = series holding the classification labels
    
    Output
    Single-column dataframe with the number of occurrences of every label,
    indexed by label in ascending order (used as the reference next to the
    confusion-matrix diagonals).
    
    '''
    # Count each label, order by label value, then wrap the counts in a frame.
    counts_by_label = y_test.value_counts().sort_index(ascending=True)
    return counts_by_label.to_frame()

In [None]:
# function to capture the diagonal values of the confusion matrix

def true_pred(model, confusion_matrix, y_test):
    '''
    Extract the diagonal of a confusion matrix (the correctly classified
    counts per class) as a single-column dataframe.

    Input:- 
    confusion_matrix = square array-like (e.g. np.ndarray), one row/column per label
    model = string name of the model, used as the column name. 
    y_test = classification label series
    
    Output:- 
    diagonal values of confusion matrix (true predictions) in dataframe,
    indexed by the sorted unique labels of y_test
    
    Raises:-
    ValueError if the confusion matrix is not square with exactly one
    row/column per unique label in y_test.
    
    '''
    # Plain ndarray instead of np.matrix, which NumPy no longer recommends.
    cm = np.asarray(confusion_matrix)

    # get label names from y_test column
    labels = sorted(y_test.unique())

    # Validate both dimensions up front and fail with a clear message, rather
    # than printing and then failing later while building the dataframe.
    expected_shape = (len(labels), len(labels))
    if cm.shape != expected_shape:
        raise ValueError(
            'The lengths of y_test does not match with confusion matrix shape: '
            'expected %s, got %s' % (expected_shape, cm.shape)
        )

    # np.diagonal returns a read-only view; copy so the frame owns its data.
    values = np.diagonal(cm).copy()

    return pd.DataFrame(data={model: values}, index=labels)

In [None]:
#function to capture all the numerical scores of the model. 

def algor_scores(model, y_pred, y_test, time_name):
    '''
    Collect the summary scores of one classifier in a single-column dataframe.

    Input:- 
    model = string name of the model, used as the column name.
    y_pred = predicted class labels.
    y_test = true class labels.
    time_name = elapsed run time to report in the 'time' row.

    Output:- 
    dataframe with rows 'accuracy', 'balance_accuracy' and 'time'.
    '''
    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # balanced accuracy averages recall per *true* class, so swapping the
    # arguments silently reports a different quantity.
    acc_score = accuracy_score(y_test, y_pred)
    bal_acc_score = balanced_accuracy_score(y_test, y_pred)

    row_label = ['accuracy', 'balance_accuracy', 'time']
    row_values = [acc_score, bal_acc_score, time_name]

    data_score = {model: row_values}

    df_data = pd.DataFrame(data=data_score, index=row_label)

    return df_data


In [None]:
# Reference counts: number of y_test rows per class. Each model's per-class
# count of correct predictions (the confusion-matrix diagonal) should be as
# close as possible to these values; the accuracy scores depend on those
# predictions.
Raw_data = labels_list(y_test)
Raw_data

In [None]:
## KNN

In [None]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import time, timeit

In [None]:
# Hyperparameter sweep to find a good n_neighbors. The range was initially
# larger but was reduced to 1..9 because of run time.
# NOTE(review): each k is scored on X_test/y_test, the same split later used
# to compare models - consider a validation split or cross-validation.
error_rate = []
acc_score = []
# Will take some time
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred_k = knn.predict(X_test)

    # Fraction of misclassified rows and its complement for this k.
    error_rate.append(np.mean(pred_k != y_test))
    acc_score.append(accuracy_score(y_test, pred_k))

In [None]:
# Test-set error rate for each candidate n_neighbors from the sweep.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 10),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=2,
)

ax.set_title('Error Rate vs. n_neighbour Value')
ax.set_xlabel('n_neighbour')
ax.set_ylabel('Error Rate')

In [None]:
# Test-set accuracy for each candidate n_neighbors from the sweep.
fig, ax = plt.subplots()
ax.plot(
    range(1, 10),
    acc_score,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=2,
)
ax.set_title('Acc_Score vs. n_neighbour Value')
ax.set_xlabel('n_neighbour')
ax.set_ylabel('Acc_Score')

In [None]:
# It is clear that n_neighbors=1 would produce great accuracy and evaluation results. Let's run the KNN analysis.

In [None]:
## Knn Model analysis
# Fit a 1-nearest-neighbour classifier and time fit + predict together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
knn_time = time.perf_counter() - start_time
print("--- %s seconds ---" % knn_time)

In [None]:
## knn Model evaluation
# confusion_matrix expects (y_true, y_pred); the diagonal is the same either
# way, but the correct order keeps this cell consistent with the others.
# Note: `knn` is rebound here from the fitted classifier to the per-class
# dataframe of correct predictions.
knn = true_pred('KNN', confusion_matrix(y_test, y_pred_knn), y_test)
df_knn_score = algor_scores('knn', y_pred_knn, y_test, knn_time)
df_knn_score

In [None]:
## Decision Tree (max_depth= None)

In [None]:
# Fix the seed so the tree (and therefore its scores) is reproducible across
# runs, matching the other classifiers in this notebook that use random_state=0.
dec_tree = DecisionTreeClassifier(random_state=0)
scaler = StandardScaler()

In [None]:
# Decision Tree model analysis
# Time scaling + fitting + prediction together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

pipe = Pipeline(steps=[('Standardscaler', scaler), ('DecisionTree', dec_tree)])

pipe.fit(X_train, y_train)

y_pred_dtree = pipe.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
dec_tree_time = time.perf_counter() - start_time
print("--- %s seconds ---" % dec_tree_time)

In [None]:
# Decision Tree evaluation
# Note: `dec_tree` is rebound here from the estimator to the per-class
# dataframe of correct predictions, so the fitting cell cannot be re-run
# afterwards without re-creating the estimator first.
cm_dec_tree = confusion_matrix(y_test, y_pred_dtree)
dec_tree = true_pred('dec_tree', cm_dec_tree, y_test)
df_dec_tree_score = algor_scores('dec_tree', y_pred_dtree, y_test, dec_tree_time)
df_dec_tree_score

In [None]:
# Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [None]:
# AdaBoost with 100 boosting rounds, using the discrete SAMME algorithm.
ada_boost = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1,
    algorithm='SAMME',
    random_state=0,
)


In [None]:
# AdaBoost model analysis
# Time fit + predict together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

ada_boost.fit(X_train, y_train)
y_pred_ad = ada_boost.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
ada_boost_time = time.perf_counter() - start_time
print("--- %s seconds ---" % ada_boost_time)

In [None]:
# AdaBoost evaluation
# confusion_matrix expects (y_true, y_pred); the diagonal is the same either
# way, but the correct order keeps this cell consistent with the others.
Ada_boost = true_pred('Ada_boost', confusion_matrix(y_test, y_pred_ad), y_test)
df_ada_boost_score = algor_scores('Ada_boost', y_pred_ad, y_test, ada_boost_time)
df_ada_boost_score

In [None]:
## RandomForest Classification

In [None]:
## Random forest builds on decision trees; the author expects output similar
## to the single Decision tree (max depth = None).
# max_features='sqrt' is what 'auto' meant for classifiers. 'auto' was
# deprecated and later removed from scikit-learn, so spell it out to keep
# this cell working on current versions.
random = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    bootstrap=True,
    random_state=0,
    verbose=0,
)

In [None]:
# RandomForest analysis
# Time fit + predict together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

random.fit(X_train, y_train)
y_pred_rand = random.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
random_time = time.perf_counter() - start_time
print("--- %s seconds ---" % random_time)

In [None]:
# RandomForest evaluation
# confusion_matrix expects (y_true, y_pred); the diagonal is the same either
# way, but the correct order keeps this cell consistent with the others.
Random_forest = true_pred('Random_forest', confusion_matrix(y_test, y_pred_rand), y_test)
df_randforest_score = algor_scores('Random_forest', y_pred_rand, y_test, random_time)
df_randforest_score

In [None]:
#Bagging classifier 
The bagging classifier took nearly 35988 seconds (about 10 hours) to run, and its accuracy was very low, so it is not considered in this analysis.

In [None]:
# Extree Classifier

In [None]:
# Fix the seed so the randomized trees (and therefore the scores) are
# reproducible across runs, matching the other seeded classifiers here.
ext_tree_clf = ExtraTreesClassifier(random_state=0)

In [None]:
# Extra-Trees model analysis
# Time fit + predict together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

ext_tree_clf.fit(X_train, y_train)

y_pred_extree = ext_tree_clf.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
extree_time = time.perf_counter() - start_time
print("--- %s seconds ---" % extree_time)

In [None]:
# Extra-Trees model evaluation
# confusion_matrix expects (y_true, y_pred); the diagonal is the same either
# way, but the correct order keeps this cell consistent with the others.
ext_tree = true_pred('ext_tree', confusion_matrix(y_test, y_pred_extree), y_test)
df_xtree_score = algor_scores('extree', y_pred_extree, y_test, extree_time)
df_xtree_score

In [None]:
## KNN Bagging classifier

In [None]:
# If the bagging classifier used 1 neighbor, the values would be the same as
# plain knn with n_neighbors=1, hence trying it out with n_neighbors=3.
from sklearn.ensemble import BaggingClassifier

# The base estimator is passed positionally: its keyword was `base_estimator`
# in older scikit-learn and is `estimator` in newer releases (the old name
# was removed), while the first positional slot works on both.
bag_clf_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=3),
                                n_estimators=100, random_state=0)

In [None]:
# KNN Bagging classifier
# Time fit + predict together.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

bag_clf_knn.fit(X_train, y_train)

y_pred_knn_bag = bag_clf_knn.predict(X_test)

# Measure once and print that same value, so the printed figure matches the
# one stored for the score table.
knn_bag_time = time.perf_counter() - start_time
print("--- %s seconds ---" % knn_bag_time)

In [None]:
# KNN Bagging model evaluation
# confusion_matrix expects (y_true, y_pred); the diagonal is the same either
# way, but the correct order keeps this cell consistent with the others.
Bag_class_knn = true_pred('Bag_class_knn', confusion_matrix(y_test, y_pred_knn_bag), y_test)
df_bag_knn_score = algor_scores('Bag_knn', y_pred_knn_bag, y_test, knn_bag_time)
df_bag_knn_score

In [None]:
## Summary of classifiers

In [None]:
# Put the per-class test counts next to each model's per-class count of
# correct predictions (all frames are indexed by profile id).
df_summary = pd.concat(
    [Raw_data, Random_forest, Ada_boost, dec_tree, knn, ext_tree, Bag_class_knn],
    axis=1,
    sort=True,
)

# Take the class labels from the index instead of a hardcoded list, so the
# column stays correct if the set of selected profiles changes.
label = list(df_summary.index)
df_summary['label'] = label

# The value_counts() column is named after the series ('profile_id') in older
# pandas and 'count' in pandas >= 2; handle both so the rename always applies.
df_summary = df_summary.rename(
    columns={"profile_id": "rawdata_rows#", "count": "rawdata_rows#"}
)

# Order the columns by their value in the row for profile id 58.
# NOTE(review): 58 is hardcoded and must be one of the selected profiles.
df_summary = df_summary.sort_values(by=58, ascending=True, axis=1)
df_summary

In [None]:
# One column per model (accuracy, balanced accuracy, run time), with the
# columns ordered from lowest to highest accuracy.
score_frames = [
    df_knn_score,
    df_dec_tree_score,
    df_ada_boost_score,
    df_randforest_score,
    df_xtree_score,
    df_bag_knn_score,
]
df_score = (
    pd.concat(score_frames, axis=1, sort=True)
    .sort_values(by='accuracy', ascending=True, axis=1)
)
df_score

In [None]:
## Conclusions

1. The Extra-Trees classifier provides high accuracy and recall scores. It is also relatively fast.
2. KNN provides a faster analysis with reasonable accuracy.

