In [None]:
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
# ^^^ pyforest auto-imports - don't write above this line
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Load the PMSM temperature CSV into a DataFrame.
# NOTE(review): the absolute /kaggle/input path only resolves inside a Kaggle kernel.
data = pd.read_csv('/kaggle/input/electric-motor-temperature/pmsm_temperature_data.csv')

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
data.describe()

In [None]:
# First five rows of the frame.
data.head()

## Process

In [None]:
# Number of missing values in each column.
data.isnull().sum()

#### The data is anonymized and appears to have been scaled by some factor

In [None]:
# Copy of the frame without the profile_id column.
# NOTE(review): data_model is not referenced by any later cell visible here.
data_model = data.drop(columns='profile_id')

## EDA

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings (e.g. from sns.distplot);
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Histogram + KDE of every column except the last one on a 3x4 grid, with the
# skewness shown in the legend and the column mean marked by a red line.
# NOTE(review): assumes the last column is profile_id and that at most 12
# columns remain — confirm against the CSV header.
# NOTE(review): sns.distplot is deprecated; on seaborn >= 0.11 the documented
# replacement is sns.histplot(..., kde=True, stat='density').
plt.figure(figsize=(20, 12))
for i, j in enumerate(data.columns[:-1]):
    plt.subplot(3, 4, i + 1)

    skew_ = np.round(data[j].skew(), 2)
    sns.distplot(data[j], label='skewness = ' + str(skew_))

    # axvline spans the full axes height regardless of the density scale.
    # The previous vlines(ymin=0, ymax=1) was expressed in data units, so it
    # forced every y-axis up to 1 and squashed the plotted distributions.
    plt.axvline(data[j].mean(), color='r')
    plt.legend()
plt.show()

* Our target (pm) looks approximately normally distributed.
* Most of the attributes look multimodal; this may be due to profile_id.
* Most of the attributes have skewness below 0.5, except ambient temperature, coolant and i_d.
    > i_d (current along the d-axis): this may be directly related to the current through the field windings, but the motor under test is a PM (permanent-magnet) motor, so the field flux is constant; i_d may instead depend on voltage and load, which in turn depend on speed.
    
    > Ambient and coolant temperature: these may depend directly on the testing scenario.
* The stator_tooth and stator_winding distributions look similar.

In [None]:
# Number of rows recorded for each profile_id, drawn as a bar chart.
counts_ = data['profile_id'].value_counts()

fig = px.bar(
    x=counts_.index,
    y=counts_.values,
    labels={'x': 'Profile_id', 'y': 'Counts'},
    range_x=(0, 82),
    template='plotly_dark',
)
fig.show()

#### Is profile_id a random number, or was the data for some profile ids below 40 cut out, with the ids continuous afterwards?

### ***Profile Id*** does not make sense to use as a model feature, because each test case is simply assigned a profile id and we don't know which parameters vary between them; in a real deployment the profile_id would not be known (although we could still estimate it by treating this first as a classification problem (profile_id) and then as a regression (pm)).
#### But we can still check how the attributes change within each profile id

In [None]:
# data['profile_id'].value_counts()

In [None]:
# Distribution of pm for each profile_id, one panel per profile on an 11x5 grid.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 40))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    sns.distplot(subset['pm'], label='profile_id = ' + str(pid))
    plt.legend()
plt.show()

> I thought the distribution of pm is slightly normal, but for each test case it is a multi-modal distribution.

In [None]:
# KDE overlay of ambient vs. coolant for each profile_id, one panel per profile
# on an 11x5 grid.
# sns.kdeplot replaces sns.distplot(..., hist=False): distplot is deprecated in
# seaborn and, with hist=False, only forwarded the data and label to kdeplot.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 50))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    sns.kdeplot(subset['ambient'], label='ambient')
    sns.kdeplot(subset['coolant'], label='coolant')
    plt.title('profile_id = ' + str(pid))
    plt.legend()

plt.show()

> The coolant temperature can rise with increased heat from the motor (for example continuous operation, critical tests, or high eddy currents) and also with an increase in the ambient temperature around the motor.

> As the data is anonymized, we can hypothesise that if most of the ambient density lies above zero, then most of the coolant density is likely to lie above zero as well.
But only a few cases follow this hypothesis.

In [None]:
# KDE overlay of motor_speed vs. torque for each profile_id, one panel per
# profile on an 11x5 grid.
# sns.kdeplot replaces sns.distplot(..., hist=False): distplot is deprecated in
# seaborn and, with hist=False, only forwarded the data and label to kdeplot.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 55))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    sns.kdeplot(subset['motor_speed'], label='motor_speed')
    sns.kdeplot(subset['torque'], label='torque')
    plt.title('profile_id = ' + str(pid))
    plt.legend()

plt.show()

> Torque and speed are inversely proportional.

> But the density plots overlap slightly, which shouldn't be the case; this may be because the axes are normalized.

> The test cases from 46 to 59 follow some kind of pattern, while the remaining ones do not.

In [None]:
# KDE overlay of the three stator temperatures (winding, yoke, tooth) for each
# profile_id, one panel per profile on an 11x5 grid.
# sns.kdeplot replaces sns.distplot(..., hist=False): distplot is deprecated in
# seaborn and, with hist=False, only forwarded the data and label to kdeplot.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 59))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    sns.kdeplot(subset['stator_winding'], label='s_winding')
    sns.kdeplot(subset['stator_yoke'], label='s_yoke')
    sns.kdeplot(subset['stator_tooth'], label='s_tooth')
    plt.title('profile_id = ' + str(pid))
    plt.legend()

plt.show()

> We can clearly observe that stator winding and stator tooth overlap more with each other than with the yoke in most cases. This is expected because the winding sits in the tooth.

> We may have multicollinearity if we use both winding and tooth.

#### lets Check attributes w.r.t pm in different test cases.

In [None]:
# Correlation of every column with the target pm (DataFrame.corr default method).
data.corr()['pm']

In [None]:
# Scatter of pm against stator_yoke over the whole dataset.
fig = px.scatter(
    data,
    x='stator_yoke',
    y='pm',
    template='plotly_dark',
)
fig.show()

In [None]:
# Linear fit of pm against stator_yoke for each profile_id, one panel per
# profile on an 11x5 grid, with fixed tick positions on both axes.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 40))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    plt.xticks([-2.5, -1.5, 0, 1.5, 2.5])
    plt.yticks([-3, -2, -1, 0, 1, 2, 3])
    sns.regplot(x=subset['stator_yoke'], y=subset['pm'], label='profile_id = ' + str(pid))
    plt.legend()
plt.show()

> Looking at the graphs above, we can clearly say that the test cases are very distinct: some are strongly positively correlated, some moderately, and some show no correlation (52, 59).

In [None]:
# Scatter of pm against stator_tooth over the whole dataset.
fig = px.scatter(
    data,
    x='stator_tooth',
    y='pm',
    template='plotly_dark',
)
fig.show()

In [None]:
# Linear fit of pm against stator_tooth for each profile_id, one panel per
# profile on an 11x5 grid, with fixed tick positions on both axes.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 40))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    plt.xticks([-2.5, -1.5, 0, 1.5, 2.5])
    plt.yticks([-3, -2, -1, 0, 1, 2, 3])
    sns.regplot(x=subset['stator_tooth'], y=subset['pm'], label='profile_id = ' + str(pid))
    plt.legend()
plt.show()

In [None]:
# Scatter of pm against ambient over the whole dataset.
fig = px.scatter(
    data,
    x='ambient',
    y='pm',
    template='plotly_dark',
)
fig.show()

In [None]:
# Linear fit of pm against ambient for each profile_id, one panel per profile
# on an 11x5 grid, with fixed tick positions on both axes.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 40))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    plt.yticks([-3, -2, -1, 0, 1, 2, 3])
    plt.xticks([-10, -5, 0, 2.5, 5])
    sns.regplot(x=subset['ambient'], y=subset['pm'], label='profile_id = ' + str(pid))
    plt.legend()
plt.show()

> From the graphs above we can observe that ambient temperature is also one of the testing parameters, because in some test cases it is positively correlated with pm and in others negatively correlated.

In [None]:
# Scatter of pm against coolant over the whole dataset.
fig = px.scatter(
    data,
    x='coolant',
    y='pm',
    template='plotly_dark',
)
fig.show()

In [None]:
# Linear fit of pm against coolant for each profile_id, one panel per profile
# on an 11x5 grid, with fixed tick positions on both axes.
profile_ids = data['profile_id'].unique()
plt.figure(figsize=(25, 40))
for panel, pid in enumerate(profile_ids, start=1):
    subset = data.loc[data['profile_id'] == pid]

    plt.subplot(11, 5, panel)
    plt.yticks([-3, -2, -1, 0, 1, 2, 3])
    plt.xticks([-2, -1, 0, 1, 2, 3])
    sns.regplot(x=subset['coolant'], y=subset['pm'], label='profile_id = ' + str(pid))
    plt.legend()
plt.show()

> We can find a pattern between pm temperature and coolant temperature in most of the test cases.

> Some of the interesting test cases to dig deeper into are 51, 53, 62, 69 and 78.

## Feature-Selection

> If we isolate some test_cases(profile_id) we may get better results while predicting pm, but for now lets consider all test cases.

In [None]:
# Target vector is pm; the feature frame is every other column of data.
Y = data['pm']
X = data.drop(columns='pm')

In [None]:
# Correlation of every column with the target pm.
# NOTE(review): same expression as an earlier cell in the EDA section.
data.corr()['pm'] 

In [None]:
# Hand-picked column names.
# NOTE(review): selected_cols is not referenced by any later cell visible here.
selected_cols = ['stator_tooth','stator_winding','stator_yoke','ambient']

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression,f_regression,f_oneway

In [None]:
# Score every feature in X against Y with the univariate F-test; k='all'
# keeps all columns, so this only computes the scores.
fs = SelectKBest(k='all', score_func=f_regression).fit(X, Y)
fs

In [None]:
# Bar chart of the F-test score of each feature.
fig = px.bar(
    x=X.columns,
    y=fs.scores_,
    template='plotly_dark',
)
fig.show()

In [None]:
# Default size (inches) for matplotlib figures created after this cell.
plt.rcParams['figure.figsize'] = (15,8)

In [None]:
def check_mutlicolinearity(data_x):
    """Report how many feature pairs exceed several |correlation| thresholds.

    For each threshold in 0.5, 0.6, 0.7, 0.8 and 0.9 a line is printed with the
    fraction (0-1) of feature pairs whose absolute correlation is above it.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature frame; pairwise correlations come from ``data_x.corr()``.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix restricted to its strict lower triangle. The
        diagonal and the upper triangle are NaN, so each pair appears once.
    """
    corr = data_x.corr()

    # Mask by position rather than zero-filling and then replacing 0 with NaN:
    # the old approach also erased pairs whose correlation is genuinely 0, and
    # it relied on np.NAN, an alias that was removed in NumPy 2.0. Using the
    # labels of `corr` itself also keeps this working if corr() drops a column.
    lower_triangle = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    corr = corr.where(lower_triangle)

    count_of_total_correlation_values = corr.count().sum()

    for i in [0.5, 0.6, 0.7, 0.8, 0.9]:
        # NaN compares False, so masked cells are never counted.
        count_greater_than_thresh = (corr.abs() > i).sum().sum()
        if count_of_total_correlation_values:
            share = count_greater_than_thresh / count_of_total_correlation_values
        else:
            # Fewer than two usable columns: there are no pairs to rate.
            share = float('nan')
        print(f'Percent Values Greater than {i} co-relation : {share}')
    return corr

def plot_corr(threshold, corr):
    """Show an annotated heatmap of the correlations stronger than ``threshold``.

    Cells whose absolute value is not above ``threshold`` (and NaN cells) are
    blanked out before plotting.
    """
    strong_only = corr.where(corr.abs() > threshold)
    sns.heatmap(strong_only, cmap="YlGnBu", center=0, annot=True)
    plt.show()

In [None]:
# Print the per-threshold summary for the feature frame and keep the
# lower-triangle correlation matrix for plotting.
corr = check_mutlicolinearity(X)

In [None]:
# Heatmap of the feature pairs whose |correlation| is above 0.7.
plot_corr(0.7, corr)

> A high correlation between torque and i_q is expected, as torque depends directly on the current along the quadrature axis.

In [None]:
# Correlation of every column with the target pm.
# NOTE(review): same expression as two earlier cells.
data.corr()['pm']

In [None]:
from scipy.stats import bartlett

H0 : variance_1 = variance_2

H1 : variance_1 != variance_2

pvalue is less than 0.05. So we reject the null hypothesis and can say that variance of attribute_1 is not equal to the variance of attribute_2

pvalue is higher than 0.05. So we fail to reject the null hypothesis and can say that we do not have enough evidence to reject the null hypothesis.                      
So we ***do not have enough evidence*** to prove that variance of attribute_1 is not equal to the variance of attribute_2.

In [None]:
# Bartlett's test (H0: equal variances) on i_q vs. torque.
# NOTE(review): the test compares variances only; it does not measure whether
# one feature duplicates the other, so the drop decision should rest on the
# correlation between them.
bartlett(data['i_q'],data['torque'])  # Can remove one feature

In [None]:
# Bartlett's test (H0: equal variances) on stator_winding vs. stator_tooth.
# NOTE(review): equal variance alone does not show the features are redundant.
bartlett(data['stator_winding'],data['stator_tooth'])  # Can remove one feature , drop stator_tooth

In [None]:
# Bartlett's test (H0: equal variances) on stator_yoke vs. stator_tooth.
# NOTE(review): equal variance alone does not show the features are redundant.
bartlett(data['stator_yoke'],data['stator_tooth'])  # Can remove one feature, drop stator_tooth

In [None]:
# Bartlett's test (H0: equal variances) on stator_yoke vs. coolant.
# NOTE(review): equal variance alone does not show the features are redundant.
bartlett(data['stator_yoke'],data['coolant'])  # Can remove one feature, but lets keep both

In [None]:
# Bartlett's test (H0: equal variances) on torque vs. u_d.
# NOTE(review): equal variance alone does not show the features are redundant.
bartlett(data['torque'],data['u_d'])  # Can remove one feature, drop torque

In [None]:
# Bartlett's test (H0: equal variances) on i_d vs. motor_speed.
bartlett(data['i_d'],data['motor_speed'])  # keep both

In [None]:
# Bartlett's test (H0: equal variances) on u_d vs. motor_speed.
bartlett(data['u_d'],data['motor_speed'])  # keep both

## Modelling 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor,VotingRegressor
from sklearn.model_selection import cross_val_score,GridSearchCV,KFold

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
import statsmodels.api as sm
from statsmodels.api import add_constant

In [None]:
# OLS baseline: pm regressed on every column of X plus an intercept term.
X_new_c = sm.add_constant(X)
model2 = sm.OLS(endog=Y, exog=X_new_c).fit()

model2.summary()

> The Durbin-Watson statistic is less than 2, which indicates positive autocorrelation in the residuals (a value above 2 would indicate negative autocorrelation).

In [None]:
# Reduced OLS model: profile_id and torque are removed from the features,
# then an intercept column is added before fitting.
X_selected = X.drop(columns=['profile_id', 'torque'])
X_new = sm.add_constant(X_selected)
model = sm.OLS(endog=Y, exog=X_new).fit()

model.summary()

In [None]:
# In-sample MSE of the reduced OLS model (predictions on the same rows it was fit on).
fitted = model.predict(X_new)
train_error = mean_squared_error(Y, fitted)
train_error

When the OLS model is evaluated on the same data it was trained on, with the target in a ***range of -3 to +3***, we get ***mse = 0.225***

In [None]:
# GB_bias=[]
# GB_ve=[]
# for n in np.arange(90,100):
#     GB=GradientBoostingRegressor(n_estimators=n,random_state=0)
#     scores=cross_val_score(GB,X_selected,Y,cv=2,scoring='neg_mean_squared_error', n_jobs = -1)
#     rmse=np.sqrt(np.abs(scores))
#     GB_bias.append(np.mean(rmse))
#     GB_ve.append((np.std(rmse,ddof=1)))

# # x_axis=np.arange(len(GB_bias))
# # plt.plot(x_axis,GB_bias)

# np.argmin(GB_bias)

In [None]:
# bias=[]
# ve=[]
# LR=LinearRegression()

# for n in np.arange(20,60):
#     mod=AdaBoostRegressor(base_estimator=LR,n_estimators=n,random_state=0)
#     scores=cross_val_score(mod,X_selected,Y,cv=3,scoring='neg_mean_squared_error', n_jobs = -1)
#     bias.append(np.mean(rmse))
#     ve.append((np.std(rmse,ddof=1)))

# # x_axis=np.arange(len(bias))
# # plt.plot(x_axis,bias)

# np.argmin(bias)

In [None]:
# bias=[]
# ve=[]
# for n in np.arange(10,60):
#     mod=AdaBoostRegressor(n_estimators=n,random_state=0)
#     scores=cross_val_score(mod,X_selected,Y,cv=3,scoring='neg_mean_squared_error', n_jobs = -1)
#     bias.append(np.mean(rmse))
#     ve.append((np.std(rmse,ddof=1)))

# # x_axis=np.arange(len(bias))
# # plt.plot(x_axis,bias)

# np.argmin(bias)

In [None]:
# Candidate regressors compared in the cross-validation cell.
LR = LinearRegression()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both sides of the rename.
LR_AB = AdaBoostRegressor(LR, n_estimators=100, random_state=0)
# AdaBoost with its default base learner.
DT_AB = AdaBoostRegressor(n_estimators=50, random_state=0)
LR_GB = GradientBoostingRegressor(n_estimators=100, random_state=0)
# criterion='mse' was renamed 'squared_error' and removed in scikit-learn 1.2.
# Squared error is the default criterion in every release, so omitting the
# argument builds the same model without the version-specific spelling.
RF = RandomForestRegressor(random_state=0)

In [None]:
# (display name, estimator) pairs, in the order they are evaluated.
models = [
    ('LinearRegression', LR),
    ('Adaboost', LR_AB),
    ('DT_boost', DT_AB),
    ('GBoost', LR_GB),
    ('RF', RF),
]

In [None]:
# evaluate each model in turn
# Cross-validated RMSE (3 shuffled folds) for every candidate model.
results = []
names = []
# One seeded splitter shared by all models, so each is scored on the same folds.
kfold = KFold(shuffle=True, n_splits=3, random_state=0)
# The loop variable is `estimator`, not `model`: the fitted OLS result is stored
# in `model`, and reusing that name here silently replaced it, so re-running
# the training-error cell afterwards picked up the wrong object.
for name, estimator in models:
    cv_results = cross_val_score(estimator, X_selected, Y, cv=kfold, scoring='neg_mean_squared_error', n_jobs=-1)
    # Scores are negative MSE per fold; convert to RMSE once and reuse.
    fold_rmse = np.sqrt(np.abs(cv_results))
    results.append(fold_rmse)
    names.append(name)

    print("%s: %f (%f)" % (name, np.mean(fold_rmse), np.std(fold_rmse, ddof=1)))


In [None]:
# Grid search for a k-NN regressor over neighbour count (1-4) and weighting
# scheme, scored by negative MSE on 3 shuffled folds.
# NOTE(review): `model` is rebound here to the fitted GridSearchCV object.
from sklearn import neighbors

kfold = KFold(n_splits=3, shuffle=True, random_state=1)
param_grid = dict(
    n_neighbors=np.arange(1, 5),
    weights=['uniform', 'distance'],
)
knn = neighbors.KNeighborsRegressor()

model = GridSearchCV(
    knn,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold,
    refit=True,
    verbose=5,
    n_jobs=-1,
)
model.fit(X_selected, Y)

print('Best Scorer{}'.format(model.best_score_))
print()
print('Best Parameters{}'.format(model.best_params_))

In [None]:
# Grid-search results as a frame; show the three best-ranked parameter combinations.
res = pd.DataFrame(model.cv_results_)
res.sort_values('rank_test_score').head(3)