In [15]:
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.stats import ttest_1samp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv('Data/cleaned_data2.csv')
df

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,l/km,temp_ratio,temp_delta
0,28.0,5.0,26,21.5,12,E10,0,0.18,1.79,9.5
1,12.0,4.2,30,21.5,13,E10,0,0.35,1.65,8.5
2,11.2,5.5,38,21.5,15,E10,0,0.49,1.43,6.5
3,12.9,3.9,36,21.5,14,E10,0,0.30,1.54,7.5
4,18.5,4.5,46,21.5,15,E10,0,0.24,1.43,6.5
...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0.23,1.36,6.5
384,16.1,4.3,38,25.0,31,SP98,1,0.27,0.81,-6.0
385,16.0,3.8,45,25.0,19,SP98,0,0.24,1.32,6.0
386,15.4,4.6,42,25.0,31,SP98,1,0.30,0.81,-6.0


In [3]:
# we can reenforce our conclusions by doing an hipothesis test

#H0 --> SP98 consumption_mean >= E10 consumption_mean
#Ha -->  SP98 consumption_mean < E10 consumption_mean
# One sided test
#sigfnicante level = 0.05

mean_sp98 = df[df['gas_type']=='SP98']['l/km'].mean()
mean_E10 =df[df['gas_type']=='E10']['l/km'].mean()

consumptions_sp98 = list(df[df['gas_type']=='SP98']['l/km'])

t_statistic, p_value = st.ttest_1samp(consumptions_sp98, popmean = mean_E10, alternative = "less")
t_statistic, p_value

p_value > 0.05 # so we can´t reject Null Hipothesis therefore it's very likely that SP98 have greater consumption.
#however our data set is to small:

    #To reeinforce our conclusions we need more data about consumption of vehicles with different types of gas


True

In [4]:
p_value

0.8931361363377243

In [5]:
# we can reenforce our conclusions by doing an hipothesis test

#H0 --> E10 consumption_mean < SP98 consumption_mean
#Ha -->  SP98 consumption_mean >= E10 consumption_mean
# One sided test
#sigfnicante level = 0.05

mean_sp98 = df[df['gas_type']=='SP98']['l/km'].mean()
mean_E10 =df[df['gas_type']=='E10']['l/km'].mean()

consumptions_sp98 = list(df[df['gas_type']=='SP98']['l/km'])
consumptions_E10 = list(df[df['gas_type']=='E10']['l/km'])

import scipy.stats as st
from scipy.stats import ttest_1samp

t_statistic, p_value = st.ttest_1samp(consumptions_E10, popmean = mean_sp98, alternative = "less")
t_statistic, p_value

p_value > 0.05 # so we can´t reject Null Hipothesis therefore it's very likely that SP98 have greater consumption.
#however our data set is to small:

    #To reeinforce our conclusions we need more data about consumption of vehicles with different types of gas

True

In [6]:
#model
#checking correlations
#encoding gas type

def gas_type_encod(x):
    if x =='SP98':
        return 0
    else:
        return 1
    
df['gas_type'] = df['gas_type'].apply(gas_type_encod)
    
df.corr()

# we have very low correlations --> we'll choose the KNN classification model to predict the gas type of a vehicle
#This can be useful in the future for the company if they get some registations about the consumption of other car fleets but don't know the gas_type

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,l/km,temp_ratio,temp_delta
distance,1.0,-0.128967,0.562299,0.075178,0.088175,0.053411,-0.025738,-0.256578,0.042965,-0.080869
consume,-0.128967,1.0,-0.227866,-0.160623,-0.320811,0.015327,0.096591,0.779218,0.167939,0.31099
speed,0.562299,-0.227866,1.0,0.059293,0.015411,0.09736,-0.035408,-0.355759,0.002497,-0.007281
temp_inside,0.075178,-0.160623,0.059293,1.0,0.3595,-0.010198,0.297376,-0.122864,0.028449,-0.226734
temp_outside,0.088175,-0.320811,0.015411,0.3595,1.0,-0.148705,0.167562,-0.080658,-0.524691,-0.990354
gas_type,0.053411,0.015327,0.09736,-0.010198,-0.148705,1.0,-0.105285,-0.044635,0.057871,0.153694
ac,-0.025738,0.096591,-0.035408,0.297376,0.167562,-0.105285,1.0,0.050479,-0.071067,-0.130734
l/km,-0.256578,0.779218,-0.355759,-0.122864,-0.080658,-0.044635,0.050479,1.0,-0.001534,0.065942
temp_ratio,0.042965,0.167939,0.002497,0.028449,-0.524691,0.057871,-0.071067,-0.001534,1.0,0.554386
temp_delta,-0.080869,0.31099,-0.007281,-0.226734,-0.990354,0.153694,-0.130734,0.065942,0.554386,1.0


In [7]:
df['gas_type'].value_counts() # to check class imbalance --> not to imalanced so we wont use methods to treat that

0    228
1    160
Name: gas_type, dtype: int64

In [8]:
#drop redundant columns

df.drop(['distance','consume','temp_ratio'], axis = 1, inplace = True)

In [10]:
#train test split

y = df['gas_type']
X = df.drop(['gas_type'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 19)

#transformer operations
# we only have numerical features so we just perform transformers

transformer = StandardScaler()
transformer.fit(X_train)
x_train_transformed = transformer.transform(X_train)
X_train_transf = pd.DataFrame(x_train_transformed, columns = X_train.columns)
X_train_transf

transformer.fit(X_test)
x_test_transformed = transformer.transform(X_test)
X_test_transf = pd.DataFrame(x_test_transformed, columns = X_test.columns)
X_test_transf


Unnamed: 0,speed,temp_inside,temp_outside,ac,l/km,temp_delta
0,0.403833,3.118434,1.186674,-0.338062,-0.306697,-0.784376
1,0.925915,-0.426703,-0.714254,-0.338062,-0.404074,0.695089
2,-0.118249,0.586194,-1.152929,-0.338062,-0.193090,1.318021
3,1.012929,0.008358,0.016872,-0.338062,-0.136287,-0.016612
4,0.838901,0.079746,0.747998,-0.338062,-0.387845,-0.784376
...,...,...,...,...,...,...
73,0.055778,-0.426703,1.040448,-0.338062,-0.322926,-1.173708
74,0.229805,-0.426703,0.747998,-0.338062,-0.314812,-0.862242
75,-0.727345,0.586194,0.309323,-0.338062,-0.290467,-0.239310
76,2.318134,0.079746,1.040448,-0.338062,-0.452763,-1.095842


In [16]:
#KNN CLASSIFIER MODEL

knn = KNeighborsClassifier(n_neighbors=4, p=2)
knn.fit(X_train_transf, y_train)

y_train_pred = knn.predict(X_train_transf)
y_test_pred = knn.predict(X_test_transf)

print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print("\n")
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))
print("\n")
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))
print("\n")
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The accuracy in the TRAIN set is: 0.745
The accuracy in the TEST  set is: 0.667


The precission in the TRAIN set is: 0.818
The precission in the TEST  set is: 0.650


The recall in the TRAIN set is: 0.492
The recall in the TEST  set is: 0.406


The F1-score for the TRAIN set is 0.61
The F1-score for the TEST set is 0.50


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [None]:
# our model is a litle over fitted and the scores are not good enough to be used
# we for sure need a larger data set so we can train our model better and get better results
# we could try to improve our model by using other scalers, choosing better the features 
    #to be used and change the knn number of neighbors, and also deal with outliers
# However, from last experiences even we do that we will not get a much better result since the data frame size
# we can get more data with web scrapping in websites with usefull information for our purpose
    
    

In [18]:
#KNN CLASSIFIER MODEL

knn = KNeighborsClassifier(n_neighbors=3, p=2)
knn.fit(X_train_transf, y_train)

y_train_pred = knn.predict(X_train_transf)
y_test_pred = knn.predict(X_test_transf)

print("The accuracy in the TRAIN set is: {:.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The accuracy in the TEST  set is: {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print("\n")
print("The precission in the TRAIN set is: {:.3f}".format(precision_score(y_train, y_train_pred, pos_label=1)))
print("The precission in the TEST  set is: {:.3f}".format(precision_score(y_test, y_test_pred, pos_label=1)))
print("\n")
print("The recall in the TRAIN set is: {:.3f}".format(recall_score(y_train, y_train_pred, pos_label=1)))
print("The recall in the TEST  set is: {:.3f}".format(recall_score(y_test,  y_test_pred, pos_label=1)))
print("\n")
print("The F1-score for the TRAIN set is {:.2f}".format(f1_score(y_train,y_train_pred, pos_label=1)))
print("The F1-score for the TEST set is {:.2f}".format(f1_score(y_test,y_test_pred, pos_label=1)))

The accuracy in the TRAIN set is: 0.797
The accuracy in the TEST  set is: 0.705


The precission in the TRAIN set is: 0.773
The precission in the TEST  set is: 0.655


The recall in the TRAIN set is: 0.719
The recall in the TEST  set is: 0.594


The F1-score for the TRAIN set is 0.74
The F1-score for the TEST set is 0.62


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
