In [None]:
# Mount Google Drive into the Colab filesystem so files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Flatten, Dense, Embedding, Conv1D, Dropout, GlobalMaxPooling1D, BatchNormalization, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import re
import nltk

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm, trange
# Print the installed TensorFlow version.
print(tf.__version__)

2.8.0


In [None]:
# Load the raw export from Drive.
# NOTE(review): the header and rows are ';'-separated (see the single combined column name
# used by the following cells), so the default ',' separator produces one string column that
# the parsing cell later splits by hand. Passing sep=';' would parse the fields directly, but
# the downstream cells that index the combined column would have to change with it.
df = pd.read_csv('/content/drive/MyDrive/practise/details.csv')

In [None]:
# Show the first raw record: every field is packed into one ';'-separated string.
df['filename;fare;length;width;height;speed;n_axles;frml_p_os;frml_mej_os'][0]

'20.10.2020_19.55.33_O670AY196.png;79;15884.4;2940.44;3950;6.96469;5;5.66 3.30 3.40 2.78 3.10;3.484 1.301 5.953 1.267  (1569.125)'

In [None]:
# Preview the first rows of the raw, single-column frame.
df.head()

Unnamed: 0,filename;fare;length;width;height;speed;n_axles;frml_p_os;frml_mej_os
0,20.10.2020_19.55.33_O670AY196.png;79;15884.4;2...
1,20.10.2020_20.29.19_O504AH196.png;78;16037;292...
2,20.10.2020_20.35.44_B411TT89.png;68;15467;2850...
3,20.10.2020_20.38.20_X221CO750.png;49;15249;304...
4,20.10.2020_23.12.27_A905CO750.png;48;12256;287...


In [None]:
# info is a method: it has to be called to print the summary (index range, columns,
# non-null counts, dtypes, memory usage). A bare `df.info` only shows the bound-method repr.
df.info()

<bound method DataFrame.info of       filename;fare;length;width;height;speed;n_axles;frml_p_os;frml_mej_os
0      20.10.2020_19.55.33_O670AY196.png;79;15884.4;2...                   
1      20.10.2020_20.29.19_O504AH196.png;78;16037;292...                   
2      20.10.2020_20.35.44_B411TT89.png;68;15467;2850...                   
3      20.10.2020_20.38.20_X221CO750.png;49;15249;304...                   
4      20.10.2020_23.12.27_A905CO750.png;48;12256;287...                   
...                                                  ...                   
63845  19.04.2021_09.24.19_B674MK89.png;67;15032.3;27...                   
63846  19.04.2021_09.25.22_B312CC89.png;45;16649.8;33...                   
63847  19.04.2021_09.27.48_T548MX55.png;48;17289.2;31...                   
63848  19.04.2021_09.28.49_C888MA55.png;49;16388;2746...                   
63849  19.04.2021_09.30.02_T545MX55.png;49;16525.8;28...                   

[63850 rows x 1 columns]>

In [None]:
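'''
filename - file name (not needed for this task)
fare - the correct fare (tariff) for the vehicle
length, width, height - vehicle length, width and height
speed - vehicle speed (most likely has no influence and is not needed for this experiment)
n_axles - number of axles (in other words, how many rows of wheels)
frml_p_os - inter-axle distance (for example, a passenger car has two axles, and the inter-axle
            distance is the distance between the front and the rear wheels);
            a space-separated list of floating-point values
frml_mej_os - weight on each axle (the sum gives the total weight of the vehicle);
              a space-separated list of floating-point values
              (followed by a number in parentheses at the end, which must be discarded)

NOTE(review): in the sample rows frml_p_os holds n_axles values and frml_mej_os holds
n_axles - 1 values (plus the parenthesised number), which suggests the two descriptions
above may be swapped (per-axle weight vs. inter-axle distance). Confirm with the data owner.
'''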

In [None]:
# Split every raw ';'-separated record into typed fields and rebuild `df` from them.
# Note: this cell rebinds `df`, so it can only be re-run after the CSV has been reloaded.

# Name of the single column that holds the unsplit records.
RAW_COLUMN = 'filename;fare;length;width;height;speed;n_axles;frml_p_os;frml_mej_os'
# Column order of the parsed frame.
COLUMN_ORDER = ['fare', 'length', 'width', 'height', 'n_axles', 'frml_p_os', 'frml_mej_os', 'speed']

parsed = {name: [] for name in COLUMN_ORDER}
for record in df[RAW_COLUMN]:
    fields = record.split(';')  # fields[0] is the file name and is not used

    # Correct fare for the vehicle (task target); kept as a string.
    parsed['fare'].append(fields[1])
    # Scalar measurements: length, width, height, speed (probably not needed), number of axles.
    parsed['length'].append(float(fields[2]))
    parsed['width'].append(float(fields[3]))
    parsed['height'].append(float(fields[4]))
    parsed['speed'].append(float(fields[5]))
    parsed['n_axles'].append(float(fields[6]))

    # Space-separated float list (described in the field notes as inter-axle distances).
    parsed['frml_p_os'].append(np.asarray([float(token) for token in fields[7].split()]))
    # Space-separated float list (described in the field notes as per-axle weights);
    # the last token is always dropped - it is expected to be the parenthesised number.
    parsed['frml_mej_os'].append(np.asarray([float(token) for token in fields[8].split()[:-1]]))

df = pd.DataFrame(columns=COLUMN_ORDER)
for name in COLUMN_ORDER:
    df[name] = parsed[name]

In [None]:
# Preview the parsed frame: one column per field, list-valued fields as NumPy arrays.
df.head()

Unnamed: 0,fare,length,width,height,n_axles,frml_p_os,frml_mej_os,speed
0,79,15884.4,2940.44,3950.0,5.0,"[5.66, 3.3, 3.4, 2.78, 3.1]","[3.484, 1.301, 5.953, 1.267]",6.96469
1,78,16037.0,2927.0,3789.0,6.0,"[5.66, 3.72, 3.8, 2.4, 2.56, 2.64]","[3.593, 1.341, 4.844, 1.329, 1.353]",8.61038
2,68,15467.0,2850.0,4033.0,5.0,"[5.16, 2.7, 2.82, 2.78, 2.68]","[3.635, 1.367, 5.182, 1.335]",7.15867
3,49,15249.0,3047.0,4022.0,4.0,"[5.92, 3.92, 3.4, 3.5]","[3.762, 5.512, 1.288]",10.7562
4,48,12256.0,2876.0,4020.0,5.0,"[6.02, 4.22, 1.5, 1.66, 1.78]","[3.85, 3.579, 1.386, 1.387]",8.75799


In [None]:
# (rows, columns) of the parsed frame.
df.shape

(63850, 8)

In [None]:
# Distinct fare labels (strings) present in the data.
fare_list = df['fare'].unique()
fare_list

array(['79', '78', '68', '49', '48', '56', '55', '5', '4', '7', '77',
       '47', '57', '69', '8', '58', '67', '2', '99', '45', '46', '59',
       '9', '710', '3', '3X', '8X', '510', '35', '6', '6X', '44', '9X',
       '4X', '5X', '7X', '1', '911', 'XX'], dtype=object)

In [None]:
# Number of distinct fare classes.
len(df['fare'].unique())

39

In [None]:
# For each of the two list-valued columns, record how many values every row holds.
titles = ['frml_p_os', 'frml_mej_os']
frml_p_os_count, frml_mej_os_count = (
    [len(values) for values in df[title]] for title in titles
)

# Longest list found in each column.
print(max(frml_p_os_count))
print(max(frml_mej_os_count))

15
14


In [None]:
# Disabled exploratory snippet, kept inside a string literal so it does not execute:
# it would print the first frml_mej_os entry that holds exactly 7 values.
'''
for i in range(df.shape[0]):
  if len(df['frml_mej_os'][i]) == 7:
    print(df['frml_mej_os'][i])
    break 
'''

In [None]:
# Shared padding settings: append zeros to short lists, cut long lists at the end.
pad_options = dict(dtype='float64', padding='post', truncating='post', value=0.0)

# Bring both list-valued columns to a fixed width (8 and 7 values respectively).
# Rows that hold more values than maxlen lose the extras because truncating='post';
# the length check above reported maxima of 15 and 14.
padded_p_os = pad_sequences(df['frml_p_os'], maxlen=8, **pad_options)
padded_mej_os = pad_sequences(df['frml_mej_os'], maxlen=7, **pad_options)

In [None]:
# Turn each padded 2-D array into a list of 1-D row arrays (one per DataFrame row),
# so that every row can be stored in a single DataFrame cell.
padded_p_os_list = [row for row in padded_p_os]
padded_mej_os_list = [row for row in padded_mej_os]

In [None]:
# Overwrite the variable-length columns with their fixed-width, zero-padded versions.
df['frml_p_os'] = padded_p_os_list
df['frml_mej_os'] = padded_mej_os_list

In [None]:
# Print the Python type stored in the first row of every column.
for column in ('fare', 'length', 'width', 'height', 'n_axles', 'frml_p_os', 'frml_mej_os', 'speed'):
    print(type(df[column][0]))

<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>


In [None]:
# Preview the frame after padding the list-valued columns.
df.head()

Unnamed: 0,fare,length,width,height,n_axles,frml_p_os,frml_mej_os,speed
0,79,15884.4,2940.44,3950.0,5.0,"[5.66, 3.3, 3.4, 2.78, 3.1, 0.0, 0.0, 0.0]","[3.484, 1.301, 5.953, 1.267, 0.0, 0.0, 0.0]",6.96469
1,78,16037.0,2927.0,3789.0,6.0,"[5.66, 3.72, 3.8, 2.4, 2.56, 2.64, 0.0, 0.0]","[3.593, 1.341, 4.844, 1.329, 1.353, 0.0, 0.0]",8.61038
2,68,15467.0,2850.0,4033.0,5.0,"[5.16, 2.7, 2.82, 2.78, 2.68, 0.0, 0.0, 0.0]","[3.635, 1.367, 5.182, 1.335, 0.0, 0.0, 0.0]",7.15867
3,49,15249.0,3047.0,4022.0,4.0,"[5.92, 3.92, 3.4, 3.5, 0.0, 0.0, 0.0, 0.0]","[3.762, 5.512, 1.288, 0.0, 0.0, 0.0, 0.0]",10.7562
4,48,12256.0,2876.0,4020.0,5.0,"[6.02, 4.22, 1.5, 1.66, 1.78, 0.0, 0.0, 0.0]","[3.85, 3.579, 1.386, 1.387, 0.0, 0.0, 0.0]",8.75799


In [None]:
# Flatten every row into one feature list:
# [length, width, height, n_axles, *frml_p_os, *frml_mej_os]  (speed is left out).
scalar_rows = df[['length', 'width', 'height', 'n_axles']].to_numpy()
data_list = [
    [*scalars, *p_os, *mej_os]
    for scalars, p_os, mej_os in zip(scalar_rows, df['frml_p_os'], df['frml_mej_os'])
]

# Result frame: one feature list per row next to its fare label.
df_res = pd.DataFrame(columns=['data', 'fare'])
df_res['data'] = data_list
df_res['fare'] = df['fare']

In [None]:
# Length of the first flattened feature vector.
len(df_res['data'][0])

19

In [None]:
# Convert every per-row feature list into a NumPy array.
# The column is replaced in a single assignment instead of writing through chained indexing
# (`df_res['data'][i] = ...`), which triggers SettingWithCopyWarning and does not modify
# the frame at all when pandas Copy-on-Write is enabled.
df_res['data'] = [np.array(features) for features in df_res['data']]

In [None]:
# Features: a Series holding one feature vector per row. Target: fare labels as a NumPy array.
X = df_res['data']
y = df_res['fare'].values

In [None]:
# train_test_split is already imported in the imports cell; this re-import is redundant but harmless.
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Inspect the training features (a Series of per-row arrays, indexed by original row number).
X_train

26192    [5418.56, 2649.88, 2511.0, 2.0, 2.98, 3.32, 0....
61144    [5504.0, 2821.0, 3118.0, 3.0, 3.86, 1.82, 1.72...
21490    [13554.1, 2828.78, 3334.0, 5.0, 5.66, 6.12, 5....
53722    [16934.8, 2799.22, 3502.0, 5.0, 4.66, 3.32, 3....
31413    [6087.76, 2424.72, 2649.0, 2.0, 1.4, 1.76, 0.0...
                               ...                        
62570    [12093.0, 2893.0, 3388.0, 6.0, 5.04, 3.74, 3.2...
38158    [16594.0, 2937.0, 3390.0, 5.0, 4.58, 2.88, 2.7...
860      [13370.0, 2940.0, 3426.0, 5.0, 5.54, 5.82, 6.1...
15795    [13336.6, 2878.07, 3587.0, 5.0, 5.32, 5.9, 6.1...
56422    [18471.4, 2947.02, 3940.0, 6.0, 6.36, 4.34, 3....
Name: data, Length: 57465, dtype: object

In [None]:
# Sanity check: features and labels must have matching row counts in each split.
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(57465,) (6385,)
(57465,) (6385,)


In [None]:
# One-hot encode the fare labels.
# The encoder is fitted on every label in `y` instead of on the training split only, so that
# (a) a rare fare class that ends up solely in the test split does not make `transform`
#     raise on an unknown category, and
# (b) the number of output columns always equals the number of classes in the data set,
#     independent of how the split was drawn.
encode = OneHotEncoder()
encode.fit(y.reshape(-1, 1))

# OneHotEncoder expects a 2-D input, hence the reshape to a single column.
training_labels = encode.transform(y_train.reshape(-1, 1))
validation_labels = encode.transform(y_test.reshape(-1, 1))

In [None]:
#encode.inverse_transform(training_labels[0].reshape(1, -1))

In [None]:
# The encoder output is converted to dense NumPy arrays for the models below.
# The hasattr guard keeps this cell re-runnable: on a second run the labels are already
# ndarrays, which have no .toarray() method.
if hasattr(training_labels, 'toarray'):
    training_labels = training_labels.toarray()
if hasattr(validation_labels, 'toarray'):
    validation_labels = validation_labels.toarray()

# Confirm the container types of labels and features.
print(type(training_labels))
print(type(validation_labels))

print(type(X_train))
print(type(X_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [None]:
# Stack the per-row feature arrays of each split into a dense 2-D float matrix.
# NOTE(review): the feature vectors built earlier have 19 entries, so maxlen=20 appends one
# constant zero column; the models below are declared with 20 inputs to match.
stack_options = dict(maxlen=20, dtype='float64', padding='post', truncating='post', value=0.0)

X_train = pad_sequences(X_train, **stack_options)
X_test = pad_sequences(X_test, **stack_options)

In [None]:
# First training feature vector after stacking.
X_train[0]

array([5.41856e+03, 2.64988e+03, 2.51100e+03, 2.00000e+00, 2.98000e+00,
       3.32000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 3.98600e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00])

In [None]:
# One-hot label of the first training row.
training_labels[0]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [None]:
# Baseline classifier: one hidden layer over the 20 input features, 39 fare classes out.
model = Sequential()
model.add(Dense(64, input_dim=20, activation='relu'))
#model.add(Dropout(0.1))
# Softmax (not sigmoid) on the output: the targets are one-hot vectors of mutually exclusive
# classes and the loss is categorical cross-entropy, which expects the outputs to form a
# probability distribution over the classes.
model.add(Dense(39, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1344      
                                                                 
 dense_1 (Dense)             (None, 39)                2535      
                                                                 
Total params: 3,879
Trainable params: 3,879
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Checkpoint that saves the model with the best performance on the validation set
# (lowest val_loss); only the best model is kept at file_path.
file_path = "/content/drive/MyDrive/best_model_practise/model_best_val.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [None]:
# Learning-rate schedule and stopping rules, all driven by the validation metrics
# (10% of the training rows are set aside by validation_split).
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=50, min_lr=0.0001)
stop_on_loss = EarlyStopping(monitor='val_loss', mode='min', patience=500, verbose=1)
stop_on_accuracy = EarlyStopping(monitor='val_accuracy', mode='max', patience=500, verbose=1)

# The epoch count is only an upper bound; the early-stopping callbacks end the run.
history = model.fit(
    X_train,
    training_labels,
    batch_size=64,
    epochs=30000,
    validation_split=0.1,
    callbacks=[lr_schedule, stop_on_loss, stop_on_accuracy, checkpoint],
)

In [None]:
from tensorflow.keras.optimizers import SGD

In [None]:
# Second, deeper classifier: two hidden layers with light dropout, softmax over 39 fare classes.
# The optimizer is selected in the compile cell, so no optimizer object is built here.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=20))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(39, activation='softmax'))

model.summary()

In [None]:
# Compile the model for one-hot multi-class targets, using the Adamax optimizer
# with its default settings and tracking accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='Adamax',
              metrics=['accuracy'])

In [None]:
# Checkpoint that saves the model with the best performance on the validation set
# (lowest val_loss).
# NOTE(review): this is the same file path as the checkpoint of the first model, so training
# this model overwrites the earlier model's saved file - confirm that is intended.
file_path = "/content/drive/MyDrive/best_model_practise/model_best_val.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [None]:
# Learning-rate schedule and stopping rules, all driven by the validation metrics
# (10% of the training rows are set aside by validation_split).
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=50, min_lr=0.0001)
stop_on_loss = EarlyStopping(monitor='val_loss', mode='min', patience=500, verbose=1)
stop_on_accuracy = EarlyStopping(monitor='val_accuracy', mode='max', patience=500, verbose=1)

# The epoch count is only an upper bound; the early-stopping callbacks end the run.
history = model.fit(
    X_train,
    training_labels,
    batch_size=128,
    epochs=30000,
    validation_split=0.1,
    callbacks=[lr_schedule, stop_on_loss, stop_on_accuracy, checkpoint],
)

In [None]:
#score = model.evaluate(x_test, y_test, batch_size=128)

In [None]:
# Random forest classifier (scikit-learn RandomForestClassifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest baseline. Only the settings that differ from scikit-learn's defaults are
# spelled out, plus:
#   random_state=42 - makes the bootstrap sampling (and therefore the score) reproducible,
#                     using the same seed as the train/test split;
#   n_jobs=-1       - builds the 500 trees on all available cores (does not affect results).
# max_features=20 equals the width of the stacked feature matrix, i.e. every split
# considers all features.
clf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=300,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features=20,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the forest on the stacked feature matrix and the one-hot label matrix.
# NOTE(review): with a 2-D one-hot target scikit-learn presumably treats this as a
# multi-output problem (one binary output per class) rather than a single 39-class
# problem; fitting on the 1-D y_train labels (and scoring on y_test) is the usual setup
# - confirm and change both cells together.
clf.fit(X_train, training_labels)

RandomForestClassifier(criterion='entropy', max_depth=300, max_features=20,
                       min_samples_leaf=10, min_samples_split=20,
                       n_estimators=500)

In [None]:
# Mean accuracy on the held-out split, measured against the one-hot label matrix.
# NOTE(review): for 2-D indicator targets this is presumably subset accuracy (a row counts
# as correct only if all 39 outputs match), which may understate the model - confirm.
clf.score(X_test, validation_labels)

0.5428347689898199