## Импорт библиотек

In [1]:
import pandas as pd

import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import joblib

## Читаем датасет и оставляем фичи для дальнейшей работы

In [2]:
# Load the first source table of car listings from disk.
df1 = pd.read_csv(filepath_or_buffer='data/used_cars.csv')

In [3]:
# Columns of the first dataset that are kept for modelling
# ('price' is the value the model is later trained to predict).
features = [
    'brand',
    'model',
    'model_year',
    'milage',
    'fuel_type',
    'engine',
    'ext_col',
    'price',
]

In [4]:
# Keep only the selected columns. The explicit copy makes df1 an independent
# frame, so later column assignments on df1 do not trigger pandas'
# SettingWithCopyWarning.
df1 = df1[features].copy()

In [5]:
# Load the second source table of car listings from disk.
df2 = pd.read_csv(filepath_or_buffer='data/cars_data_VERSION2.csv')

In [6]:
# Remove every column of the second dataset that is not used further on
# (ratings, seller and address details, identifiers, MPG, interior, etc.).
df2 = df2.drop(
    columns=[
        'Used/New',
        'ConsumerRating',
        'ConsumerReviews',
        'SellerType',
        'SellerName',
        'SellerRating',
        'SellerReviews',
        'StreetName',
        'State',
        'Zipcode',
        'DealType',
        'ComfortRating',
        'InteriorDesignRating',
        'PerformanceRating',
        'ValueForMoneyRating',
        'ExteriorStylingRating',
        'ReliabilityRating',
        'MinMPG',
        'MaxMPG',
        'VIN',
        'Stock#',
        'InteriorColor',
        'Transmission',
        'Drivetrain',
    ],
)

In [7]:
# Give the second dataset the same column names as the first one so that
# both frames can be stacked.
df2 = df2.rename(
    columns={
        'Make': 'brand',
        'Model': 'model',
        'Year': 'model_year',
        'Mileage': 'milage',
        'FuelType': 'fuel_type',
        'Engine': 'engine',
        'ExteriorColor': 'ext_col',
        'Price': 'price',
    },
)

In [8]:
# Scale the second dataset's mileage down by a factor of 100.
# NOTE(review): the unit this is meant to produce is not evident from the
# code; confirm it matches the scale used for df1's mileage.
df2['milage'] = df2['milage'].div(100)

In [9]:
# Convert the textual mileage into a number.
#
# The last four characters (presumably a unit suffix such as ' mi.') are cut
# off and the remaining digits are parsed. ',' is treated as a thousands
# separator and removed, then the value is divided by 1000 and multiplied by
# 1.61 (presumably miles -> kilometres, i.e. the result is in thousands of
# km - TODO confirm).
#
# The previous version replaced ',' with '.', which gave the same result for
# values like '51,000' but left values without a separator (e.g. '800')
# unscaled and failed on values with two separators (e.g. '1,234,567').
# `assign` returns a new frame, so nothing is written into a slice.
df1 = df1.assign(
    milage=(
        df1['milage']
        .str[:-4]
        .str.replace(',', '', regex=False)
        .astype(float)
        / 1000
        * 1.61
    ),
)

In [10]:
# Stack both sources into one frame.
# NOTE(review): each input keeps its own row labels, so the index of the
# result is not unique; code that aligns on the index must account for that.
dataset = pd.concat(objs=[df1, df2])

In [11]:
# Store the model year as a 64-bit integer.
dataset['model_year'] = dataset['model_year'].astype('int64')

In [12]:
# Keep only cars whose model year is strictly later than 2000.
dataset = dataset[dataset['model_year'].gt(2000)]

## Предобработка данных

In [13]:
# Shorten the model-year column name to 'year'.
dataset = dataset.rename({'model_year': 'year'}, axis='columns')

In [14]:
# Normalise the fuel-type labels:
#   1. every label listed below is rewritten to 'Gasoline';
#   2. 'Plug-In Hybrid' is rewritten to 'Electric';
#   3. missing values are filled with 'Gasoline'.
# NOTE(review): the list also contains electric and diesel labels that end up
# as 'Gasoline' - confirm this grouping is intended.
fuel = (
    dataset.fuel_type
    .replace(
        [
            '–',
            'not supported',
            'Electric Fuel System',
            'Gasoline/Mild Electric Hybrid',
            'Flex Fuel Capability',
            'Flexible Fuel',
            'Plug-In Electric/Gas',
            'Diesel Fuel',
            'Gasoline Fuel',
        ],
        'Gasoline',
    )
    .replace(['Plug-In Hybrid'], 'Electric')
    .fillna('Gasoline')
)
dataset['fuel_type'] = fuel

In [15]:
# Replace the free-text engine description with the engine displacement in
# litres, taken from the first "<digit>.<digit>L" token in the text.
#
# Changes compared with the manual loop this replaces:
#   * the parsed values keep dataset's own index, so each row receives the
#     value parsed from its own description (a fresh 0..n-1 index would be
#     aligned against dataset's filtered, non-unique labels and mix rows up);
#   * missing descriptions yield NaN instead of raising inside re.search;
#   * the pattern is a raw string (no invalid escape sequences);
#   * the fill value is the median of the successfully parsed values only,
#     not a median that also counts the "not found" placeholders.
engine = dataset['engine'].str.extract(r'(\d\.\d)L', expand=False).astype(float)
engine = engine.fillna(engine.median())
dataset['engine'] = engine

In [16]:
# Reduce the exterior colours to a small fixed set: the 12 most frequent
# colours of the first dataset stay as they are, every other value found in
# the combined data is rewritten to 'Black'. The column is then renamed
# from 'ext_col' to 'color'.
selected_colors = df1['ext_col'].value_counts().index[:12]
all_colors = dataset['ext_col'].unique()

colors_to_drop = [name for name in all_colors if name not in selected_colors]

color = dataset['ext_col'].replace(colors_to_drop, 'Black')
dataset['ext_col'] = color

dataset = dataset.rename({'ext_col': 'color'}, axis='columns')

In [17]:
# Drop listings without a price. The explicit copy makes the filtered frame
# independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
dataset = dataset.loc[dataset['price'] != 'Not Priced'].copy()

# Turn the price text into a float: the first character (presumably a
# currency sign) is skipped and the ',' separators are removed.
dataset['price'] = dataset['price'].map(lambda x: float(x[1:].replace(',', '')))

## Строим модель



In [18]:
# One-hot encode the categorical columns; indicator columns are floats.
df_for_model = pd.get_dummies(data=dataset, dtype=float)

In [19]:
# Split the encoded frame into the feature matrix and the target (price).
y = df_for_model.loc[:, 'price']
X = df_for_model.drop(columns=['price'])

In [20]:
# Random-forest regressor: 100 trees, squared-error split criterion,
# n_jobs=-1 for parallel training.
# NOTE(review): no random_state is given, so the fitted model may differ
# between runs - confirm whether reproducibility matters here.
clf = RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    n_jobs=-1,
)

In [21]:
# Train on the complete feature matrix and target.
# NOTE(review): no hold-out split or evaluation is done in the visible code.
clf.fit(X=X, y=y)

In [22]:
# Persist the fitted model with the strongest compression level.
joblib.dump(value=clf, filename='model.joblib', compress=9)

['model.joblib']

## Создаем шаблон для передачи в нашу модель 

In [23]:
# Take the first two rows of the feature matrix as a template that carries
# the column layout the model was trained on.
from_sample = X.head(2)

In [24]:
# Persist the template rows with the strongest compression level.
joblib.dump(value=from_sample, filename='from_sample.joblib', compress=9)

['from_sample.joblib']

In [81]:
# Save the cleaned dataset (the row index is written as the first column).
dataset.to_csv(path_or_buf='data/dataset.csv')

## Сохраняем данные для формы на сайте

In [None]:
# Option lists for the web form.
#
# Each list is built from the index of value_counts(), which always holds the
# category labels. Selecting a column by name after reset_index() only
# returns the labels on pandas >= 2.0; on older versions that column contains
# the counts. The resulting Series keep the column's name and a 0..n-1 index
# in order of decreasing frequency.

# Brands, sorted alphabetically.
brand = pd.Series(dataset['brand'].value_counts().index, name='brand').sort_values()

# Model
# dataset.groupby('brand').count()

# NOTE(review): the range starts at 2000 although the dataset only keeps
# years later than 2000, and the upper bound is hard-coded - confirm.
year = list(range(2000, 2025))

# Fuel types, most frequent first.
fuel_type = pd.Series(dataset['fuel_type'].value_counts().index, name='fuel_type')

# Engine volumes, sorted ascending.
engine = pd.Series(dataset['engine'].value_counts().index, name='engine').sort_values()

# Colours, most frequent first.
color = pd.Series(dataset['color'].value_counts().index, name='color')