In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime, timedelta

# Load the raw dataset from a CSV file addressed relative to the working directory.
data = pd.read_csv("dataset.csv")

# Parse the last-maintenance column into datetime values so date arithmetic can be applied to it.
data['last_maintenance_date'] = pd.to_datetime(data['last_maintenance_date'])
# Reference moment for the "days remaining" computation further down.
# It is read from the wall clock, so derived values depend on when this cell is run.
current_date = datetime.now()

def calculate_maintenance_interval(row):
    """Return the maintenance interval, in days, for one record.

    Args:
        row: Any object supporting ``row['mileage']`` (for example a
            DataFrame row or a dict) whose mileage is comparable to a number.

    Returns:
        int: 365 when mileage is below 100000, 270 when it is below 200000,
        and 180 otherwise.
    """
    mileage = row['mileage']

    # (exclusive upper mileage bound, interval in days), checked in ascending order.
    tiers = (
        (100000, 365),
        (200000, 270),
    )
    for upper_bound, interval_days in tiers:
        if mileage < upper_bound:
            return interval_days

    # Anything that matched no tier gets the shortest interval.
    return 180

# Derive the per-row maintenance interval and the number of whole days left
# until the next maintenance (relative to `current_date`), then keep only the
# rows whose next maintenance is still ahead.
data = (
    data
    .assign(
        maintenance_interval_days=lambda frame: frame.apply(
            calculate_maintenance_interval, axis=1
        )
    )
    .assign(
        days_to_next_to=lambda frame: (
            frame['last_maintenance_date']
            + pd.to_timedelta(frame['maintenance_interval_days'], unit='d')
            - current_date
        ).dt.days
    )
    .loc[lambda frame: frame['days_to_next_to'] > 0]
)

# Predictor columns and the regression target.
X = data.loc[:, ['year_of_manufacture', 'mileage']]
y = data.loc[:, 'days_to_next_to']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base estimator; the fixed seed keeps the fitted forest reproducible.
model = RandomForestRegressor(random_state=42)

# Hyper-parameter grid searched exhaustively by GridSearchCV.
param_grid = {
    'n_estimators': [100, 200, 300],  # number of trees in the forest
    'max_depth': [10, 20, 30, None],  # maximum depth of a tree (None = unlimited)
    'min_samples_split': [2, 5, 10],  # minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # minimum number of samples required at a leaf
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # fails parameter validation. For RandomForestRegressor it meant "use all
    # features", which 1.0 expresses on both old and new versions.
    'max_features': [1.0, 'sqrt', 'log2']  # number of features considered per split
}

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Estimator refitted on the whole training set with the best parameter combination.
best_model = grid_search.best_estimator_


# For a regressor, `score` returns the coefficient of determination (R^2) on the given data.
print(f"Оценка модели на тестовой выборке: {best_model.score(X_test, y_test)}")

def predict_next_maintenance(user_data):
    """Predict the recommended date of the next maintenance for one vehicle.

    Args:
        user_data: Mapping with the keys 'year_of_manufacture', 'mileage' and
            'last_maintenance_date' (any value ``pd.to_datetime`` accepts).

    Returns:
        datetime.date: 'last_maintenance_date' shifted forward by the number
        of days predicted by the module-level ``best_model``.
    """
    # NOTE(review): the training target ('days_to_next_to') is counted from the
    # training-time `current_date`, while the prediction here is added to the
    # last maintenance date — confirm that this offset base is intended.
    feature_names = ('year_of_manufacture', 'mileage')
    features = pd.DataFrame([{name: user_data[name] for name in feature_names}])

    # The model is queried with a single-row frame, so take the only prediction.
    days_ahead = best_model.predict(features)[0]

    base_date = pd.to_datetime(user_data['last_maintenance_date'])
    return (base_date + pd.to_timedelta(days_ahead, unit='d')).date()


In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Predictions of the selected model on the held-out test features.
y_pred = best_model.predict(X_test)

# Regression quality metrics computed on the test predictions.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = square root of the mean squared error
r2 = r2_score(y_test, y_pred)

# Report the metrics; the messages are in Russian and label MAE and RMSE in days.
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f} дней")
print(f"Среднеквадратичная ошибка (RMSE): {rmse:.2f} дней")
print(f"Коэффициент детерминации (R²): {r2:.4f}")

Средняя абсолютная ошибка (MAE): 61.24 дней
Среднеквадратичная ошибка (RMSE): 74.27 дней
Коэффициент детерминации (R²): 0.2240


In [39]:
# Example input describing a single vehicle.
user_data = {
    'year_of_manufacture': 2018,
    'mileage': 150000,
    'last_maintenance_date': '2024-01-01'
}

# Forecast for the user-supplied data
predicted_date = predict_next_maintenance(user_data)
print(f"Рекомендованная дата ТО для введенных данных: {predicted_date}")

Рекомендованная дата ТО для введенных данных: 2024-03-31


In [41]:
import joblib
# Persist the selected estimator to 'model.pkl' (path relative to the working directory).
# NOTE(review): joblib files are pickle-based — only load them back from trusted sources.
joblib.dump(best_model, 'model.pkl')



['model.pkl']