## В данном ноутбуке обучаем несколько моделей
* Первая будет предсказывать население
* Вторая предсказывает ВВП
* Третья рост цен
* Четвертая предсказывает по всем получаенным данным продажи на рынке автомобилей.

In [142]:
from sklearn.preprocessing import LabelEncoder
import pycountry
import numpy as np


iso_encoder = LabelEncoder()
iso_encoder.fit([country.alpha_3 for country in pycountry.countries])
np.save('../models/iso_encoder_classes.npy', iso_encoder.classes_)

In [143]:
import pandas as pd 
import plotly.express as px

In [144]:
df = pd.read_csv('..\data\countries_populations.csv', index_col=0)
df.head()

Unnamed: 0,country_name,iso_alpha,year,value
0,Aruba,ABW,1960,54608.0
1,Aruba,ABW,1961,55811.0
2,Aruba,ABW,1962,56682.0
3,Aruba,ABW,1963,57475.0
4,Aruba,ABW,1964,58178.0


In [145]:
px.choropleth(
    df, 
    locations="iso_alpha",  
    color="value", 
    hover_name="country_name", 
    animation_frame="year", 
    color_continuous_scale=px.colors.sequential.Plasma,  
    projection="natural earth" 
)

In [146]:
training_df = df[['iso_alpha', 'year', 'value']].copy()
training_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13545 entries, 0 to 13544
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iso_alpha  13545 non-null  object 
 1   year       13545 non-null  int64  
 2   value      13515 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 423.3+ KB


In [147]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


iso_encoder = LabelEncoder()
iso_encoder.classes_ = np.load('..\models\iso_encoder_classes.npy')

training_df['iso_alpha'] = iso_encoder.transform(training_df['iso_alpha'])

In [148]:
training_df = training_df.dropna()
training_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13515 entries, 0 to 13544
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iso_alpha  13515 non-null  int32  
 1   year       13515 non-null  int64  
 2   value      13515 non-null  float64
dtypes: float64(1), int32(1), int64(1)
memory usage: 369.6 KB


In [149]:
X = training_df[['iso_alpha', 'year']]
y = training_df['value']

In [150]:
training_df

Unnamed: 0,iso_alpha,year,value
0,0,1960,54608.0
1,0,1961,55811.0
2,0,1962,56682.0
3,0,1963,57475.0
4,0,1964,58178.0
...,...,...,...
13540,248,2018,15052184.0
13541,248,2019,15354608.0
13542,248,2020,15669666.0
13543,248,2021,15993524.0


In [151]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=25)

In [152]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)


LinearRegression()

In [153]:

from sklearn.metrics import r2_score

r2 = r2_score(y_train, model.predict(X_train))
r2

0.006733263802691236

In [154]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pickle
import datetime

In [155]:
statistics_data_files = ['..\data\countries_consumer_price.csv', '..\data\countries_populations.csv', '..\data\countries_gdp.csv']


In [156]:

def get_model_by_statsdata(statistics_name: str, stats_dataframe: pd.DataFrame, iso_encoder: LabelEncoder):
    training_df = stats_dataframe[['iso_alpha', 'year', 'value']].copy()

    training_df['iso_alpha'] = iso_encoder.transform(training_df['iso_alpha'])
    training_df = training_df.dropna()

    X = training_df[['iso_alpha', 'year']]
    y = training_df['value']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=10, test_size=10)

    model = LinearRegression()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return {
        "statistics_name": statistics_name,
        "training_time": datetime.datetime.utcnow().isoformat(),
        "r2_score": r2_score(y_test, y_pred),
        "model": model,
    }

In [157]:
from pprint import pprint
from sklearn.preprocessing import LabelEncoder
import numpy as np
from pathlib import Path

iso_encoder = LabelEncoder()
iso_encoder.classes_ = np.load('..\models\iso_encoder_classes.npy')

results = []
for path in statistics_data_files:
    df = pd.read_csv(path)
    df_name = Path(path).stem
    training_results = get_model_by_statsdata(df_name, df, iso_encoder)
    results.append(training_results)
    pprint(training_results)
    pickle.dump(training_results["model"], open(f'..\models\{df_name}.sav', 'wb'))

{'model': LinearRegression(),
 'r2_score': 0.5745582619657086,
 'statistics_name': 'countries_consumer_price',
 'training_time': '2024-01-11T02:43:04.150291'}
{'model': LinearRegression(),
 'r2_score': -0.0018626433804764808,
 'statistics_name': 'countries_populations',
 'training_time': '2024-01-11T02:43:04.167544'}
{'model': LinearRegression(),
 'r2_score': -325.9532569018656,
 'statistics_name': 'countries_gdp',
 'training_time': '2024-01-11T02:43:04.182662'}


Теперь предскажем недостающие значения у статистических датасетов, и построи модель предсказания продаж автомобилей

In [158]:
from pprint import pprint
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

iso_encoder = LabelEncoder()
iso_encoder.classes_ = np.load('..\models\iso_encoder_classes.npy')

In [194]:
car_sales_df = pd.read_csv('..\data\cars_sales.csv', index_col=0)
car_sales_df.head()

Unnamed: 0,year,month,sales,iso_alpha
24,2014,1,40489,ARE
82,2014,2,38774,ARE
142,2014,3,40539,ARE
201,2014,4,39702,ARE
260,2014,5,41801,ARE


In [195]:
car_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7023 entries, 24 to 6991
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   year       7023 non-null   int64 
 1   month      7023 non-null   int64 
 2   sales      7023 non-null   int64 
 3   iso_alpha  7023 non-null   object
dtypes: int64(3), object(1)
memory usage: 274.3+ KB


In [196]:
car_sales_countries = car_sales_df['iso_alpha'].unique()
car_sales_countries

array(['ARE', 'ARG', 'AUS', 'AUT', 'BEL', 'BGR', 'BLR', 'BRA', 'CAN',
       'CHE', 'CHL', 'CHN', 'COL', 'CZE', 'DEU', 'DNK', 'EGY', 'ESP',
       'EST', 'FIN', 'FRA', 'GBR', 'GRC', 'HRV', 'HUN', 'IDN', 'IND',
       'IRL', 'IRN', 'ISR', 'ITA', 'JPN', 'KAZ', 'KOR', 'KWT', 'LAO',
       'LUX', 'MEX', 'MMR', 'MYS', 'NLD', 'NOR', 'NZL', 'OMN', 'PAK',
       'PHL', 'POL', 'PRT', 'ROU', 'RUS', 'SAU', 'SGP', 'SVK', 'SVN',
       'SWE', 'THA', 'TUR', 'TWN', 'UKR', 'URY', 'USA', 'UZB', 'VEN',
       'VNM', 'ZAF'], dtype=object)

In [197]:
car_sales_years = car_sales_df['year'].unique()
car_sales_years

array([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023],
      dtype=int64)

In [198]:
statistics_data = [
    {
        'name': 'consumer_price',
        "model": pickle.load(open('..\models\countries_consumer_price.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_consumer_price.csv', index_col=0)
    },
    {
        'name': 'populations',
        "model": pickle.load(open('..\models\countries_populations.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_populations.csv', index_col=0)
    },
    {
        'name': 'gdp',
        "model": pickle.load(open('..\models\countries_gdp.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_gdp.csv', index_col=0)
    }
]

In [199]:
datamodel = statistics_data[1]
df = datamodel['data']
model = datamodel['model']
name = datamodel['name']

In [200]:
car_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7023 entries, 24 to 6991
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   year       7023 non-null   int64 
 1   month      7023 non-null   int64 
 2   sales      7023 non-null   int64 
 3   iso_alpha  7023 non-null   object
dtypes: int64(3), object(1)
memory usage: 274.3+ KB


In [201]:
car_sales_df[name] = np.nan

In [202]:

car_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7023 entries, 24 to 6991
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         7023 non-null   int64  
 1   month        7023 non-null   int64  
 2   sales        7023 non-null   int64  
 3   iso_alpha    7023 non-null   object 
 4   populations  0 non-null      float64
dtypes: float64(1), int64(3), object(1)
memory usage: 329.2+ KB


In [203]:
for country_code in car_sales_countries:
    if len(df[df['iso_alpha'] == country_code][~df[df['iso_alpha'] == country_code].isna().any(axis=1)])==0:
        continue
    for year in car_sales_years:
        sales_data = car_sales_df[(car_sales_df['year'] == year) & (car_sales_df['iso_alpha'] == country_code)].sort_values(by=['month'], ascending=True)
        if len(sales_data) < 1:
            continue

        values = df[(df['iso_alpha'] == country_code) & (df['year'] == year)]['value'].values
        if  len(values) != 1:
            pred_data = pd.DataFrame([{'iso_alpha': country_code, 'year': year }]).reindex_like(df[['iso_alpha', 'year']]).dropna()
            pred_data['iso_alpha'] = iso_encoder.transform(pred_data['iso_alpha'])
            values = model.predict(pred_data)
        car_sales_df.at[sales_data.index[0], name] =  values[0] 

                    


In [204]:
car_sales_df[car_sales_df['iso_alpha']=='AUS'].tail(25)

Unnamed: 0,year,month,sales,iso_alpha,populations
5632,2021,9,82942,AUS,
5690,2021,10,74295,AUS,
5746,2021,11,80359,AUS,
5804,2021,12,78141,AUS,
5860,2022,1,75468,AUS,26005540.0
5914,2022,2,84809,AUS,
5972,2022,3,100623,AUS,
6031,2022,4,80509,AUS,
6089,2022,5,93753,AUS,
6147,2022,6,99245,AUS,


In [170]:
for country_code in car_sales_countries:
    if len(car_sales_df[car_sales_df['iso_alpha'] == country_code][~car_sales_df[car_sales_df['iso_alpha'] == country_code].isna().any(axis=1)])==0:
        continue
    car_sales_df[car_sales_df['iso_alpha'] == country_code] = car_sales_df[car_sales_df['iso_alpha'] == country_code].interpolate(method='linear',
                                                                                                                                limit_direction='both',
                                                                                                                                limit=100)

In [171]:
car_sales_df[car_sales_df['iso_alpha']=='AUS'].tail(25)

Unnamed: 0,year,month,sales,iso_alpha,populations
5632,2021,9,82942,AUS,25898830.0
5690,2021,10,74295,AUS,25925510.0
5746,2021,11,80359,AUS,25952190.0
5804,2021,12,78141,AUS,25978860.0
5860,2022,1,75468,AUS,26005540.0
5914,2022,2,84809,AUS,27633740.0
5972,2022,3,100623,AUS,29261940.0
6031,2022,4,80509,AUS,30890140.0
6089,2022,5,93753,AUS,32518340.0
6147,2022,6,99245,AUS,34146540.0


In [172]:
car_sales_df = car_sales_df.dropna()

In [173]:
car_sales_df[car_sales_df['iso_alpha']=='AUS']

Unnamed: 0,year,month,sales,iso_alpha,populations
15,2014,1,82285,AUS,2.347569e+07
71,2014,2,86818,AUS,2.350405e+07
131,2014,3,97267,AUS,2.353240e+07
191,2014,4,80710,AUS,2.356076e+07
248,2014,5,94562,AUS,2.358912e+07
...,...,...,...,...,...
6767,2023,5,104987,AUS,4.554393e+07
6820,2023,6,123938,AUS,4.554393e+07
6877,2023,7,96352,AUS,4.554393e+07
6928,2023,8,109381,AUS,4.554393e+07


In [174]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

In [175]:
iso_encoder = LabelEncoder()
iso_encoder.classes_ = np.load('..\models\iso_encoder_classes.npy')

In [176]:
statistics_data = [
    {
        'name': 'consumer_price',
        "model": pickle.load(open('..\models\countries_consumer_price.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_consumer_price.csv', index_col=0)
    },
    {
        'name': 'populations',
        "model": pickle.load(open('..\models\countries_populations.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_populations.csv', index_col=0)
    },
    {
        'name': 'gdp',
        "model": pickle.load(open('..\models\countries_gdp.sav', 'rb')),
        "data": pd.read_csv('..\data\countries_gdp.csv', index_col=0)
    }
]

In [177]:
car_sales_df = pd.read_csv('..\data\cars_sales.csv', index_col=0)

In [178]:
def build_car_sales_dataset(carsales_df: pd.DataFrame, statistics_datamodel: dict, iso_encoder: LabelEncoder) -> pd.DataFrame:
    sales_dataset = carsales_df.copy()
    car_sales_countries = sales_dataset['iso_alpha'].unique()
    car_sales_years = sales_dataset['year'].unique()

    df = statistics_datamodel['data']
    model = statistics_datamodel['model']
    name = statistics_datamodel['name']

    sales_dataset[name] = np.nan

    for country_code in car_sales_countries:
        if len(df[df['iso_alpha'] == country_code][~df[df['iso_alpha'] == country_code].isna().any(axis=1)]) == 0:
            continue
        for year in car_sales_years:
            sales_data = sales_dataset[(sales_dataset['year'] == year) & (
                sales_dataset['iso_alpha'] == country_code)].sort_values(by=['month'], ascending=True)
            if len(sales_data) < 1:
                continue

            values = df[(df['iso_alpha'] == country_code) &
                        (df['year'] == year)]['value'].values
            if len(values) != 1:
                pred_data = pd.DataFrame([{'iso_alpha': country_code, 'year': year}])
                pred_data['iso_alpha'] = iso_encoder.transform(
                    pred_data['iso_alpha'])
                values = model.predict(pred_data)
            sales_dataset.at[sales_data.index[0], name] = values[0]

    for country_code in car_sales_countries:
        if len(sales_dataset[sales_dataset['iso_alpha'] == country_code][~sales_dataset[sales_dataset['iso_alpha'] == country_code].isna().any(axis=1)]) == 0:
            continue
        sales_dataset[sales_dataset['iso_alpha'] == country_code] = sales_dataset[sales_dataset['iso_alpha'] == country_code].interpolate(method='linear',
                                                                                                                                limit_direction='both',
                                                                                                                                limit=50)

    return sales_dataset.dropna()

In [179]:
for datamodel in statistics_data:
    car_sales_df = build_car_sales_dataset(car_sales_df, datamodel, iso_encoder)

In [180]:
car_sales_df[car_sales_df['iso_alpha']=='AUS'].tail(25)

Unnamed: 0,year,month,sales,iso_alpha,consumer_price,populations,gdp
5632,2021,9,82942,AUS,142.754663,25898830.0,1648316000000.0
5690,2021,10,74295,AUS,145.065047,25925510.0,1659476000000.0
5746,2021,11,80359,AUS,147.375431,25952190.0,1670636000000.0
5804,2021,12,78141,AUS,149.685815,25978860.0,1681796000000.0
5860,2022,1,75468,AUS,151.996199,26005540.0,1692957000000.0
5914,2022,2,84809,AUS,152.302321,27633740.0,1582891000000.0
5972,2022,3,100623,AUS,152.608443,29261940.0,1472825000000.0
6031,2022,4,80509,AUS,152.914565,30890140.0,1362759000000.0
6089,2022,5,93753,AUS,153.220687,32518340.0,1252694000000.0
6147,2022,6,99245,AUS,153.526809,34146540.0,1142628000000.0


In [181]:
car_sales_df.to_csv('..\data\cars_sales_training.csv')

На основе полученного датасета обучим модель для предсказания продаж машин

In [182]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

In [183]:
car_sales_df = pd.read_csv('..\data\cars_sales_training.csv', index_col=0)

car_sales_df

Unnamed: 0,year,month,sales,iso_alpha,consumer_price,populations,gdp
24,2014,1,40489,ARE,105.072324,8.835951e+06,4.141054e+11
82,2014,2,38774,ARE,105.428692,8.842697e+06,4.104529e+11
142,2014,3,40539,ARE,105.785059,8.849442e+06,4.068004e+11
201,2014,4,39702,ARE,106.141426,8.856188e+06,4.031479e+11
260,2014,5,41801,ARE,106.497793,8.862934e+06,3.994954e+11
...,...,...,...,...,...,...,...
6774,2023,5,41102,ZAF,203.324153,2.799135e+07,4.456446e+11
6829,2023,6,44538,ZAF,203.324153,2.799135e+07,4.456446e+11
6883,2023,7,41558,ZAF,203.324153,2.799135e+07,4.456446e+11
6937,2023,8,43502,ZAF,203.324153,2.799135e+07,4.456446e+11


In [184]:
iso_encoder = LabelEncoder()
iso_encoder.classes_ = np.load('..\models\iso_encoder_classes.npy')

In [185]:
X = car_sales_df.drop(columns=['sales'])
X['iso_alpha'] = iso_encoder.transform(X['iso_alpha'])
y = car_sales_df['sales']

In [186]:
X

Unnamed: 0,year,month,iso_alpha,consumer_price,populations,gdp
24,2014,1,7,105.072324,8.835951e+06,4.141054e+11
82,2014,2,7,105.428692,8.842697e+06,4.104529e+11
142,2014,3,7,105.785059,8.849442e+06,4.068004e+11
201,2014,4,7,106.141426,8.856188e+06,4.031479e+11
260,2014,5,7,106.497793,8.862934e+06,3.994954e+11
...,...,...,...,...,...,...
6774,2023,5,246,203.324153,2.799135e+07,4.456446e+11
6829,2023,6,246,203.324153,2.799135e+07,4.456446e+11
6883,2023,7,246,203.324153,2.799135e+07,4.456446e+11
6937,2023,8,246,203.324153,2.799135e+07,4.456446e+11


In [187]:
y

24      40489
82      38774
142     40539
201     39702
260     41801
        ...  
6774    41102
6829    44538
6883    41558
6937    43502
6991    43880
Name: sales, Length: 6789, dtype: int64

In [188]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=25)

In [189]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=1)

n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=10)]
max_depth = [int(x) for x in np.linspace(1, 100, num=5)]
max_depth.append(None)
max_leaf_nodes = [int(i) for i in range(2, 100)]

param_grid = {
    "criterion": ["squared_error", "friedman_mse"],
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_leaf_nodes': max_leaf_nodes,
}

rf_cv = RandomizedSearchCV(
    rfr,
    param_grid,
    scoring="r2",
    cv=5,
    verbose=0,
    n_jobs=-1
)


search = rf_cv.fit(X_train, y_train)


One or more of the test scores are non-finite: [       nan 0.95566884 0.76499363 0.95334097        nan        nan
        nan 0.94559645        nan 0.86092776]



In [190]:
search.best_params_


{'n_estimators': 455,
 'max_leaf_nodes': 94,
 'max_depth': 25,
 'criterion': 'friedman_mse'}

In [191]:
model = RandomForestRegressor(random_state=1, **search.best_params_)
model.fit(X_train, y_train)
r2_score(y_test, model.predict(X_test))

0.9645853939480112

In [192]:
pickle.dump(model, open(f'..\models\car_sales.sav', 'wb'))