# Описание

В работе используются два источника:
1. Главный источник https://www.chinamobil.ru/sales/sales_all/ – в нём содержится помесячная статистика продаж авто по странам; именно эти продажи мы и будем предсказывать
2. Ежегодные экономические показатели стран: население, ВВП на душу населения, средняя зарплата



### Парсим данные продаж авто

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import os
from urllib.parse import urlparse, parse_qs


In [2]:
# Output directory for the generated CSV files, relative to the notebook.
# Built with os.path.join so it resolves on both Windows and POSIX; the
# trailing empty component keeps a trailing path separator, as before.
SAVE_DIR = os.path.join('..', 'data', '')
# Base URL of the monthly sales pages; a '?year=..&mon=..' query is appended.
SALES_STATS_URL = 'https://www.chinamobil.ru/sales/sales_all/'

In [3]:
# Crawl the monthly sales pages, starting at January 2014 and following the
# page's "next month" link until the site no longer offers one. Each table row
# becomes one record: country, year, month, sales.
url = '?year=14&mon=1'
data = []

while True:
    print(f'Requesting URL: {url}')
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(SALES_STATS_URL + url, timeout=30)
    # Fail loudly on an HTTP error: an error page has neither sales rows nor a
    # "next month" link and would otherwise end the crawl with partial data.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")

    # Year and month are taken from the query string of the page just fetched;
    # the site encodes the year as two digits (14 -> 2014).
    query_params = parse_qs(urlparse(url).query)
    page_year = 2000 + int(query_params['year'][0])
    page_month = int(query_params['mon'][0])

    data.extend(
        {
            "country": sales_stat.find('a').text.strip(),
            "year": page_year,
            "month": page_month,
            # Thousands are separated by spaces on the page.
            "sales": int(sales_stat.find('b').text.replace(' ', '')),
        }
        for sales_stat in soup.find_all(attrs={'name': 'salesrow'})
    )

    # The label is plain text (not wrapped in <a>) on the last available month.
    next_month = soup.find(string='Следующий месяц')
    if not next_month or next_month.parent.name != 'a':
        break

    url = next_month.parent.attrs['href']

df = pd.DataFrame(data)

Requesting URL: ?year=14&mon=1
Requesting URL: ?year=14&mon=2
Requesting URL: ?year=14&mon=3
Requesting URL: ?year=14&mon=4
Requesting URL: ?year=14&mon=5
Requesting URL: ?year=14&mon=6
Requesting URL: ?year=14&mon=7
Requesting URL: ?year=14&mon=8
Requesting URL: ?year=14&mon=9
Requesting URL: ?year=14&mon=10
Requesting URL: ?year=14&mon=11
Requesting URL: ?year=14&mon=12
Requesting URL: ?year=15&mon=1
Requesting URL: ?year=15&mon=2
Requesting URL: ?year=15&mon=3
Requesting URL: ?year=15&mon=4
Requesting URL: ?year=15&mon=5
Requesting URL: ?year=15&mon=6
Requesting URL: ?year=15&mon=7
Requesting URL: ?year=15&mon=8
Requesting URL: ?year=15&mon=9
Requesting URL: ?year=15&mon=10
Requesting URL: ?year=15&mon=11
Requesting URL: ?year=15&mon=12
Requesting URL: ?year=16&mon=1
Requesting URL: ?year=16&mon=2
Requesting URL: ?year=16&mon=3
Requesting URL: ?year=16&mon=4
Requesting URL: ?year=16&mon=5
Requesting URL: ?year=16&mon=6
Requesting URL: ?year=16&mon=7
Requesting URL: ?year=16&mon=8
Re

Добавим коды стран к собранному датасету для отображения данных на карте

In [4]:
import gettext
import pycountry
from gettext import gettext as _

In [5]:
from functools import lru_cache

# Manual overrides: lowercase Russian country name -> ISO 3166-1 alpha-3 code.
# They are consulted before the pycountry-based lookup.
name_correction_map = {
    'россия': 'RUS',
    'великобритания': 'GBR',
    'венесуэла': 'VEN',
    'лаос': 'LAO',
    'оаэ': 'ARE',
    'сша': 'USA',
    'тайвань': 'TWN',
    'турция': 'TUR',
    'юар': 'ZAF',
    'южная корея': 'KOR',
}


@lru_cache(maxsize=1)
def _russian_name_index():
    """Return a dict of lowercase Russian country name -> alpha-3 code.

    Built once from pycountry's Russian ``iso3166-1`` translation catalogue
    and cached, instead of reloading the catalogue for every row.
    """
    translate = gettext.translation(
        'iso3166-1', pycountry.LOCALES_DIR, languages=['ru']).gettext
    index = {}
    for english_country in pycountry.countries:
        # setdefault keeps the first country for a given translated name,
        # matching the first-match behaviour of a linear scan.
        index.setdefault(
            translate(english_country.name).lower(), english_country.alpha_3)
    return index


def map_country_code(row):
    """Map a row's Russian country name to an ISO 3166-1 alpha-3 code.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'country'``
            entry holding the Russian country name.

    Returns:
        The alpha-3 code from ``name_correction_map`` if the lowercase name
        is listed there, otherwise the code of the country whose Russian
        translation equals the name, or ``None`` when nothing matches or an
        error occurs.
    """
    try:
        country_name = row['country'].lower()
        corrected = name_correction_map.get(country_name)
        if corrected:
            return corrected
        return _russian_name_index().get(country_name)
    except Exception as e:
        # Deliberately best-effort: one bad row (missing/non-string name,
        # unavailable catalogue) is reported and mapped to None rather than
        # aborting the whole DataFrame.apply.
        print(e)
        return None

In [6]:
# Add an ISO alpha-3 code per row by applying map_country_code row-wise;
# rows whose country name cannot be resolved get no code.
df['iso_alpha'] = df.apply(map_country_code, axis=1) 

In [7]:
# Order rows chronologically within each country; the bare `df` expression
# renders the frame as the cell output.
df = df.sort_values(by=['country', 'year', 'month' ])
df

Unnamed: 0,country,year,month,sales,iso_alpha
15,Австралия,2014,1,82285,AUS
71,Австралия,2014,2,86818,AUS
131,Австралия,2014,3,97267,AUS
191,Австралия,2014,4,80710,AUS
248,Австралия,2014,5,94562,AUS
...,...,...,...,...,...
6757,Япония,2023,5,322886,JPN
6811,Япония,2023,6,387445,JPN
6866,Япония,2023,7,374907,JPN
6921,Япония,2023,8,335873,JPN


In [8]:
# Persist the scraped sales data; with to_csv defaults the DataFrame index is
# written as the first (unnamed) CSV column.
df.to_csv(os.path.join(SAVE_DIR, 'cars_sales.csv'))

### Получаем ежегодные экономические показатели стран из API data.worldbank.org

In [42]:
import requests
import pandas as pd 
import os
import pycountry
import urllib.request
import zipfile
import io

In [73]:
# World Bank bulk-download endpoints, one per indicator. The cells below treat
# each response as a ZIP archive and read the member named "API_<code>...".
# SP.POP.TOTL: population (presumably total population — confirm).
population_url = 'https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv'
# NOTE(review): NY.GDP.MKTP.CD looks like total GDP in current US$, not GDP per
# capita as the notebook description states — confirm the intended indicator.
gdp_url = 'https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.CD?downloadformat=csv'
# NOTE(review): NY.ADJ.NNTY.KD looks like adjusted net national income (a
# country-level total), not an average wage — confirm it is the intended proxy.
average_income_url = 'https://api.worldbank.org/v2/en/indicator/NY.ADJ.NNTY.KD?downloadformat=csv'

In [69]:
# Download the ZIP for the population indicator to a temporary file
# (urlretrieve returns the local path and the response headers).
filehandle, _ = urllib.request.urlretrieve(population_url)
try:
    # Context managers close the archive and its member even if reading fails.
    with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
        # Pick the first archive member carrying the indicator's data prefix.
        population_file_path = next(
            (name for name in zip_file_object.namelist()
             if name.startswith("API_SP.POP")),
            None,
        )
        if population_file_path is None:
            raise FileNotFoundError(
                'No "API_SP.POP*" file found in the downloaded archive')
        with zip_file_object.open(population_file_path) as file:
            # The first four lines precede the CSV header row and are skipped.
            string_lines = [
                line.decode("utf-8") for line in file.readlines()[4:]]
finally:
    # Remove the temporary download left behind by urlretrieve.
    urllib.request.urlcleanup()

population_df = pd.read_csv(io.StringIO(''.join(string_lines)), sep=',')
population_df.info()

# Reshape from wide (one column per year) to long form: one record per
# country and year, keeping the source row order.
data = [
    {
        'Country Name': row['Country Name'],
        'Country Code': row['Country Code'],
        'Year': year,
        'Population': row[str(year)],
    }
    for _, row in population_df.iterrows()
    for year in range(1960, 2023)
]

final_population_df = pd.DataFrame(data)
final_population_df.to_csv(os.path.join(SAVE_DIR, 'countries_populations.csv'))
# info() prints its summary itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None").
final_population_df.info()
final_population_df.head(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 68 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    266 non-null    object 
 1   Country Code    266 non-null    object 
 2   Indicator Name  266 non-null    object 
 3   Indicator Code  266 non-null    object 
 4   1960            264 non-null    float64
 5   1961            264 non-null    float64
 6   1962            264 non-null    float64
 7   1963            264 non-null    float64
 8   1964            264 non-null    float64
 9   1965            264 non-null    float64
 10  1966            264 non-null    float64
 11  1967            264 non-null    float64
 12  1968            264 non-null    float64
 13  1969            264 non-null    float64
 14  1970            264 non-null    float64
 15  1971            264 non-null    float64
 16  1972            264 non-null    float64
 17  1973            264 non-null    flo

Unnamed: 0,Country Name,Country Code,Year,Population
0,Aruba,ABW,1960,54608.0
1,Aruba,ABW,1961,55811.0
2,Aruba,ABW,1962,56682.0
3,Aruba,ABW,1963,57475.0
4,Aruba,ABW,1964,58178.0
5,Aruba,ABW,1965,58782.0
6,Aruba,ABW,1966,59291.0
7,Aruba,ABW,1967,59522.0
8,Aruba,ABW,1968,59471.0
9,Aruba,ABW,1969,59330.0


In [72]:
# Download the ZIP for the GDP indicator to a temporary file
# (urlretrieve returns the local path and the response headers).
filehandle, _ = urllib.request.urlretrieve(gdp_url)
try:
    # Context managers close the archive and its member even if reading fails.
    with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
        # Pick the first archive member carrying the indicator's data prefix.
        gdp_file_path = next(
            (name for name in zip_file_object.namelist()
             if name.startswith("API_NY.GDP")),
            None,
        )
        if gdp_file_path is None:
            raise FileNotFoundError(
                'No "API_NY.GDP*" file found in the downloaded archive')
        with zip_file_object.open(gdp_file_path) as file:
            # The first four lines precede the CSV header row and are skipped.
            string_lines = [
                line.decode("utf-8") for line in file.readlines()[4:]]
finally:
    # Remove the temporary download left behind by urlretrieve.
    urllib.request.urlcleanup()

gdp_df = pd.read_csv(io.StringIO(''.join(string_lines)), sep=',')

# Reshape from wide (one column per year) to long form: one record per
# country and year, keeping the source row order.
data = [
    {
        'Country Name': row['Country Name'],
        'Country Code': row['Country Code'],
        'Year': year,
        'GDP': row[str(year)],
    }
    for _, row in gdp_df.iterrows()
    for year in range(1960, 2023)
]

final_gdp_df = pd.DataFrame(data)
final_gdp_df.to_csv(os.path.join(SAVE_DIR, 'countries_gdp.csv'))
# info() prints its summary itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None").
final_gdp_df.info()
final_gdp_df.head(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  16758 non-null  object 
 1   Country Code  16758 non-null  object 
 2   Year          16758 non-null  int64  
 3   GDP           13200 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 523.8+ KB
None


Unnamed: 0,Country Name,Country Code,Year,GDP
0,Aruba,ABW,1960,
1,Aruba,ABW,1961,
2,Aruba,ABW,1962,
3,Aruba,ABW,1963,
4,Aruba,ABW,1964,
5,Aruba,ABW,1965,
6,Aruba,ABW,1966,
7,Aruba,ABW,1967,
8,Aruba,ABW,1968,
9,Aruba,ABW,1969,


In [79]:
# Download the ZIP for the income indicator to a temporary file
# (urlretrieve returns the local path and the response headers).
filehandle, _ = urllib.request.urlretrieve(average_income_url)
try:
    # Context managers close the archive and its member even if reading fails.
    with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
        # Pick the first archive member carrying the indicator's data prefix.
        average_income_file_path = next(
            (name for name in zip_file_object.namelist()
             if name.startswith("API_NY.ADJ")),
            None,
        )
        if average_income_file_path is None:
            raise FileNotFoundError(
                'No "API_NY.ADJ*" file found in the downloaded archive')
        with zip_file_object.open(average_income_file_path) as file:
            # The first four lines precede the CSV header row and are skipped.
            string_lines = [
                line.decode("utf-8") for line in file.readlines()[4:]]
finally:
    # Remove the temporary download left behind by urlretrieve.
    urllib.request.urlcleanup()

average_income_df = pd.read_csv(io.StringIO(''.join(string_lines)), sep=',')

# Reshape from wide (one column per year) to long form: one record per
# country and year, keeping the source row order.
# NOTE(review): this range ends at 2021, while the population and GDP cells
# read years through 2022 — confirm the difference is intentional.
data = [
    {
        'Country Name': row['Country Name'],
        'Country Code': row['Country Code'],
        'Year': year,
        'Average Income': row[str(year)],
    }
    for _, row in average_income_df.iterrows()
    for year in range(1960, 2022)
]

final_average_income_df = pd.DataFrame(data)
final_average_income_df.to_csv(os.path.join(SAVE_DIR, 'countries_average_income.csv'))
# info() prints its summary itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None").
final_average_income_df.info()
final_average_income_df.tail(50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16492 entries, 0 to 16491
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    16492 non-null  object 
 1   Country Code    16492 non-null  object 
 2   Year            16492 non-null  int64  
 3   Average Income  6006 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 515.5+ KB
None


Unnamed: 0,Country Name,Country Code,Year,Average Income
16442,Zimbabwe,ZWE,1972,
16443,Zimbabwe,ZWE,1973,
16444,Zimbabwe,ZWE,1974,
16445,Zimbabwe,ZWE,1975,
16446,Zimbabwe,ZWE,1976,
16447,Zimbabwe,ZWE,1977,
16448,Zimbabwe,ZWE,1978,
16449,Zimbabwe,ZWE,1979,
16450,Zimbabwe,ZWE,1980,
16451,Zimbabwe,ZWE,1981,
