In [18]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the data using read_html
url = "https://uk.wikipedia.org/wiki/%D0%9D%D0%B0%D1%80%D0%BE%D0%B4%D0%B6%D1%83%D0%B2%D0%B0%D0%BD%D1%96%D1%81%D1%82%D1%8C_%D0%B2_%D0%A3%D0%BA%D1%80%D0%B0%D1%97%D0%BD%D1%96"
tables = pd.read_html(url)

# Select the desired table
df = tables[0]

# Rename columns for corresponding years
columns_rename = {str(year): f"Рік_{year}" for year in range(1950, 2020)}
df.rename(columns=columns_rename, inplace=True)

# Replace "—" with NaN
df.replace("—", pd.NA, inplace=True)

# Convert non-numeric columns to numeric
non_numeric_columns = df.columns[df.dtypes == object]
df[non_numeric_columns] = df[non_numeric_columns].apply(pd.to_numeric, errors="coerce")

# Calculate missing percentages
missing_percentages = df.isnull().sum() / df.shape[0] * 100
print("Missing percentages in each column:")
print(missing_percentages)

# Remove rows for the entire country and the last row
df = df.iloc[:-1, :]

# Fill missing values with column means
df.fillna(df.mean(), inplace=True)

# Get the list of regions with birth rates higher than the mean in 2019
year_column = 'Рік_2019'
regions_higher_than_average = df[df[year_column] > df[year_column].mean()]['Регіон'].tolist()
print(f"Regions with birth rates higher than the mean in {year_column}:")
print(regions_higher_than_average)

# Find the region with the highest birth rate in 2014
region_highest_birthrate_2014 = df[df['Рік_2014'] == df['Рік_2014'].max()]['Регіон'].values[0]
print("Region with the highest birth rate in 2014:", region_highest_birthrate_2014)

# Build a bar chart of birth rates by regions in 2019
df.plot(kind='bar', x='Регіон', y=year_column, figsize=(12, 6))
plt.title(f"Birth Rates by Regions in {year_column}")
plt.xlabel("Region")
plt.ylabel("Birth Rate")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


Missing percentages in each column:
Рік         100.000000
1860        100.000000
1896        100.000000
1906        100.000000
1913        100.000000
1925         66.666667
1940         66.666667
Рік_1950      0.000000
Рік_1955     33.333333
Рік_1960      0.000000
Рік_1965     33.333333
Рік_1970      0.000000
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(df.mean(), inplace=True)


KeyError: 'Рік_2019'