### Príprava dát a základné štatistické analýzy súvislostí medzi fajčením a nezamestnanosťou v európskych krajinách

. načítanie potrebných balíčkov a dát

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from scipy.stats import shapiro
import seaborn as sns
import numpy as np
    
# Load the two raw CSV tables; per their file names they hold smoking
# prevalence and unemployment rates.
SMOKING_CSV = "Smoking prevalence by sex.csv"
UNEMPLOYMENT_CSV = "Unemployment rate by age and sex.csv"

data = pd.read_csv(SMOKING_CSV)
data_unempl = pd.read_csv(UNEMPLOYMENT_CSV)

TRANSFORMÁCIE A SPOL

. vymazanie zbytočností

. vyčistenie datasetov s prekrývajúcimi sa hodnotami (duplicitami) v dimenziách AGE, EDUCATION LEVEL, GEOGRAPHY, SMOKING...

. vybratie len rokov prítomných v oboch datasetoch

In [None]:
# Remove columns that are not needed: "freq" and "unit" from the smoking
# table, and only "freq" from the unemployment table.
smoking_redundant = ["freq", "unit"]
unemployment_redundant = ["freq"]

data = data.drop(smoking_redundant, axis=1)
data_unempl = data_unempl.drop(unemployment_redundant, axis=1)

# Show the remaining columns of both tables.
for frame in (data, data_unempl):
    print(frame.columns)

In [3]:
# Harmonise the age label so both tables use the single value "Y15-64"
# (the tables are later merged on "age").
# NOTE(review): rows carrying other age codes are relabelled as "Y15-64" —
# confirm this is acceptable and that rows already labelled "Y15-64" do not
# end up duplicated.
COMMON_AGE_LABEL = "Y15-64"

# unemployment table: only rows labelled "Y15-74" are touched
unempl_is_15_74 = data_unempl["age"] == "Y15-74"
data_unempl.loc[unempl_is_15_74, "age"] = COMMON_AGE_LABEL

# smoking table: only rows labelled "Y_GE15" are touched
smoking_is_ge15 = data["age"] == "Y_GE15"
data.loc[smoking_is_ge15, "age"] = COMMON_AGE_LABEL

In [4]:
# Allowed values for each filtered dimension.
age_filter = ["Y15-64"]
sex_filter = ["F", "M"]
unit_filter = ["PC_POP"]

# The age filter applies to the unemployment table only.
unempl_in_age = data_unempl[data_unempl["age"].isin(age_filter)]

# Countries present in both the smoking table and the age-filtered unemployment table.
geo_filter = list(set(data["geo"]) & set(unempl_in_age["geo"]))

# Smoking table: shared countries, excluding the "EU27_2020" row, sexes F/M only.
data_filtered = data[
    data["geo"].isin(geo_filter)
    & (data["geo"] != "EU27_2020")
    & data["sex"].isin(sex_filter)
]

# Unemployment table: same country and sex rules, plus the "PC_POP" unit only.
data_unempl_filtered = unempl_in_age[
    unempl_in_age["geo"].isin(geo_filter)
    & (unempl_in_age["geo"] != "EU27_2020")
    & unempl_in_age["sex"].isin(sex_filter)
    & unempl_in_age["unit"].isin(unit_filter)
]

In [None]:
# Restrict the unemployment table to the year columns it shares with the
# smoking table. Year columns are recognised by an all-digit column name.
smoking_numeric_columns = [col for col in data_filtered.columns if col.isdigit()]
unempl_numeric_columns = [col for col in data_unempl_filtered.columns if col.isdigit()]

# sorted() yields a deterministic column order (lexicographic, which is
# chronological for four-digit years); a bare set intersection has an
# arbitrary order that can differ between kernel sessions.
common_years = sorted(set(smoking_numeric_columns) & set(unempl_numeric_columns))
data_unempl_filtered = data_unempl_filtered[["geo", "sex", "age"] + common_years]

# Visual check of the columns of both tables; only the unemployment table
# was narrowed in this cell.
print(data_filtered.columns)
print(data_unempl_filtered.columns)



. transformácia "širokej" tabuľky na "dlhú", aby som mohla mať roky tiež v riadkoch ako ostatné premenné/dimenzie

In [6]:
# Reshape both tables from wide (one column per year) to long (one row per year).
survey_years = ["2006", "2009", "2012", "2014", "2017", "2020"]  # year columns to unpivot
key_columns = ["geo", "sex", "age"]  # identifier columns kept as they are

smoking_long = data_filtered.melt(
    id_vars=key_columns,
    value_vars=survey_years,
    var_name="year",  # name of the new column holding the year
    value_name="smoking_rate",  # name of the new column holding the values
)

unemployment_long = data_unempl_filtered.melt(
    id_vars=key_columns,
    value_vars=survey_years,
    var_name="year",
    value_name="unemployment_rate",
)

In [None]:
# sanity check: display the long-format smoking table
smoking_long

In [None]:
# sanity check: display the long-format unemployment table
unemployment_long

In [None]:
# Inner-join the two long tables on their shared key columns.
merge_keys = ["geo", "sex", "age", "year"]
data_merged = smoking_long.merge(unemployment_long, how="inner", on=merge_keys)

# sanity check: display the merged result
data_merged

### Štatistická analýza

In [None]:
# Inspect both rate columns before the analysis: first the number of missing
# values per column, then summary statistics per column
# (unemployment first, smoking second).
inspected_columns = ["unemployment_rate", "smoking_rate"]

for column_name in inspected_columns:
    print(data_merged[column_name].isnull().sum())  # count of NaN values

for column_name in inspected_columns:
    print(data_merged[column_name].describe())  # summary statistics

In [None]:
# Drop rows with a missing rate, then keep only rows where both rates are
# strictly positive.
rows_with_values = data_merged.dropna(subset=["unemployment_rate", "smoking_rate"])
unemployment_positive = rows_with_values["unemployment_rate"].gt(0)
smoking_positive = rows_with_values["smoking_rate"].gt(0)
data_merged_cleaned = rows_with_values[unemployment_positive & smoking_positive]


In [None]:
# Histograms of the distribution of "smoking_rate" and "unemployment_rate",
# one small figure per column: (column, title, x-axis label).
histogram_specs = [
    ("smoking_rate", "Smoking Rate Distribution", "Smoking Rate (%)"),
    ("unemployment_rate", "Unemployment Rate Distribution", "Unemployment Rate (%)"),
]

for hist_column, hist_title, hist_xlabel in histogram_specs:
    plt.figure(figsize=(4, 3))
    plt.hist(data_merged_cleaned[hist_column], bins=15, edgecolor="black", alpha=0.7)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Shapiro-Wilk normality tests on both rate columns
# (if p > 0.05, the data are treated as normally distributed).
stat1, p1 = shapiro(data_merged_cleaned["smoking_rate"])
print(f"Smoking: Shapiro-Wilk Test: Statistics={stat1}, P-value={p1}")

stat2, p2 = shapiro(data_merged_cleaned["unemployment_rate"])
# label typo fixed: "Unemplyment" -> "Unemployment"
print(f"Unemployment: Shapiro-Wilk Test: Statistics={stat2}, P-value={p2}")

In [15]:
# Natural-log transform of "unemployment_rate", stored in the new column
# "log_unemployment_rate". The cleaning step kept only rates > 0, so the
# logarithm is defined for every row.
# assign() builds a new frame instead of writing a column into what may be a
# filtered slice of data_merged, which avoids pandas' SettingWithCopyWarning.
data_merged_cleaned = data_merged_cleaned.assign(
    log_unemployment_rate=np.log(data_merged_cleaned["unemployment_rate"])
)

In [None]:
# Histogram of the log-transformed column "log_unemployment_rate".
plt.figure(figsize=(4, 3))
plt.hist(data_merged_cleaned["log_unemployment_rate"], bins=15, edgecolor="black", alpha=0.7)
# The title now names the log-transformed variable; it used to repeat the
# title of the raw unemployment histogram, making the two plots indistinguishable.
plt.title("Log Unemployment Rate Distribution")
plt.xlabel("LOG_Unemployment Rate (%)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Pearson correlation between log unemployment rate and smoking rate.
log_unemployment_values = data_merged_cleaned["log_unemployment_rate"]
smoking_values = data_merged_cleaned["smoking_rate"]

correlation, p_value = pearsonr(log_unemployment_values, smoking_values)
print(f"Korelácia: {correlation}, P-hodnota: {p_value}")

pozn.:
Existuje štatisticky významná, slabá až stredná pozitívna súvislosť medzi nezamestnanosťou a fajčením – vyššia nezamestnanosť môže byť spojená s vyššou mierou fajčenia.

In [159]:
# Save the cleaned, merged table to CSV without writing the row index.
data_merged_cleaned.to_csv("data_smok_unempl.csv", index=False)

In [None]:
# Scatter plot of smoking rate against log unemployment rate, with a fitted
# regression line on top.
scatter_x = data_merged_cleaned["log_unemployment_rate"]
scatter_y = data_merged_cleaned["smoking_rate"]

plt.figure(figsize=(8, 6))  # figure size

# individual observations
sns.scatterplot(x=scatter_x, y=scatter_y, alpha=0.7, color="blue")

# regression line only (no points, no confidence band)
sns.regplot(
    x=scatter_x,
    y=scatter_y,
    scatter=False,
    color="red",
    ci=None,
    line_kws={"linewidth": 1.5},
)

# Title (reports the correlation and p-value computed earlier) and axis labels.
scatter_title = (
    f"Korelácia medzi nezamestnanosťou a fajčením\n"
    f"Korelácia: {correlation:.2f}, P-hodnota: {p_value:.2e}"
)
plt.title(scatter_title, fontsize=14)
plt.xlabel("Log. podiel nezamestnanosti (%)", fontsize=12)
plt.ylabel("Podiel fajčiarov (%)", fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()

# render the figure
plt.show()

In [None]:
# Aggregate by year: mean smoking rate and mean unemployment rate per year.
data_by_year = data_merged_cleaned.groupby("year")[["smoking_rate", "unemployment_rate"]].mean().reset_index()

# combined figure
plt.figure(figsize=(10, 6))

# line plots for smoking and unemployment
plt.plot(data_by_year["year"], data_by_year["smoking_rate"], label="Podiel fajčiarov (%)", marker="o", linestyle="-", linewidth=2)
plt.plot(data_by_year["year"], data_by_year["unemployment_rate"], label="Podiel nezamestnaných (%)", marker="o", linestyle="--", linewidth=2)

# scatter points marking the exact yearly values
plt.scatter(data_by_year["year"], data_by_year["smoking_rate"], color="blue", alpha=0.7, label="Fajčenie")
plt.scatter(data_by_year["year"], data_by_year["unemployment_rate"], color="orange", alpha=0.7, label="Nezamestnanosť")

# figure settings (title typo fixed: "nezamestnanostnaných" -> "nezamestnaných")
plt.title("Vývoj podielu fajčiarov a nezamestnaných v priebehu rokov", fontsize=14)
plt.xlabel("Rok", fontsize=12)
plt.ylabel("Hodnoty (%)", fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
