In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw obesity and GDP tables.
df_obesity = pd.read_csv("/workspaces/relacao-pib-obesidade/obesity_cleaned.csv", decimal=".")
df_gdp = pd.read_csv("/workspaces/relacao-pib-obesidade/gdp.csv", decimal=".", delimiter=",", thousands=",")

# Data cleaning
# Strip whitespace from the GDP headers first, so that every later column
# lookup on df_gdp (including 'Year' just below) uses the cleaned names.
df_gdp.columns = df_gdp.columns.str.strip()

# Reduce both 'Year' columns to a plain calendar year. This has to happen
# before the merge because 'Year' is one of the join keys.
df_obesity['Year'] = pd.to_datetime(df_obesity['Year'], format='%Y').dt.year
# errors='coerce' turns GDP dates that do not match dd/mm/YYYY into NaT,
# which yields a missing year instead of raising.
df_gdp['Year'] = pd.to_datetime(df_gdp['Year'], format='%d/%m/%Y', errors='coerce').dt.year

# Default (inner) join: only (Country, Year) pairs present in both tables survive.
df = pd.merge(df_obesity, df_gdp, on=['Country', 'Year'])

# Remove ',' and ' ' characters from GDP_pp before converting it to float.
df['GDP_pp'] = df['GDP_pp'].astype(str).str.replace(',', '').str.replace(' ', '').astype(float)

# Keep the first number found in each obesity cell. The fractional part is
# optional so that an integer-valued leading number is captured as well,
# rather than being skipped in favour of a later decimal number in the same
# cell. Cells without any digits become NaN. expand=False returns a Series,
# which is what a single-column assignment expects.
df['Obesity (%)'] = (
    df['Obesity (%)']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

df.drop(columns=['Unnamed: 0', 'Region'], inplace=True)

# Restrict the analysis to a fixed set of countries.
countries_to_include = ["United States", "Brazil", "Russia", "Germany", "United Kingdom", "France", "China"]
df = df[df['Country'].isin(countries_to_include)]

# One row per (Year, Country), holding the mean of each metric; then discard
# rows where any key or metric is still missing.
df = df.groupby(['Year', 'Country'])[['GDP_pp', 'Obesity (%)']].mean().reset_index()
df.dropna(subset=['Country', 'Year', 'Obesity (%)', 'GDP_pp'], inplace=True)

# Ensure df is not empty
if df.empty:
    raise ValueError("The dataframe 'df' is empty after processing. Please check the data and processing steps.")

# Data analysis with ML: linear regression of obesity on GDP per capita.
X = df.loc[:, ['GDP_pp']]      # single feature, selected as a one-column DataFrame
y = df.loc[:, 'Obesity (%)']   # target values

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}", f"R-squared: {r2}", sep="\n")


# Data visualisation: animated bubble chart with one frame per 'Year' value.
gdp_values = df['GDP_pp']
obesity_values = df['Obesity (%)']

fig = px.scatter(
    df,
    x='GDP_pp',
    y='Obesity (%)',
    animation_frame='Year',
    animation_group='Country',
    size='Obesity (%)',
    color='Country',
    hover_name='Country',
    log_x=True,
    size_max=55,
    # Fix both axis ranges to the overall min/max of the plotted columns.
    range_x=[gdp_values.min(), gdp_values.max()],
    range_y=[obesity_values.min(), obesity_values.max()],
    title='Relação entre PIB per Capita e Obesidade ao Longo dos Anos',
)

# Axis titles shown on the rendered figure.
fig.update_layout(xaxis_title_text='PIB per Capita', yaxis_title_text='Obesidade (%)')
fig.show()