# In [None]:

# Treinamento e Avaliação de Modelo - Previsão da Nota de Matemática

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the student performance dataset from the CSV in the working directory.
df = pd.read_csv("StudentsPerformance.csv")

# Split the frame into the regression target ("math score") and the
# predictors (every remaining column).
y = df["math score"]
X = df.drop(columns=["math score"])

# One-hot encode every object-dtype predictor, dropping the first level of
# each feature; columns that are not listed are passed through unchanged.
categorical_cols = X.select_dtypes(include="object").columns.to_list()
one_hot = OneHotEncoder(drop="first")
preprocessor = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_cols)],
    remainder="passthrough",
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the encoder on the training rows only, then apply the already-fitted
# encoder to the test rows.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# Train an ordinary least-squares linear regression on the encoded features.
model = LinearRegression()
model.fit(X_train_enc, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test_enc)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. The square root gives the same value on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

# Visual comparison: draw the actual and the predicted scores as two lines
# over the test-sample position, on one explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
for series, legend_label in ((y_test.values, 'Real'), (y_pred, 'Previsto')):
    sns.lineplot(x=range(len(series)), y=series, label=legend_label, ax=ax)
ax.set_title("Notas Reais vs Previstas - Matemática")
ax.set_xlabel("Amostras")
ax.set_ylabel("Nota")
ax.legend()
fig.tight_layout()
plt.show()
