In [1]:
# Let's first load and inspect the dataset to understand its structure and begin preprocessing.
import pandas as pd

# Load the dataset
file_path = '/mnt/data/Consumo_cerveja.csv'
df = pd.read_csv('Consumo_cerveja.csv')

# Show the first few rows of the dataset to get an overview
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Data                         365 non-null    object 
 1   Temperatura Media (C)        365 non-null    object 
 2   Temperatura Minima (C)       365 non-null    object 
 3   Temperatura Maxima (C)       365 non-null    object 
 4   Precipitacao (mm)            365 non-null    object 
 5   Final de Semana              365 non-null    float64
 6   Consumo de cerveja (litros)  365 non-null    float64
dtypes: float64(2), object(5)
memory usage: 51.6+ KB


(         Data Temperatura Media (C) Temperatura Minima (C)  \
 0  2015-01-01                  27,3                   23,9   
 1  2015-01-02                 27,02                   24,5   
 2  2015-01-03                 24,82                   22,4   
 3  2015-01-04                 23,98                   21,5   
 4  2015-01-05                 23,82                     21   
 
   Temperatura Maxima (C) Precipitacao (mm)  Final de Semana  \
 0                   32,5                 0              0.0   
 1                   33,5                 0              0.0   
 2                   29,9                 0              1.0   
 3                   28,6               1,2              1.0   
 4                   28,3                 0              0.0   
 
    Consumo de cerveja (litros)  
 0                       25.461  
 1                       28.972  
 2                       30.814  
 3                       29.799  
 4                       28.900  ,
 None)

In [2]:
# Preprocessing

# Step 1: Replace commas with periods in numeric columns and convert to float
numeric_columns = ['Temperatura Media (C)', 'Temperatura Minima (C)', 'Temperatura Maxima (C)', 'Precipitacao (mm)']
df[numeric_columns] = df[numeric_columns].replace({',': '.'}, regex=True).astype(float)

# Step 2: Check for missing values
missing_values = df.isnull().sum()

# Dropping rows with missing values for simplicity (since the majority of rows are complete)
df_cleaned = df.dropna()

# Convert the 'Data' column to datetime format
df_cleaned['Data'] = pd.to_datetime(df_cleaned['Data'], format='%Y-%m-%d')

# Display the cleaned data and missing value summary
df_cleaned.head(), missing_values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Data'] = pd.to_datetime(df_cleaned['Data'], format='%Y-%m-%d')


(        Data  Temperatura Media (C)  Temperatura Minima (C)  \
 0 2015-01-01                  27.30                    23.9   
 1 2015-01-02                  27.02                    24.5   
 2 2015-01-03                  24.82                    22.4   
 3 2015-01-04                  23.98                    21.5   
 4 2015-01-05                  23.82                    21.0   
 
    Temperatura Maxima (C)  Precipitacao (mm)  Final de Semana  \
 0                    32.5                0.0              0.0   
 1                    33.5                0.0              0.0   
 2                    29.9                0.0              1.0   
 3                    28.6                1.2              1.0   
 4                    28.3                0.0              0.0   
 
    Consumo de cerveja (litros)  
 0                       25.461  
 1                       28.972  
 2                       30.814  
 3                       29.799  
 4                       28.900  ,
 Data      

In [3]:
# Summary statistics and correlation matrix for EDA

# Step 1: Summary statistics
summary_stats = df_cleaned.describe()

# Step 2: Correlation matrix to understand the relationships
correlation_matrix = df_cleaned.corr()

summary_stats, correlation_matrix


(                      Data  Temperatura Media (C)  Temperatura Minima (C)  \
 count                  365             365.000000              365.000000   
 mean   2015-07-02 00:00:00              21.226356               17.461370   
 min    2015-01-01 00:00:00              12.900000               10.600000   
 25%    2015-04-02 00:00:00              19.020000               15.300000   
 50%    2015-07-02 00:00:00              21.380000               17.900000   
 75%    2015-10-01 00:00:00              23.280000               19.600000   
 max    2015-12-31 00:00:00              28.860000               24.500000   
 std                    NaN               3.180108                2.826185   
 
        Temperatura Maxima (C)  Precipitacao (mm)  Final de Semana  \
 count              365.000000         365.000000       365.000000   
 mean                26.611507           5.196712         0.284932   
 min                 14.500000           0.000000         0.000000   
 25%            

In [4]:
# Importing necessary libraries for building a neural network model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Selecting features (X) and target (y)
X = df_cleaned[['Temperatura Media (C)', 'Temperatura Minima (C)', 'Temperatura Maxima (C)', 'Precipitacao (mm)', 'Final de Semana']]
y = df_cleaned['Consumo de cerveja (litros)']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a neural network model using MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)

# Predictions and evaluation
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Evaluate the model using Mean Squared Error (MSE) and R-squared (R²)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

train_mse, test_mse, train_r2, test_r2




(5.06268442858314, 6.241977065445645, 0.7237133130595612, 0.7183106119161324)