In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# read the dataset
df = pd.read_csv('../input/Consumo_cerveja.csv')

In [3]:
df.shape

(941, 7)

In [4]:
df.isna().sum() # find the number of missing values in the dataset

Data                           576
Temperatura Media (C)          576
Temperatura Minima (C)         576
Temperatura Maxima (C)         576
Precipitacao (mm)              576
Final de Semana                576
Consumo de cerveja (litros)    576
dtype: int64

In [5]:
""" Since there are a lot of missing values as compared to the number of observations we drop the missing rows """
df = df.dropna()

In [6]:
df.shape

(365, 7)

In [7]:
df.head()

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.9


In [8]:
# We need to clean the dataset by replacing the ',' by '.' and convert it to float
df['Temperatura Media (C)'] = df['Temperatura Media (C)'].str.replace(',', '.').astype(float)
df['Temperatura Maxima (C)'] = df['Temperatura Maxima (C)'].str.replace(',', '.').astype(float)
df['Temperatura Minima (C)'] = df['Temperatura Minima (C)'].str.replace(',', '.').astype(float)
df['Precipitacao (mm)'] = df['Precipitacao (mm)'].str.replace(',', '.').astype(float)

In [9]:
df.head()

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,27.3,23.9,32.5,0.0,0.0,25.461
1,2015-01-02,27.02,24.5,33.5,0.0,0.0,28.972
2,2015-01-03,24.82,22.4,29.9,0.0,1.0,30.814
3,2015-01-04,23.98,21.5,28.6,1.2,1.0,29.799
4,2015-01-05,23.82,21.0,28.3,0.0,0.0,28.9


In [10]:
# Drop the data column as it is irrelevent to the dataset
df.drop(['Data'], 1, inplace = True)

In [11]:
# save the cleaned data
# df.to_csv('cleaned_beer.csv')

In [12]:
df.head()

Unnamed: 0,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,27.3,23.9,32.5,0.0,0.0,25.461
1,27.02,24.5,33.5,0.0,0.0,28.972
2,24.82,22.4,29.9,0.0,1.0,30.814
3,23.98,21.5,28.6,1.2,1.0,29.799
4,23.82,21.0,28.3,0.0,0.0,28.9


In [13]:
import statsmodels.api as sm

In [14]:
X = df.iloc[:, :-1].astype(float)
y = df['Consumo de cerveja (litros)']

In [15]:
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Consumo de cerveja (litros),R-squared:,0.991
Model:,OLS,Adj. R-squared:,0.991
Method:,Least Squares,F-statistic:,7620.0
Date:,"Sat, 02 Feb 2019",Prob (F-statistic):,0.0
Time:,08:55:49,Log-Likelihood:,-851.48
No. Observations:,365,AIC:,1713.0
Df Residuals:,360,BIC:,1732.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Temperatura Media (C),0.1192,0.202,0.590,0.555,-0.278,0.516
Temperatura Minima (C),0.1146,0.117,0.977,0.329,-0.116,0.345
Temperatura Maxima (C),0.7313,0.102,7.179,0.000,0.531,0.932
Precipitacao (mm),-0.0552,0.011,-5.112,0.000,-0.076,-0.034
Final de Semana,5.4816,0.289,18.989,0.000,4.914,6.049

0,1,2,3
Omnibus:,20.752,Durbin-Watson:,1.721
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9.729
Skew:,-0.175,Prob(JB):,0.00771
Kurtosis:,2.281,Cond. No.,85.8


From the above summary we can observe that the p-values of the independent variable are 0.555, 0.329, 0, 0, 0 and so we can eliminate the columns that have a high p-value.

In [16]:
# final dataset for applying regression
df_final = df.iloc[:, [2, 3, 4, 5]]

In [17]:
X = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# splitting into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [20]:
from sklearn.linear_model import Ridge

In [21]:
rr = Ridge(alpha=1)

In [22]:
# fitting ridge regression to the dataset
rr.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [23]:
# Predicting the values of the consumption of beer and saving it in y_pred
y_pred = rr.predict(X_test)

In [24]:
from sklearn.metrics import r2_score

In [25]:
# R-squared value for the model
r2_score(y_test, y_pred)

0.7440919037300955

In [26]:
# Root Mean squared error value for the model
rmse = np.sqrt((((y_pred) - (y_test))**2).mean())
rmse

2.3813205190408278