# In this practice session, we will learn to code Multiple Linear Regression.
# We will perform the following steps to build a Multiple Linear Regression model using the popular Beer dataset.

 
 
  - **Data Preprocessing**

    - Importing the libraries.
    - Importing dataset.
    - Dealing with the categorical variable.
    - Classifying dependent and independent variables.
    - Splitting the data into a training set and test set.
    - Feature scaling.
 

  -  **Linear Regression**

    - Create a Linear Regressor.
    - Feed the training data to the regressor model.
    - Predicting the scores for the test set.
    - Using the RMLSE (Root Mean Log Squared Error) to measure the performance.

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn --user -q

In [None]:
import IPython
# Shut the running kernel down with the restart flag set to True, so a fresh
# kernel comes up after the pip installation in the previous cell.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

In [None]:
#1 Importing necessary libraries
import numpy as np
import pandas as pd

In [None]:
#2 Importing the data set
# The CSV file is read relative to the current working directory.
csv_path = 'beer_data.csv'
dataset = pd.read_csv(csv_path)

# Preview the first 10 rows underneath a 40-dash separator line.
separator = '-' * 40
preview = dataset.head(10)
print("\n", separator, "\n", preview)

In [None]:
#3 Dealing with the categorical data

def _cellar_bound(temp_range, position):
    """Return one bound of a '<min>-<max>' text range as an int.

    temp_range -- string such as '35-40' (surrounding spaces are tolerated).
    position   -- 0 for the part before the dash, 1 for the part after it.
    """
    return int(temp_range.split('-')[position].strip())

# Splitting Cellar Temperature into minimum and maximum columns and converting
# each bound from str to int so both can be used as numeric features.
cellar_ranges = dataset['Cellar Temperature']
dataset['Minimum_Cellar_Temp'] = cellar_ranges.apply(lambda x: _cellar_bound(x, 0))
dataset['Maximum_Cellar_Temp'] = cellar_ranges.apply(lambda x: _cellar_bound(x, 1))

# New dataset with selected features; 'Score' is deliberately kept last because
# the later cells treat the last column as the target.
selected_columns = ['ABV', 'Ratings', 'Minimum_Cellar_Temp', 'Maximum_Cellar_Temp', 'Score']
dataset = dataset[selected_columns]

#Printing first 10 rows of the dataset
print("\n----------------------------\n",dataset.head(10))

#Printing the summary of the dataset
print("\n----------------------------\n")
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
dataset.info()

In [None]:
# A simple correlation plot using seaborn. Each cell of the heatmap shows how
# strongly a pair of dataset columns correlate with each other.
import seaborn as sns
import matplotlib.pyplot as plt

corr = dataset.corr()

# 10x10 inch figure; no `ax` is passed to heatmap below, so seaborn draws on
# the current axes, i.e. the one created here.
fig, ax = plt.subplots(figsize=(10, 10))

# Colour scale pinned to the full correlation range [-1, 1], centred on 0,
# with the coefficient written inside every (square) cell.
heatmap_options = dict(
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    annot=True,
    linewidths=.5,
    cmap="YlGnBu",
)
ax = sns.heatmap(corr, **heatmap_options)

# Rotating labels on x axis so long column names do not overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=35, horizontalalignment='right')

In [None]:
#4 Classifying dependent and independent variables

# Independent features: every column except the last one (i.e. all but Score).
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.to_numpy()

# Dependent feature / target variable: the last column only (Score).
target_series = dataset.iloc[:, -1]
y = target_series.to_numpy()

In [None]:
#5 Creating training and test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

#################Data Preprocessing Ends #################################

In [None]:
# Dump both splits in the same layout: a titled header, then features, then targets.
for title, features, targets in (
    ("Training Set", X_train, y_train),
    ("Test Set", X_test, y_test),
):
    print("\n\n" + title + " :\n----------------\n")
    print("X = \n", features)
    print("y = \n", targets)

In [None]:
""" Multiple Linear regression """

#6 Creating the Regressor and training it with the training set

from sklearn.linear_model import LinearRegression

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. It is dropped here:
# an ordinary least-squares fit gives the same predictions whether or not the
# features are rescaled, so the rest of the notebook is unaffected.
regressor = LinearRegression()

In [None]:
#7 Feeding the data and training the model
# Only the training split is used for fitting; the test split is kept aside
# for the prediction step.
regressor.fit(X=X_train, y=y_train)

In [None]:
#8 Predicting the Score for test set observations
# One predicted Score per row of the held-out feature matrix.
y_pred = regressor.predict(X=X_test)

In [None]:
#printing the predictions
# Header text first, then the prediction array (print inserts one space between them).
banner = "\n----------------------------\nPredictions = \n"
print(banner, y_pred)

In [None]:
#9 Calculating score from Root Mean Log Squared Error

def rmlse(y_test, y_pred):
    """Return the root mean log squared error and the score derived from it.

    The error is sqrt(mean((log10(y_pred + 1) - log10(y_test + 1)) ** 2));
    note that the logarithm is base 10. The score is ``1 - error``.

    Parameters:
        y_test: true target values (array-like: ndarray, list, tuple, ...).
        y_pred: predicted values, broadcastable against ``y_test``.

    Returns:
        (error, score) as a tuple of floats.

    Values of -1 or below have no real logarithm after the +1 shift, so they
    make the result NaN/inf (NumPy emits a RuntimeWarning).
    """
    # Convert up front so plain Python sequences work: `list + 1` would raise
    # TypeError, whereas an ndarray adds element-wise.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    log_gap = np.log10(predicted + 1) - np.log10(actual + 1)
    error = np.square(log_gap).mean() ** 0.5
    score = 1 - error
    return error, score

# Evaluate the test-set predictions, then report the error and the score
# beneath a 40-dash separator line.
metrics = rmlse(y_test, y_pred)
error, score = metrics

print("\n", '-' * 40)
for label, value in (("RMLSE : ", error), ("Score : ", score)):
    print(label, value)