In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# Any results you write to the current directory are saved as output.

## LINEAR REGRESSION 
We will predict the quality of red wine on a scale of 1-10 using the input features available. We will build a linear regression model and then use it to make predictions on unseen data.

### EDA 
Let us first import and have a look at the dataset


In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [None]:
# Load the red wine quality dataset and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
df.head(n=5)

Input features/variables : 
    - fixed acidity
    - volatile acidity
    - citric acid
    - residual sugar 
    - chlorides 
    - free sulfur dioxide 
    - total sulfur dioxide 
    - density
    - pH
    - sulphates
    - alcohol

Target variable :
    - quality

Let us check if any of the columns have null values 

In [None]:
# Names of the columns that contain at least one missing value
# (an empty Index means the dataset has no nulls).
df.columns[df.isna().any(axis=0)]

Let us now check the distribution of our target variable.

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated in seaborn (0.11+) and scheduled for removal;
# `histplot` is its replacement. `discrete=True` draws one bar per score,
# which assumes quality holds whole-number ratings -- TODO confirm with
# df['quality'].unique().
sns.histplot(df['quality'], discrete=True)

As we can see, most of the wines in the dataset have a quality of around 5 or 6.
Since all of the data is numeric, there is no need to separate the dataset into numeric and categorical data. 

The next step is to identify any correlation between the different input features with our target variable, quality.

In [None]:
# Pairwise correlation matrix of all columns, reduced to the single
# column that relates each variable to the target, `quality`.
correlations = df.corr().loc[:, 'quality']

In [None]:
# Display the correlation of every column with `quality`
# (the entry for `quality` itself is included in this series).
correlations

We can visualize this correlation with the following heatmap.

In [None]:
# Heatmap of the full pairwise correlation matrix of the dataset.
sns.heatmap(data=df.corr())

We will write a function that gives us only those features that have a correlation above a certain threshold. This will help us choose only those features that have a high correlation with our target variable and help develop a better model.


### Feature Selection

In [None]:
def extract_features(threshold, corr=None):
    """Return the variables whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    threshold : float
        Exclusive lower bound on the absolute correlation value.
    corr : pandas.Series, optional
        Correlation values indexed by variable name. When omitted, the
        notebook-level ``correlations`` series is used, so existing
        ``extract_features(threshold)`` calls behave exactly as before.

    Returns
    -------
    list
        Index labels of the entries with ``abs(value) > threshold``, in
        their original order. No label is treated specially: if the series
        contains the target's correlation with itself, that label is
        returned too and the caller is responsible for removing it.
    """
    if corr is None:
        # Backward-compatible fallback to the global defined earlier in the notebook.
        corr = correlations
    abs_correlation = corr.abs()
    return abs_correlation[abs_correlation > threshold].index.tolist()

In [None]:
# Keep every variable whose absolute correlation with quality exceeds 0.05,
# then drop the target itself so that only input features remain.
features = extract_features(threshold=0.05)
del features[features.index('quality')]
features

We see that the extracted features contain all the input features except residual sugar.
We can now categorize the input features and dependent variable into two vectors, X and y. X will contain the extracted features. y will contain the target variable, quality.

In [None]:
# Feature matrix: only the columns named in `features`.
X = df.loc[:, features]
# Target vector: the quality column.
y = df.loc[:, 'quality']


### Splitting the data into training and testing sets

The next step is to split the data into training and testing sets. We will use 80% of the data for training and 20% for testing. 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Model Creation

Now that we have our training and testing data, it is time to start building our Linear Regression model.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression model and fit it on the training split;
# `fit` returns the estimator itself, which is then displayed.
regressor = LinearRegression().fit(X=X_train, y=y_train)
regressor

In [None]:
# Coefficients of the 10 input variables
# One learned weight per column of X_train, in the same order as the `features` list.
regressor.coef_

### Making predictions 

In [None]:
# Predict quality for the rows the model was fitted on and for the held-out rows.
y_pred_train = regressor.predict(X=X_train)
y_pred_test = regressor.predict(X=X_test)

### Evaluating model performance 

For the model to generalize well, the root mean squared error (RMSE) on the training set and on the testing set should be close to each other; a test RMSE much larger than the train RMSE would indicate overfitting. Let us check this.

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# train RMSE: square root of the mean squared error on the training set.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# observed targets go first and the model's predictions second. The value is
# the same either way for MSE, but the conventional order avoids mistakes
# with metrics that are not symmetric.
train_rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
train_rmse

In [None]:
# test RMSE: square root of the mean squared error on the held-out test set.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# observed targets go first and the model's predictions second. The value is
# the same either way for MSE, but the conventional order avoids mistakes
# with metrics that are not symmetric.
test_rmse = mean_squared_error(y_test, y_pred_test) ** 0.5
test_rmse

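As we can see, the train and test root mean squared errors are close to each other, indicating that the model is not overfitting and generalizes to unseen data about as well as it fits the training data. How accurate the model is depends on the size of the RMSE itself relative to the range of the quality scale.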
