In [None]:
import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the dataset location can be confirmed before loading.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd
import numpy as np
import random
from math import sqrt

## Data Processing

In [None]:
# Absolute path of the red wine quality CSV that the next cell reads.
dpath = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

In [None]:
# Load the CSV at `dpath` into a DataFrame and preview its first rows.
df = pd.read_csv(dpath)
df.head()

#### Dataset column description
```
1 - fixed acidity

2 - volatile acidity

3 - citric acid

4 - residual sugar

5 - chlorides

6 - free sulfur dioxide

7 - total sulfur dioxide

8 - density

9 - pH

10 - sulphates

11 - alcohol

Output variable (based on sensory data):

12 - quality (score between 0 and 10)

```

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Count NA / missing values in each column.
df.isnull().sum()

## Linear Regression Implementation From Scratch

In [None]:
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    ``values`` must support both ``sum()`` and ``len()``. An empty input
    results in a division by zero.
    """
    total = sum(values)
    count = len(values)
    # Multiplying by 1.0 forces float division even for integer inputs.
    return total * 1.0 / count

def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    The result is NOT divided by the number of values, so it is the
    unnormalised variance. An empty input yields 0.
    """
    squared_deviations = 0
    for value in values:
        squared_deviations = squared_deviations + (value - mean) ** 2
    return squared_deviations

def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of deviations of ``x`` and ``y`` from their means.

    The result is NOT divided by the number of points (unnormalised
    covariance). Elements are paired by position over ``range(len(x))``,
    so ``y`` must be indexable for every index of ``x``.
    """
    cross_sum = 0
    for idx in range(len(x)):
        dx = x[idx] - mean_x
        dy = y[idx] - mean_y
        cross_sum = cross_sum + dx * dy
    return cross_sum

def coefficients(dataset):
    """Fit the least-squares line ``y = b0 + b1 * x`` to ``dataset``.

    Parameters
    ----------
    dataset : sequence of rows, where ``row[0]`` is the input value x and
        ``row[1]`` is the target value y.

    Returns
    -------
    (b0, b1) : the intercept and the slope, where
        b1 = cov(x, y) / var(x)
        b0 = mean_y - b1 * mean_x

    The ``variance`` and ``covariance`` helpers both return unnormalised
    sums, so the missing 1/n factors cancel in the ratio for ``b1``.
    If every x is identical, var(x) is zero and the division fails.
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]

    mean_x, mean_y = mean(x), mean(y)
    # Only the spread of x is needed; the variance of y plays no part in the fit.
    var_x = variance(x, mean_x)
    cov_xy = covariance(x, mean_x, y, mean_y)

    b1 = cov_xy / var_x
    b0 = mean_y - b1 * mean_x

    return b0, b1


def simple_linear_regression(train, test):
    """Fit a one-feature linear model on ``train`` and apply it to ``test``.

    Each row is indexed as ``row[0]`` (input) and ``row[1]`` (target).
    Returns ``(predictions, actual)``: the predicted targets for the test
    rows and the corresponding true targets, in test order.
    """
    intercept, slope = coefficients(train)
    # Single pass over `test`, pairing each prediction with its true target.
    pairs = [(intercept + slope * row[0], row[1]) for row in test]
    predictions = [estimate for estimate, _ in pairs]
    actual = [target for _, target in pairs]
    return predictions, actual

def train_test_split(df, test_size=100):
    """Randomly split an indexable sequence of rows into train and test lists.

    Parameters
    ----------
    df : sequence supporting ``len()`` and integer indexing (e.g. a list of
        ``(x, y)`` tuples).
    test_size : number of rows to hold out for testing (default 100, the
        value that was previously hard-coded). If it is greater than or
        equal to ``len(df)``, every row goes to the test set.

    Returns
    -------
    (train, test) : two lists of rows. Within each list the rows keep the
        order they had in ``df``; only set membership is random.

    Raises
    ------
    ValueError : if ``test_size`` is negative.

    The split uses the global ``random`` module state, so call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    if test_size < 0:
        raise ValueError("test_size must be non-negative")

    indexes = list(range(len(df)))
    random.shuffle(indexes)

    # Slice by an explicit count: `indexes[:-test_size]` would be empty
    # (instead of everything) when test_size == 0.
    n_train = max(len(indexes) - test_size, 0)
    # A set gives O(1) membership tests; re-slicing the list inside the loop
    # made the original split quadratic in the number of rows.
    train_positions = set(indexes[:n_train])

    train = []
    test = []
    for i in range(len(df)):
        if i in train_positions:
            train.append(df[i])
        else:
            test.append(df[i])
    return train, test

def rmse_metric(actual, predicted):
    """Return the root-mean-squared error between ``actual`` and ``predicted``.

    Values are paired positionally with ``zip``; the squared differences
    are summed and divided by ``len(actual)`` before taking the square root.
    """
    squared_error = 0
    for observed, estimate in zip(actual, predicted):
        squared_error = squared_error + (estimate - observed) ** 2
    count = len(actual)
    # Multiplying by 1.0 forces float division even for integer inputs.
    return sqrt(squared_error * 1.0 / count)

def evaluator(df, in_col, out_col):
    """Train and score a one-feature linear regression on two columns of ``df``.

    ``df[in_col]`` supplies the input values and ``df[out_col]`` the targets.
    The paired rows are split randomly into train and test sets, the model is
    fitted on the train rows, and the RMSE on the test rows is returned.
    """
    # Pair each input value with its target as (x, y) rows.
    rows = list(zip(df[in_col], df[out_col]))
    train_rows, test_rows = train_test_split(rows)

    # Fit on the training rows, predict the held-out rows.
    predicted, observed = simple_linear_regression(train_rows, test_rows)

    return rmse_metric(observed, predicted)

In [None]:
# Work on a deep copy so the frame loaded into `df` is not modified by later cells.
df_in = df.copy(deep = True)
df_in.head()

In [None]:
# Column names that can be passed to `evaluator` as `in_col` / `out_col`.
df_in.columns

In [None]:
# Test-set RMSE of a one-feature linear regression predicting 'quality' from 'pH'.
# NOTE(review): the split shuffles with the global `random` state and no seed is
# set in the visible cells, so this value will differ between runs.
evaluator(df_in, 'pH', 'quality')