In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the wine-quality CSV, then print its dimensions followed by the first rows
wine = pd.read_csv("../input/wine-quality/winequalityN.csv")
print(wine.shape, wine.head(), sep="\n")

In [None]:
# Find rows that repeat an earlier row across all columns
# (the first occurrence of each row is not flagged)
dup_mask = wine.duplicated()
wine_duplicates = wine.loc[dup_mask]
print(wine_duplicates)

In [None]:
# Remove the repeated rows by rebinding `wine` to the de-duplicated frame
wine = wine.drop_duplicates()
wine.shape

In [None]:
# checking for missing values
# Count the missing entries in every column. A preview of the boolean mask for
# the first rows only cannot reveal gaps that sit further down the frame.
wine.isnull().sum()

In [None]:
# Keep only the rows that have no missing value in any column
wine = wine.dropna(axis=0)
wine.shape

In [None]:
# checking the datatypes
# Shows the dtype pandas assigned to each column of `wine`
wine.dtypes

In [None]:
# summary stats of numerical columns
wine.describe()

In [None]:
# loading packages
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Box plot of the quality score for each wine type
sns.set_style("whitegrid")
box_ax = sns.boxplot(data = wine, x = 'type', y = 'quality')
box_ax.set_xlabel('type')
box_ax.set_ylabel('quality')
plt.show()

In [None]:
# histogram plot numerical columns
def plot_histogram(df, cols, bins=4):
    """Draw one histogram figure for each requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    cols : iterable of str
        Names of the columns to plot; each one gets its own figure.
    bins : int, default 4
        Number of histogram bins.
    """
    for col in cols:
        fig = plt.figure(figsize=(8,8))
        ax = fig.gca()
        # Plot from the frame that was passed in rather than the global `wine`,
        # so the function works for any data frame.
        df[col].plot.hist(ax = ax, bins = bins, color = 'blue')
        ax.set_title('Histogram of ' + col)
        ax.set_xlabel(col)
        ax.set_ylabel('Number')
        plt.show()
num_cols = ['fixed acidity','volatile acidity','citric acid','residual sugar',
            'chlorides','free sulfur dioxide', 'total sulfur dioxide', 'density',
            'pH', 'sulphates', 'alcohol']
plot_histogram(wine, num_cols)

In [None]:
# histogram of numerical columns
def hist_plot(vals, lab):
    """Plot the distribution of ``vals`` with seaborn and title it with ``lab``.

    Parameters
    ----------
    vals : array-like
        Values whose distribution is drawn.
    lab : str
        Name appended to the plot title.
    """
    ## Distribution plot of values
    sns.displot(vals)
    plt.title('Histogram of ' + lab)
    plt.xlabel('Value')
    # displot draws a count histogram by default, so label the axis accordingly
    plt.ylabel('Count')


for col in num_cols:
    hist_plot(wine[col], col)

In [None]:
# Rescale every numerical feature with a min-max scaler (range 0 to 1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(wine[num_cols])
# Write the scaled values back over the original columns
wine[num_cols] = scaled_values
wine[num_cols].head()

In [None]:
# Preview the frame; the numerical columns were min-max scaled in the cell above
wine.head()

In [None]:
# histogram of scaled numerical columns
# NOTE: this redefines hist_plot and shadows the earlier definition; the only
# intended difference is the word "scaled" in the title.
def hist_plot(vals, lab):
    """Plot the distribution of the scaled values ``vals``, titled with ``lab``.

    Parameters
    ----------
    vals : array-like
        Values whose distribution is drawn.
    lab : str
        Name appended to the plot title.
    """
    ## Distribution plot of values
    sns.displot(vals)
    plt.title('Histogram of scaled ' + lab)
    plt.xlabel('Value')
    # displot draws a count histogram by default, so label the axis accordingly
    plt.ylabel('Count')


for col in num_cols:
    hist_plot(wine[col], col)

In [None]:
# Apply log1p, i.e. log(1 + v), to all numerical columns in one vectorised step
wine[num_cols] = np.log1p(wine[num_cols])
# Then draw the distribution of every transformed column
for col in num_cols:
    hist_plot(wine[col], col)

In [None]:
# Preview the frame; the numerical columns were log1p-transformed in the cell above
wine.head()

In [None]:
from sklearn.cluster import KMeans
# Take an explicit copy of the numerical columns. Later cells add new columns
# to `x`, and those writes must never be ambiguous about touching `wine`.
x = wine.loc[:, num_cols].copy()
x.head()

In [None]:
# Create cluster feature
# random_state pins the centroid initialisation so the cluster labels are the
# same on every run. n_init is spelled out because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=9988)
# Cluster on the measured features only. Excluding the columns this notebook
# adds to `x` keeps the cell safe to re-run.
feature_cols = [c for c in x.columns if c not in ("Cluster", "quality")]
x["Cluster"] = kmeans.fit_predict(x[feature_cols])
# The labels are identifiers, not quantities, so store them as a categorical
x["Cluster"] = x["Cluster"].astype("category")

x.head()

In [None]:
# boxplot with wine quality
# Copy the quality score into `x` (the assignment aligns on the row index)
# so it can be plotted against the cluster label.
x["quality"] = wine["quality"]
# Boxen plot with quality on the x axis and the cluster label on the y axis
sns.catplot(x="quality", y="Cluster", data=x, kind="boxen", height=6)

In [None]:

from sklearn.decomposition import PCA

# Create principal components from the numerical columns
pca = PCA()
X_pca = pca.fit_transform(x[num_cols])

# Convert to dataframe
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
# Reuse the row labels of `x`. Rows were dropped from `wine` earlier without
# resetting the index, so a fresh 0..n-1 index would not line up with `wine`
# or the quality column in any index-aligned operation.
X_pca = pd.DataFrame(X_pca, columns=component_names, index=x.index)

X_pca.head()

In [None]:
# Build the loadings table with one row per principal component, then
# transpose it so rows are the original features and columns the components.
loadings = pd.DataFrame(
    pca.components_,
    index=component_names,
    columns=x[num_cols].columns,
).T
loadings

In [None]:
# Separate the target: `y` is the quality column, `Xdf` is a copy of the rest
Xdf = wine.copy()
y = Xdf.pop('quality')
# mutual info for components
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y, discrete_features):
    """Return the mutual information of each column of ``X`` with ``y``.

    The result is a pandas Series named "MI Scores", indexed by the column
    names of ``X`` and sorted from highest to lowest score.
    ``discrete_features`` is passed straight to ``mutual_info_regression``.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    ranked = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return ranked.sort_values(ascending=False)
mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
mi_scores

In [None]:
# attaching cluster to the wine data frame
# (the assignment aligns on the row index that `x` and `wine` share)
wine['Cluster'] = x['Cluster']
# NOTE(review): this rebinds `num_cols` so that it also contains 'Cluster'.
# Every later use of `num_cols` therefore includes the cluster label, and so
# does any earlier cell that is re-run after this one.
num_cols = ['fixed acidity','volatile acidity','citric acid','residual sugar', 
            'chlorides','free sulfur dioxide', 'total sulfur dioxide', 'density',
            'pH', 'sulphates', 'alcohol', 'Cluster']

In [None]:
# mutual information for the columns
# 'Cluster' holds k-means labels stored as a categorical, so flag it as a
# discrete feature; all other columns stay continuous.
discrete_mask = np.array([col == 'Cluster' for col in num_cols], dtype=bool)
mi_scores = make_mi_scores(wine[num_cols], y, discrete_features=discrete_mask)
mi_scores

In [None]:
# creating a scatter plot
def scatter_plot(df, cols, col_y):
    """Draw one scatter plot of ``col_y`` against each column in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    cols : iterable of str
        Columns placed on the x axis, one figure each.
    col_y : str
        Column placed on the y axis of every figure.
    """
    for col in cols:
        fig = plt.figure(figsize = (7,6))
        ax = fig.gca()
        # Plot from the frame that was passed in rather than the global `wine`
        df.plot.scatter(x = col, y = col_y, ax = ax)
        ax.set_title('Scatter plot of ' + col_y + ' vs ' + col)
        ax.set_xlabel(col)
        ax.set_ylabel(col_y)
        plt.show()
scatter_plot(wine, num_cols, 'quality')

In [None]:
# creating a correlation matrix
# get correlations
# Restrict to numeric columns first: `type` holds text and `Cluster` is
# categorical, and recent pandas versions raise when corr() meets non-numeric data.
wine_corr = wine.select_dtypes(include='number').corr()

In [None]:
# Display the correlation matrix computed in the previous cell
wine_corr

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
# mask: True on and above the diagonal, so only the lower triangle is drawn.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.triu(np.ones_like(wine_corr, dtype=bool))
# adjust mask and df: drop the first row and last column, which would be
# fully masked and only add empty space to the plot
mask = mask[1:, :-1]
corr = wine_corr.iloc[1:,:-1].copy()
# plot heatmap on the axes created above
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='Blues',
            vmin=-1, vmax=1, cbar_kws={"shrink": .8}, ax=ax)
# yticks
plt.yticks(rotation=0)
plt.show()

In [None]:
# checking for skewness of the label(quality)
# histogram of wine quality
# NOTE: this redefines hist_plot and shadows the earlier definitions.
def hist_plot(vals, lab):
    """Plot the distribution of ``vals`` with seaborn and title it with ``lab``.

    Parameters
    ----------
    vals : array-like
        Values whose distribution is drawn.
    lab : str
        Name appended to the plot title.
    """
    ## Distribution plot of values
    sns.displot(vals)
    plt.title('Histogram of ' + lab)
    plt.xlabel('Value')
    # displot draws a count histogram by default, so label the axis accordingly
    plt.ylabel('Count')

hist_plot(wine['quality'], 'Quality')

In [None]:
# Linear regression
# loading packages
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy.random as nr
import scipy.stats as ss
import math

In [None]:
# Start the model matrix with dummy variables for the categorical `type` column
##  step 1: map the type strings to integer codes
enc = preprocessing.LabelEncoder()
enc_cat_feature = enc.fit(wine['type']).transform(wine['type'])
##  step 2: one-hot encode the integer codes (as a single-column 2-D array)
type_codes = enc_cat_feature.reshape(-1,1)
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(type_codes)
Features = encoded.transform(type_codes).toarray()
print(Features.shape)
Features[:2, :]

In [None]:
# adding numerical variables for the train dataset
# NOTE(review): 'pH' is part of num_cols elsewhere in this notebook but is not
# selected here — confirm whether the omission is intentional.
# NOTE(review): 'Cluster' is a categorical k-means label; it enters the matrix
# as a single numeric-looking column rather than as dummy variables — confirm
# that this is the intended encoding.
num_features = wine[['fixed acidity','volatile acidity','citric acid','residual sugar', 
            'chlorides','free sulfur dioxide', 'total sulfur dioxide', 'density',
             'sulphates', 'alcohol', 'Cluster']]
# Append the selected columns to the right of the one-hot `type` columns
Features = np.concatenate([Features, np.array(num_features)], axis = 1)
print(Features.shape)
Features[:2,:]


In [None]:
# creating a numpy array of label values
# `quality` is the regression target; rows are in the same order as `Features`
label = np.array(wine['quality'])
label

In [None]:
# Split the rows into independent training and test sets
## Seed NumPy's global generator, which the split draws from
nr.seed(9988)
## Shuffle the row positions and hold out 200 of them for testing
indx = ms.train_test_split(range(Features.shape[0]), test_size = 200)
train_rows, test_rows = indx
x_train, x_test = Features[train_rows, :], Features[test_rows, :]
y_train, y_test = np.ravel(label[train_rows]), np.ravel(label[test_rows])

In [None]:
# constructing the linear regression model
## define and fit the linear regression model
# No separate intercept term is fitted.
# NOTE(review): presumably because the one-hot columns for `type` already
# provide a constant term — confirm.
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(x_train, y_train)

In [None]:
# Show the fitted intercept and then the coefficient vector, one per line
print(lin_mod.intercept_, lin_mod.coef_, sep="\n")

In [None]:
# test scores
# Predict quality for the held-out test rows with the fitted linear model
y_score = lin_mod.predict(x_test)
y_score

In [None]:
# evaluating the model
def print_metrics(y_true, y_predicted):
    """Print MSE, RMSE, MAE, median absolute error and R^2 for the predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predicted : array-like
        Values predicted by the model for the same rows.
    """
    mse = sklm.mean_squared_error(y_true, y_predicted)
    ## Each entry pairs a fixed-width caption with its metric value
    report = [
        ('Mean Square Error      = ', mse),
        ('Root Mean Square Error = ', math.sqrt(mse)),
        ('Mean Absolute Error    = ', sklm.mean_absolute_error(y_true, y_predicted)),
        ('Median Absolute Error  = ', sklm.median_absolute_error(y_true, y_predicted)),
        ('R^2                    = ', sklm.r2_score(y_true, y_predicted)),
    ]
    for caption, value in report:
        print(caption + str(value))


print_metrics(y_test, y_score)

In [None]:
def hist_resids(y_test, y_score):
    """Plot a histogram (with a KDE curve) of the residuals ``y_test - y_score``.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_score : array-like
        Predicted values for the same rows.
    """
    ## first compute the vector of residuals as a flat 1-D array
    resids = np.ravel(y_test) - np.ravel(y_score)
    ## now make the residual plot; histplot replaces the deprecated distplot
    ## and draws counts, which matches the y-axis label below
    sns.histplot(resids, kde=True)
    plt.title('Histogram of residuals')
    plt.xlabel('Residual value')
    plt.ylabel('count')

hist_resids(y_test, y_score)

In [None]:
# residual plots vs predicted values
def resid_plot(y_test, y_score):
    """Scatter the residuals (observed minus predicted) against the predictions.

    Parameters
    ----------
    y_test : numpy.ndarray
        Observed target values.
    y_score : numpy.ndarray
        Predicted values for the same rows.
    """
    ## residuals as a column vector: observed minus predicted
    observed_col = y_test.reshape(-1,1)
    predicted_col = y_score.reshape(-1,1)
    resids = observed_col - predicted_col
    ## scatter only, no fitted regression line
    sns.regplot(x=y_score, y=resids, fit_reg=False)
    plt.title('Residuals vs. predicted values')
    plt.xlabel('Predicted values')
    plt.ylabel('Residual')

resid_plot(y_test, y_score)