In [41]:
import pandas as pd
import seaborn as ns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, PowerTransformer, PolynomialFeatures

In [58]:

# Source CSV for the red-wine quality data set.
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

# The file is read with ';' as the field separator instead of the default ','.
wine_df = pd.read_csv(
    wine_url,
    sep=';',
)

In [59]:
# Separate features and target before transformation so the target column
# never goes through the preprocessing step.
X_raw = wine_df.drop('quality', axis=1)
y = wine_df['quality']

# Columns to standardize. Defined in this cell so it runs on a fresh kernel
# without relying on a later cell having been executed first.
numeric_columns_to_scaler = ['fixed acidity', 'residual sugar', 'free sulfur dioxide', 'pH', 'alcohol']

# Preprocessing for the selected columns: median imputation, then standard scaling.
numeric_transform_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="median")),
    ('now_scale', StandardScaler())
])

# Apply the pipeline to the selected columns only; every other feature column
# is passed through unchanged (remainder='passthrough').
numeric_transforms_to_columns = ColumnTransformer(transformers=[
    ('numeric_col_transforms', numeric_transform_pipeline, numeric_columns_to_scaler)
], remainder='passthrough')

# Transform only the feature data
X_transformed = numeric_transforms_to_columns.fit_transform(X_raw)
X_transformed_df = pd.DataFrame(X_transformed,
                              columns=numeric_transforms_to_columns.get_feature_names_out())

# Fit the model using transformed features and original target
lr_model = LinearRegression()
model_estimate = lr_model.fit(X_transformed, y)

# R-squared on the same rows the model was fit on (in-sample, not a held-out
# score). Uses the r2_score name imported with the other sklearn imports
# rather than the `sklearn.metrics` module path, which is not imported until
# a later cell.
r2 = r2_score(y, lr_model.predict(X_transformed))

In [60]:
# Display the in-sample R-squared value stored in `r2`.
r2

0.360551703038688

In [35]:
# Feature columns selected for imputation + scaling.
numeric_columns_to_scaler = [
    'fixed acidity',
    'residual sugar',
    'free sulfur dioxide',
    'pH',
    'alcohol',
]

# Preview the raw values of just those columns.
wine_df[numeric_columns_to_scaler].head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,pH,alcohol
0,7.4,1.9,11.0,3.51,9.4
1,7.8,2.6,25.0,3.2,9.8
2,7.8,2.3,15.0,3.26,9.8
3,11.2,1.9,17.0,3.16,9.8
4,7.4,1.9,11.0,3.51,9.4


In [36]:
# Two-step preprocessing for the selected numeric columns:
# fill missing values with the column median, then standardize.
numeric_transform_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy="median")),
        ('now_scale', StandardScaler()),
    ]
)

# Route only the columns in `numeric_columns_to_scaler` through the pipeline;
# all remaining columns are kept as-is via remainder='passthrough'.
numeric_transforms_to_columns = ColumnTransformer(
    transformers=[
        (
            'numeric_col_transforms',
            numeric_transform_pipeline,
            numeric_columns_to_scaler,
        ),
    ],
    remainder='passthrough',
)

In [37]:
# Fit the column transformer on the whole frame and wrap the resulting array
# in a DataFrame labelled with the transformer's generated feature names.
out = pd.DataFrame(
    numeric_transforms_to_columns.fit_transform(wine_df),
    columns=numeric_transforms_to_columns.get_feature_names_out(),
)

In [38]:

# Preview the first rows of the transformed frame.
out.head()

Unnamed: 0,numeric_col_transforms__fixed acidity,numeric_col_transforms__residual sugar,numeric_col_transforms__free sulfur dioxide,numeric_col_transforms__pH,numeric_col_transforms__alcohol,remainder__volatile acidity,remainder__citric acid,remainder__chlorides,remainder__total sulfur dioxide,remainder__density,remainder__sulphates,remainder__quality
0,-0.52836,-0.453218,-0.466193,1.288643,-0.960246,0.7,0.0,0.076,34.0,0.9978,0.56,5.0
1,-0.298547,0.043416,0.872638,-0.719933,-0.584777,0.88,0.0,0.098,67.0,0.9968,0.68,5.0
2,-0.298547,-0.169427,-0.083669,-0.331177,-0.584777,0.76,0.04,0.092,54.0,0.997,0.65,5.0
3,1.654856,-0.453218,0.107592,-0.979104,-0.584777,0.28,0.56,0.075,60.0,0.998,0.58,6.0
4,-0.52836,-0.453218,-0.466193,1.288643,-0.960246,0.7,0.0,0.076,34.0,0.9978,0.56,5.0


In [43]:
# List the generated column names of the transformed frame.
out.columns

Index(['numeric_col_transforms__fixed acidity',
       'numeric_col_transforms__residual sugar',
       'numeric_col_transforms__free sulfur dioxide',
       'numeric_col_transforms__pH', 'numeric_col_transforms__alcohol',
       'remainder__volatile acidity', 'remainder__citric acid',
       'remainder__chlorides', 'remainder__total sulfur dioxide',
       'remainder__density', 'remainder__sulphates', 'remainder__quality'],
      dtype='object')

In [62]:
# Fresh, unfitted linear regression estimator.
lr_model = LinearRegression()

# Design matrix: every transformed/passthrough column except the target,
# converted to a NumPy array.
X = out[
    [
        'numeric_col_transforms__fixed acidity',
        'numeric_col_transforms__residual sugar',
        'numeric_col_transforms__free sulfur dioxide',
        'numeric_col_transforms__pH',
        'numeric_col_transforms__alcohol',
        'remainder__volatile acidity',
        'remainder__citric acid',
        'remainder__chlorides',
        'remainder__total sulfur dioxide',
        'remainder__density',
        'remainder__sulphates',
    ]
].to_numpy()

# Target: selected with a one-element list, so the array keeps a column axis.
Y = out[['remainder__quality']].to_numpy()

In [63]:
# Fit the linear regression on X / Y and keep the result of fit() for the
# cells below. NOTE(review): the name is spelled `model_esimate`; later cells
# reference that exact spelling, so rename them together if it is corrected.
model_esimate = lr_model.fit(X, Y)

In [64]:
# Show the fitted coefficients (one per column of X, in the column order used
# to build X).
model_esimate.coef_

array([[ 4.34973514e-02,  2.30187143e-02,  4.56059635e-02,
        -6.38424740e-02,  2.94242883e-01, -1.08359026e+00,
        -1.82563948e-01, -1.87422516e+00, -3.26457970e-03,
        -1.78811638e+01,  9.16334413e-01]])

In [65]:
from sklearn.metrics import r2_score, root_mean_squared_error

In [66]:
# In-sample root mean squared error of the fitted model. Arguments are passed
# in the documented (y_true, y_pred) order, matching the r2_score call used
# elsewhere in this notebook; the unweighted RMSE value is the same either
# way, but the documented order avoids mistakes if sample_weight or
# multioutput options are added later.
root_mean_squared_error(Y, model_esimate.predict(X))

0.6455750670692048

In [67]:
import sklearn
import sklearn.metrics

# In-sample R-squared: true targets first, model predictions second.
r2_score(Y, model_esimate.predict(X))

0.360551703038688

In [72]:
import torch.nn.functional as F
# Print the signature and docstring of root_mean_squared_error for reference
# (exploratory lookup; not needed for the analysis itself).
help(sklearn.metrics.root_mean_squared_error)

Help on function root_mean_squared_error in module sklearn.metrics._regression:

root_mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')
    Root mean squared error regression loss.

    Read more in the :ref:`User Guide <mean_squared_error>`.

    .. versionadded:: 1.4

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    multioutput : {'raw_values', 'uniform_average'} or array-like of shape             (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors in case of multioutput input.



In [74]:
import torch.nn as nn
# Print the documentation for torch.nn.NLLLoss (exploratory lookup).
help(nn.NLLLoss)

Help on class NLLLoss in module torch.nn.modules.loss:

class NLLLoss(_WeightedLoss)
 |  NLLLoss(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean') -> None
 |
 |  The negative log likelihood loss. It is useful to train a classification
 |  problem with `C` classes.
 |
 |  If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
 |  weight to each of the classes. This is particularly useful when you have an
 |  unbalanced training set.
 |
 |  The `input` given through a forward call is expected to contain
 |  log-probabilities of each class. `input` has to be a Tensor of size either
 |  :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
 |  with :math:`K \geq 1` for the `K`-dimensional case. The latter is useful for
 |  higher dimension inputs, such as computing NLL loss per-pixel for 2D images.
 |
 |  Obtaining log-probabilities in a neural network is easily achieved by
 | 