Contents of this notebook:

   1. [Library loading and installation](#1)
   2. [How does the number of ratings affect the rating of a book?](#2)
   3. [Preprocessing](#3)
   4. [Ordinary Least Squares Linear Regression](#4)
   5. [Ridge Regression](#5)
   6. [Lasso](#6)
  
       6.1. [Normalizing the data](#7)
       
       6.2. [LassoLarsIC](#8)
       
       6.3. [LassoCV](#9)
       
       6.4. [LassoLarsCV](#10)
   7. [Elastic-Net](#11)

Citation

[Scikit-learn: Machine Learning in Python](http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html), Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.

[API design for machine learning software: experiences from the scikit-learn project](https://arxiv.org/abs/1309.0238), Buitinck et al., 2013.

> # 1. Library loading and installation<a id="1"></a>

In [None]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from scipy.stats import powerlaw
from sklearn import linear_model
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC, lasso_path, enet_path, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
import time
from itertools import cycle

In [None]:
pip install powerlaw

In [None]:
import powerlaw

In [None]:
BOOKS_CSV = "../input/goodreadsbooks/books.csv"

# Rows the parser cannot split into the expected fields are reported and
# skipped instead of aborting the load. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`; older pandas
# rejects the new keyword with a TypeError, so fall back to the old spelling.
try:
    books = pd.read_csv(BOOKS_CSV, on_bad_lines="warn")
except TypeError:
    books = pd.read_csv(BOOKS_CSV, error_bad_lines=False)

# The raw header spells this column with two leading spaces.
books = books.rename(columns={"  num_pages": "num_pages"})
books.head()

> # 2. How does the number of ratings affect the rating of a book?<a id="2"></a>

First sort the books by the number of ratings.

In [None]:
# Display only: the sorted copy is shown as the cell output, `books` itself is not reordered.
books.sort_values('ratings_count')

Get the maximum of the number of ratings.


In [None]:
# Largest number of ratings received by any single book.
max_ratings_count = books['ratings_count'].max()
max_ratings_count

In [None]:
# Histogram of the number of ratings per book.
# Books with 10000 or more ratings are treated as outliers and left out.
is_below_cutoff = books.ratings_count < 10000
ratings = books.ratings_count.loc[is_below_cutoff]

plt.figure(figsize=(10, 5))
plt.hist(ratings, bins=10000)
plt.xlabel('Number of Ratings')
plt.ylabel('Count')
plt.title('Histogram of the Number of Ratings')
plt.show()

The histogram shows the signature of a power-law distribution. Although the maximum ratings count is 4 million, books whose ratings count exceeds 10000 are rare.

Power Law Distribution

In [None]:
# Fit a discrete power law to the counts shifted by one, so that a count of 0
# maps onto the lower bound xmin=1.
fit = powerlaw.Fit(ratings+1, xmin=1, discrete=True)
fitted_law = fit.power_law

# Dashed line: fitted power-law PDF; solid line: empirical PDF of the data.
fitted_law.plot_pdf(color='b', linestyle='--', label='fit ccdf')
fit.plot_pdf(color='b')

plt.xlabel('log axis of ratings_count')
plt.ylabel('log axis of count')
plt.title('Power Law Distribution of ratings_count')
print('alpha= ', fitted_law.alpha, '  sigma= ', fitted_law.sigma)

What does the fluctuation look like?

In [None]:
# Mean `average_rating` of the books that share each exact ratings_count value.
# The x range spans the rating counts kept in `ratings` (1 .. its maximum),
# not the number of books in it. Counts that no book has are left as NaN.
max_count = int(ratings.max())
counts = np.arange(1, max_count + 1)
mean_rating_by_count = (
    books.average_rating.astype(float)
    .groupby(books.ratings_count)
    .mean()
    .reindex(counts)
)
# Kept as a plain list so later cells can keep treating it as a sequence.
ave_ratings = mean_rating_by_count.tolist()

plt.figure(figsize=(20, 10))
# Plot against the actual counts so the x axis reads as ratings_count.
plt.plot(counts, ave_ratings)
plt.title('Average Ratings for each Ratings Count')
plt.xlabel('Ratings Count')
plt.ylabel('Average Rating')
plt.show()

The ratings count does not seem to have a strong effect on the mean of the average rating. However, as the ratings count increases, it does reduce the fluctuation, i.e., the variance, of the average ratings.

In [None]:
# Keep only the ratings-count values that actually had books: the `>= 0`
# comparison is False for NaN entries, so those are dropped.
per_count_means = np.asarray(ave_ratings)
ave_ratings = per_count_means[per_count_means >= 0]

# Unweighted mean of the per-count means computed above.
average_rating = ave_ratings.mean()
print('The average rating of all books: ' + format(average_rating, '.2f'))

> # 3. Preprocessing<a id='3'></a>

Only ratings_count, num_pages and text_reviews_count are taken as features.


In [None]:
FEATURES = ['ratings_count', 'num_pages', 'text_reviews_count']

# Overall fitting: every book, taken before the outlier filter below.
X = books.loc[0:, FEATURES].values.astype(float)
y = books.average_rating.values.astype(float)

# The original cell called books.sort_values(by='bookID') here without keeping
# the result, so it had no effect; rows stay in file order.
books = books.loc[books.ratings_count < 7000] # >= 7000 outliers

# Training and prediction.
# Split on the original row label. The previous inclusive label slices
# `0:8000` and `8000:` both contained row 8000, leaking it from train into
# test; a boolean mask puts every row in exactly one set.
in_train = books.index <= 8000
# Targets are kept 2-D (n, 1): later cells index `coef_[0]`, which relies on it.
train_X = books.loc[in_train, FEATURES].values.astype(float)
train_y = books.loc[in_train, ['average_rating']].values.astype(float)
test_X = books.loc[~in_train, FEATURES].values.astype(float)
test_y = books.loc[~in_train, ['average_rating']].values.astype(float)

> # 4. Linear Regression<a id="4"></a>

In [None]:
# Ordinary least squares on the full feature matrix.
reg = linear_model.LinearRegression().fit(X, y)

# One coefficient per feature column, followed by the fitted intercept.
print('Regression Coefficients:', reg.coef_, sep='\n')
print('Interception:', reg.intercept_, sep='\n')

In [None]:
def plot_actual_vs_predicted(features, actual, predicted, x_col, y_col, names):
    """Draw a 3D scatter of actual vs predicted average_rating over two features.

    features  -- 2-D array whose columns are the model features
    actual    -- observed targets, one per row of `features`
    predicted -- model predictions, one per row of `features`
    x_col     -- column index of `features` shown on the x axis
    y_col     -- column index of `features` shown on the y axis
    names     -- feature names, indexed like the columns of `features`
    """
    plt.figure(figsize=(10, 7))
    ax = plt.axes(projection="3d")
    ax.scatter3D(features[:, x_col], features[:, y_col], actual, color="b")
    ax.scatter3D(features[:, x_col], features[:, y_col], predicted, color="green")
    plt.title("%s and %s vs average_rating" % (names[x_col], names[y_col]))
    ax.set_xlabel(names[x_col])
    ax.set_ylabel(names[y_col])
    ax.set_zlabel("average_rating")
    ax.legend(['actual', 'predicting'])
    plt.show()


# Refit the same estimator on the training split only.
reg.fit(train_X, train_y)

print('Regression Coefficients:')
print(reg.coef_)
print('Interception:')
print(reg.intercept_)

pred_y = reg.predict(test_X)

print('Mean Squared Error:')
print(mean_squared_error(test_y, pred_y))

print('r2 score:')
print(r2_score(test_y, pred_y))

print('Sum of Squared Errors:')
print(np.sum(np.square(pred_y - test_y)))

# One figure per pair of features, given as (x column, y column) of test_X.
feature_names = ['ratings_count', 'num_pages', 'text_reviews_count']
for x_col, y_col in ((0, 1), (0, 2), (2, 1)):
    plot_actual_vs_predicted(test_X, test_y, pred_y, x_col, y_col, feature_names)

Linear regression with features num_pages, text_reviews_count and ratings_count leaves a high variance unexplained, indicated by a low R2-score.

> # 5. Ridge Regression<a id="5"></a>

In ridge regression, a penalty on the size of the coefficients is added in order to counterbalance collinearity.
See https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression-and-classification for reference.

In [None]:
# Regularization strengths to sweep, log-spaced between 1e-10 and 1e-2.
# The previous version fitted Ridge with huge negative integers taken from a
# helper range (invalid as a penalty) while plotting against this grid, and
# scored the OLS model `reg` instead of the ridge model, so the recorded
# errors could never change with alpha.
N_ALPHAS = 200
alphas = np.logspace(-10, -2, N_ALPHAS)
coefs = []
r2s = []
mses = []

for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(train_X, train_y)
    pred_y = ridge.predict(test_X)

    # train_y is 2-D, so coef_ has shape (1, n_features); keep the single row.
    coefs.append(ridge.coef_[0])
    mses.append(mean_squared_error(test_y, pred_y))
    r2s.append(r2_score(test_y, pred_y))

# After the loop `pred_y` holds the predictions for the largest alpha.

ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis: strongest regularization on the left
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.legend(['ratings_count', 'num_pages', 'text_reviews_count'])
plt.axis('tight')
plt.show()

The balance point between a presumably high collinearity and an over-dominating regularization is 10^-5.

The mean squared error and r2 do not change, however.

In [None]:
# Report the error metrics for whatever predictions `pred_y` currently holds.
metric_report = (
    ('Mean Squared Error:', mean_squared_error(test_y, pred_y)),
    ('r2 score:', r2_score(test_y, pred_y)),
    ('Sum of Squared Errors:', np.sum(np.square(pred_y - test_y))),
)
for position, (heading, value) in enumerate(metric_report):
    if position:
        print()  # blank line between metrics
    print(heading)
    print(value)

> # 6. Lasso<a id='6'></a>

Objective function:

$\min_w\frac{1}{2n}\lVert Xw-y\rVert_2^2+\alpha\lVert w\rVert_1$

In [None]:
# Small positive offset added to alpha values before they are drawn on a
# log-scaled axis, so that an alpha of exactly 0 does not end up at log(0)
# (numpy reports log10(0) as a divide-by-zero).
EPSILON = 1e-9

> # 6.1. Normalizing the data<a id='7'></a>

In [None]:
# Scale every feature column of X, in place, to unit Euclidean norm.
column_norms = np.sqrt((X * X).sum(axis=0))
X /= column_norms

> # 6.2. LassoLarsIC: least angle regression with BIC/AIC criterion<a id='8'></a>

In [None]:
# One LassoLarsIC model per information criterion.
reg_bic = LassoLarsIC(criterion='bic')
reg_aic = LassoLarsIC(criterion='aic')

# Only the BIC fit is timed; the elapsed time is shown in the figure title.
t1 = time.time()
reg_bic.fit(X, y)
t_bic = time.time() - t1

reg_aic.fit(X, y)

# Regularization strength selected under each criterion.
alpha_bic_ = reg_bic.alpha_
alpha_aic_ = reg_aic.alpha_

def plot_ic_criterion(reg, name, color):
    """Plot a fitted model's criterion curve and mark its selected alpha.

    reg   -- fitted estimator exposing `alphas_`, `criterion_` and `alpha_`
    name  -- criterion name used in the legend labels
    color -- matplotlib color used for both the curve and the vertical line

    Alphas are shifted by EPSILON before plotting on the log-scaled x axis.
    Draws on the current matplotlib figure and returns nothing.
    """
    shifted_alphas = reg.alphas_ + EPSILON
    selected_alpha = reg.alpha_ + EPSILON

    plt.semilogx(shifted_alphas, reg.criterion_, '--',
                 color=color, linewidth=3,
                 label='%s criterion' % name)
    plt.axvline(selected_alpha,
                color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel(r'$\alpha$')
    plt.ylabel('criterion')


# Overlay the AIC and BIC curves in a single figure.
plt.figure()
for fitted_model, criterion_name, line_color in ((reg_aic, 'AIC', 'b'),
                                                 (reg_bic, 'BIC', 'r')):
    plot_ic_criterion(fitted_model, criterion_name, line_color)
plt.legend()
ic_title = 'Information-criterion for model selection (training time %.3fs)' % t_bic
plt.title(ic_title)
plt.show()

> # 6.3. LassoCV: coordinate descent<a id='9'></a>

In [None]:
# Fit: Lasso along a regularization path with 20-fold cross-validation.
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
reg = LassoCV(cv=20).fit(X, y)
t_lasso_cv = time.time() - t1

# y-axis window for the MSE plots.
ymin, ymax = 0.05, 0.175

# Alphas shifted by EPSILON for the log-scaled x axis.
path_alphas = reg.alphas_ + EPSILON
fold_mse = reg.mse_path_

# Plot: one dotted curve per fold, their mean in solid black, and the selected alpha.
plt.figure()
plt.semilogx(path_alphas, fold_mse, ':')
plt.plot(path_alphas, fold_mse.mean(axis=-1), 'k',
         linewidth=2, label='Average across the folds')
plt.axvline(reg.alpha_ + EPSILON, color='k', linestyle='--',
            label='alpha: CV estimate')

plt.legend()

plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.axis('tight')
plt.ylim(ymin, ymax)
plt.show()

> # 6.4. LassoLarsCV: least angle regression <a id='10'></a>

In [None]:
# Fit: least-angle-regression Lasso with 20-fold cross-validation.
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
reg = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Cross-validation alphas shifted by EPSILON for the log-scaled x axis.
lars_alphas = reg.cv_alphas_ + EPSILON
lars_fold_mse = reg.mse_path_

# Plot: one dotted curve per fold, their mean in solid black, and the selected alpha.
plt.figure()
plt.semilogx(lars_alphas, lars_fold_mse, ':')
plt.semilogx(lars_alphas, lars_fold_mse.mean(axis=-1), 'k',
             linewidth=2, label='Average across the folds')
plt.axvline(reg.alpha_, color='k', linestyle='--',
            label='alpha CV')
plt.legend()

plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
          % t_lasso_lars_cv)
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.axis('tight')
# ymin / ymax are not defined in this cell; they must already exist in the kernel.
plt.ylim(ymin, ymax)

plt.show()

> # 7. Elastic-Net<a id='11'></a>

Objective function:

$\min_w\frac{1}{2n}\lVert Xw-y\rVert_2^2+\alpha\rho\lVert w\rVert_1+\frac{\alpha(1-\rho)}{2}\lVert w\rVert_2^2$

In [None]:
# Rebuild the feature matrix and target from the current `books` frame.
path_feature_columns = ['ratings_count', 'num_pages', 'text_reviews_count']
X = books.loc[0:, path_feature_columns].to_numpy().astype(float)
y = books['average_rating'].to_numpy().astype(float)

def _plot_path_pair(fig_num, neg_log_alphas_a, coefs_a, neg_log_alphas_b, coefs_b,
                    colors, label_a, label_b, title):
    """Overlay two coefficient paths in figure `fig_num`: A solid, B dashed.

    `coefs_a` / `coefs_b` are iterated row by row (one row per coefficient) and
    each pair of rows is drawn in the next color taken from `colors`.
    """
    plt.figure(fig_num)
    for coef_a, coef_b, c in zip(coefs_a, coefs_b, colors):
        line_a = plt.plot(neg_log_alphas_a, coef_a, c=c)
        line_b = plt.plot(neg_log_alphas_b, coef_b, linestyle='--', c=c)

    plt.xlabel('-Log(alpha)')
    plt.ylabel('coefficients')
    plt.title(title)
    # The legend tells the two methods apart using the last pair of lines drawn.
    plt.legend((line_a[-1], line_b[-1]), (label_a, label_b), loc='upper right')
    plt.axis('tight')


def plot_path(X):
    """Plot Lasso and Elastic-Net regularization paths for feature matrix X.

    X -- 2-D float array of features (rows are samples). It is standardized
         on a copy; the caller's array is not modified.

    The regression target is the notebook-level variable `y`.
    Produces three figures: Lasso vs Elastic-Net, Lasso vs positive Lasso,
    and Elastic-Net vs positive Elastic-Net. Returns nothing.
    """
    # Standardize data (easier to set the l1_ratio parameter). Done out of
    # place: the previous `X /= ...` silently rescaled the caller's array.
    X = X / X.std(axis=0)

    # Compute paths

    eps = 5e-9  # the smaller it is the longer is the path

    print("Computing regularization path using the lasso...")
    alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps, fit_intercept=False)

    print("Computing regularization path using the positive lasso...")
    alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
        X, y, eps=eps, positive=True, fit_intercept=False)
    print("Computing regularization path using the elastic net...")
    alphas_enet, coefs_enet, _ = enet_path(
        X, y, eps=eps, l1_ratio=0.8, fit_intercept=False)

    print("Computing regularization path using the positive elastic net...")
    alphas_positive_enet, coefs_positive_enet, _ = enet_path(
        X, y, eps=eps, l1_ratio=0.8, positive=True, fit_intercept=False)

    # Display results

    neg_log_alphas_lasso = -np.log10(alphas_lasso)
    neg_log_alphas_enet = -np.log10(alphas_enet)
    neg_log_alphas_positive_lasso = -np.log10(alphas_positive_lasso)
    neg_log_alphas_positive_enet = -np.log10(alphas_positive_enet)

    # One shared color cycle across all three figures, as before.
    colors = cycle(['b', 'r', 'g'])

    _plot_path_pair(1, neg_log_alphas_lasso, coefs_lasso,
                    neg_log_alphas_enet, coefs_enet, colors,
                    'Lasso', 'Elastic-Net',
                    'Lasso and Elastic-Net Paths')

    _plot_path_pair(2, neg_log_alphas_lasso, coefs_lasso,
                    neg_log_alphas_positive_lasso, coefs_positive_lasso, colors,
                    'Lasso', 'positive Lasso',
                    'Lasso and positive Lasso')

    _plot_path_pair(3, neg_log_alphas_enet, coefs_enet,
                    neg_log_alphas_positive_enet, coefs_positive_enet, colors,
                    'Elastic-Net', 'positive Elastic-Net',
                    'Elastic-Net and positive Elastic-Net')
    plt.show()
    
# Draw the Lasso / Elastic-Net regularization-path figures for the current X.
plot_path(X)