Adding relevant libraries to the project

In [1]:
from bs4 import BeautifulSoup 
import requests
import html
import random
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error

# Crawling

Creating the main function "get_data_from_goodreads_url" using BeautifulSoup and requests libaries. The function takes a URL as input and retrieves specific information from the HTML content of the webpage,the desired information is located in specific HTML elements with specific classes or attributes.

And finally, the data of that book is exported into DF



In [None]:
def get_data_from_goodreads_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        name = soup.find('h1', class_='Text Text__title1').get_text()
    except AttributeError:
        name = None

    try:
        author = soup.find('span', class_='ContributorLink__name').get_text()
    except AttributeError:
        author = None

    try:
        counts_to_extract = soup.find('div', class_='RatingStatistics__meta')
        ratings_count = counts_to_extract.find('span', attrs={'data-testid': 'ratingsCount'}).get_text()
        ratings_count_array = ratings_count.split('\xa0')
    except AttributeError:
        ratings_count_array = [None]

    try:
        counts_to_extract = soup.find('div', class_='RatingStatistics__meta')
        reviews_count = counts_to_extract.find('span', attrs={'data-testid': 'reviewsCount'}).get_text()
        reviews_count_array = reviews_count.split('\xa0')
    except AttributeError:
        reviews_count_array = [None]

    try:
        rating_to_extract = soup.find('a', class_='RatingStatistics RatingStatistics__interactive')
        rating_num = rating_to_extract.find('div', class_='RatingStatistics__rating').get_text()
    except AttributeError:
        rating_num = None

    try:
        booked_details = soup.find('div', class_='BookDetails')
        Featured_Details = booked_details.find('div', class_='FeaturedDetails')
        length = Featured_Details.find('p', attrs={'data-testid': 'pagesFormat'}).get_text()
        length_array = length.split(" ")
        length_var = length_array[0]
    except AttributeError:
        length_var = None

    try:
        date = Featured_Details.find('p', attrs={'data-testid': 'publicationInfo'}).get_text()
        date_array = date.split(", ")
        date_var = date_array[-1]
    except AttributeError:
        date_var = None

    try:
        ratings = soup.find('div', class_='RatingsHistogram RatingsHistogram__interactive')
        ratings_array = ratings.find_all('div', class_='RatingsHistogram__bar')

        five_raw = ratings_array[0].find('div', class_='RatingsHistogram__labelTotal').get_text()
        five_votes = five_raw.split(" ")[0]

        four_raw = ratings_array[1].find('div', class_='RatingsHistogram__labelTotal').get_text()
        four_votes = four_raw.split(" ")[0]

        three_raw = ratings_array[2].find('div', class_='RatingsHistogram__labelTotal').get_text()
        three_votes = three_raw.split(" ")[0]

        two_raw = ratings_array[3].find('div', class_='RatingsHistogram__labelTotal').get_text()
        two_votes = two_raw.split(" ")[0]

        first_raw = ratings_array[4].find('div', class_='RatingsHistogram__labelTotal').get_text()
        first_votes = first_raw.split(" ")[0]
    except (AttributeError, IndexError):
        five_votes = four_votes = three_votes = two_votes = first_votes = None

    data = {
        'name': [name],
        'date': [date_var],
        'author': [author],
        'length': [length_var],
        'rating count': [ratings_count_array[0]],
        'review count': [reviews_count_array[0]],
        '5 count rate': [five_votes],
        '4 count rate': [four_votes],
        '3 count rate': [three_votes],
        '2 count rate': [two_votes],
        '1 count rate': [first_votes],
        'rating': [rating_num]
    }

    df = pd.DataFrame(data)
    return df

Using function random from 1 to 1000000, we get a different URL page each loop, and every URL is a different book.

For each book we reach, we run function "get_data_from_goodreads_url" on it, and thus we collect specific data on each book individually, and appends the extracted data to a DataFrame until the DataFrame reaches 6000 rows.

In [None]:
feature_columns = ['name', 'date','author','length','rating count','review count','5 count rate','4 count rate','3 count rate','2 count rate','1 count rate','rating']
df = pd.DataFrame(columns=feature_columns)
rand = random.randrange(1, 1000000)
book = "https://www.goodreads.com/book/show/{}".format(rand)

while df.shape[0] < 6000:
    try:
        rand = random.randrange(1, 1000000)
        book = "https://www.goodreads.com/book/show/{}".format(rand)
        temp_df = get_data_from_goodreads_url(book)
        df = pd.concat([df, temp_df], ignore_index=True)
        time.sleep(6)
    except:
        continue
        

exports the DataFrame `df` to a CSV 

In [None]:
df.to_csv('booksProject.csv')

In [None]:
df

After starting the cleaning, it seemed that the amount of books decreased significantly, therefore it was decided to bring more books from the site in order to increase the DATA

In [None]:
df1 = pd.read_csv('booksProject1.csv')
df2 = pd.read_csv('booksProject2.csv')

df1 = df1.drop('Unnamed: 0', axis=1)
df2 = df2.drop('Unnamed: 0', axis=1)

df = pd.concat([df1, df2])

df.to_csv('combined_books.csv')

df.shape[0]
total_data = df.shape[0] * df.shape[1]
total_data
df.shape[1]

Getting information about the amount of data we have

In [None]:
df.info()

 ## EDA

Downloading duplicates, books that have been entered into DF more than once

In [None]:
df.drop_duplicates(inplace=True)

deleting any non-numeric value in the 'Length' column

In [None]:
df = df[pd.to_numeric(df['length'], errors='coerce').notna()]

Every column that contains information with commas we replace the comma with nothing to delete the comma, for example the number 10,000 will become 10000.

And basically every place where the number is represented in a way that is not INT - we will convert it to INT

In [None]:
attributes = ['length', 'rating count', 'review count', '5 count rate', '4 count rate', '3 count rate', '2 count rate', '1 count rate']
for column in attributes:
    df[column] = df[column].apply(lambda x: int(str(x).replace(',', '')))
    
df[attributes] = df[attributes].astype(int)

In [None]:
We will use function A to obtain descriptive statistics of D, and the distribution and shape of the numerical column distributions

In [None]:
df.describe()

In [None]:
df.info()

All rows in the df containing at least one missing value will be removed, and the DataFrame df will be updated with the modified version.

We will eliminate lines where the rating is equal to 0 since we want to examine the rating of the book, a result of 0 does not give us any data

In [None]:
df.dropna(axis=0, how='any', inplace=True)
df = df[df['rating'] != 0]

We will convert the 'date' to an int number

In [None]:
df['date'] = df['date'].astype(int)

In [None]:
df.info()

The values in the 'rating' column of df will be rounded to one decimal place, and the DataFrame df will be updated.

Get the count of each unique rating value in the 'rating' column, with the values displayed in descending order of their counts.

In [None]:
df['rating'] = df['rating'].round(decimals=1)
df['rating'].value_counts()

#Outliers handling

In the following steps we will download all the exceptions that exist in the columns: 'rating count', 'date', 'length', 'review count', using box plot which shows us in a visual way who are the outliers, and therefore we will set an upper limit and a lower limit and remove all those that are outliers.

In [None]:
sns.boxplot(x=df['rating count'])

In [None]:
lower_bound = 0  
upper_bound = 20000  

df = df[(df['rating count'] >= lower_bound) & (df['rating count'] <= upper_bound)]

In [None]:
sns.boxplot(x=df['rating count'])

In [None]:
sns.boxplot(x=df['date'])

In [None]:
lower_bound = 1800  
upper_bound = 2023  

df = df[(df['date'] >= lower_bound) & (df['date'] <= upper_bound)]

In [None]:
sns.boxplot(x=df['date'])

In [None]:
sns.boxplot(x=df['length'])

In [None]:
lower_bound = 70  
upper_bound = 900 

df = df[(df['length'] >= lower_bound) & (df['length'] <= upper_bound)]

In [None]:
sns.boxplot(x=df['length'])

In [None]:
sns.boxplot(x=df['review count'])

In [None]:
lower_bound = 0  
upper_bound = 1500  

df = df[(df['review count'] >= lower_bound) & (df['review count'] <= upper_bound)]

In [None]:
sns.boxplot(x=df['review count'])

In [None]:
df['review count'].value_counts()

removes a random subset of rows with a 'review count' value of 0 from the DataFrame df, limiting the number of rows dropped to the limit value if it exceeds the number of rows with a 'review count' of 0. The printed output displays the updated value counts of the 'review count' column after the rows are dropped.

In [None]:
limit = 650

zero_indices = df[df['review count'] == 0].index
sampled_indices = np.random.choice(zero_indices, size=min(limit, len(zero_indices)), replace=False)
df.drop(sampled_indices, inplace=True)

print(df['review count'].value_counts())


Getting information about the amount of data we have

In [None]:
df.info()

This code generates a histogram to visualize the distribution of ratings in the DataFrame df. The x-axis represents the rating values, and the y-axis represents the number of books falling within each rating range.

In [None]:
plt.hist(df['rating'], bins=5)
plt.xlabel('rating')
plt.ylabel('number of books')
plt.show()
df.shape[0]

The code filters the DataFrame df to include only rows with a rating of 4.0 and limits the number of rows to 300. Any excess rows beyond this limit are dropped from the DataFrame.

In [None]:
rating_4_indices = df[df['rating'] == 4.0].index
excess_indices = rating_4_indices[:300]
df.drop(excess_indices, inplace=True)

In [None]:
plt.hist(df['rating'], bins=5)
plt.xlabel('rating')
plt.ylabel('number of books')
plt.show()
df.shape[0]

In [None]:
df['rating'].value_counts()

Count the number of books in each length category,Data definition and display of the pie

In [None]:
length_counts = df['length'].value_counts().head(10)

# Plot the pie chart
plt.pie(length_counts, labels=length_counts.index, autopct='%1.1f%%')

plt.title('Distribution of Book Length')
plt.axis('equal')

plt.show()

We can see the most common length of the books is 256, then 192

By using 'Scatter Plot' we present the ratio between the number of ratings and the number of pages of the book

In [None]:
x = df['length']
y = df['rating']

plt.scatter(x, y)

plt.xlabel('Length')
plt.ylabel('Rating')
plt.title('Scatter Plot of Length vs Rating')

plt.show()

Here we can see the relationship between the length and the ratings, that actually the fewer pages there are the fewer ratings, which indicates a preference for shorter books

By using scatter plot again we want to present the relationship between the number of ratings and the final rating of the book

In [None]:
import matplotlib.pyplot as plt

x = df['rating']
y = df['rating count']

plt.scatter(x, y)

plt.xlabel('rating')
plt.ylabel('Rating Count')
plt.title('Scatter Plot of rating vs Rating Count')

plt.show()

We can see that the most highly rated books are the books rated between 3.5 and 4.5

In [None]:
x = df['review count']
y = df['rating count']

plt.scatter(x, y)

plt.xlabel('review count')
plt.ylabel('Rating count')
plt.title('Scatter Plot of review count vs Rating count')

plt.show()

Here we can see that when there is less 'Rating count' there is less 'Reviews count'

Using BoxPlot we want to visualize the distribution of the 'rating' column in the DataFrame.

In [None]:
df.boxplot(column='rating')
plt.xlabel('rating')
plt.ylabel('counts')
plt.show()

It seems that from 2.5 to 5 these are the most common and below 2.0 there are fewer ratings

In [None]:
columns = ['length', 'rating', 'rating count' ,'review count','5 count rate','4 count rate','3 count rate','2 count rate','1 count rate']
selected_df = df[columns]

correlation_matrix = selected_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.1%', cmap='coolwarm', cbar=True)

plt.title('Correlation Matrix with Percentage')

plt.show()

These lines set the title for the plot and display the heatmap.

The resulting plot shows the correlations between the selected columns, with higher values indicating stronger correlations.

In this case, the only strong correlation is between 'rating count' and 'review count', which is logical because if a user rates the book, most of the time he will also review it.

In [None]:
df.describe()

In [None]:
df.info()

We want to save all the data we changed to a new DF

In [None]:
df.to_csv('df_after_changes.csv', index=False) 

# Machine Learning

In [None]:
df = pd.read_csv('df_after_changes.csv')
#df = df.drop('Unnamed: 0', axis=1)

In [None]:
df.info()

In [None]:
label_encoder = LabelEncoder()
df['name'] = label_encoder.fit_transform(df['name'])
df['author'] = label_encoder.fit_transform(df['author'])

In [None]:
Splitting the data as we learned throughout the course

In [None]:
X = df.drop('rating', axis=1)
y = df['rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Defining the parameters we want to test and creating a pipeline as documentation suggested

In [None]:
df.shape[0]*df.shape[1]

In [None]:
linear_param_grid = {
    'linearregression__fit_intercept': [True, False],
}

rf_param_grid = {
    'randomforestregressor__n_estimators': [100, 200, 300],
    'randomforestregressor__max_depth': [None, 5, 10],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
}

linear_model = make_pipeline(StandardScaler(), LinearRegression())

rf_model = make_pipeline(StandardScaler(), RandomForestRegressor())

Here we performs grid search for both Linear Regression and Random Forest Regression models. It uses scikit-learn's GridSearchCV function to search for the best combination

In [None]:
linear_grid_search = GridSearchCV(estimator=linear_model, param_grid=linear_param_grid, cv=3)
linear_grid_search.fit(X_train, y_train)
best_linear_model = linear_grid_search.best_estimator_

rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=3)
rf_grid_search.fit(X_train, y_train)
#best_rf_model = rf_grid_search.best_estimator_

In [None]:
linear_predictions = best_linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_rmse = linear_mse ** 0.5
linear_r2 = r2_score(y_test, linear_predictions)

rf_predictions = best_rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = rf_mse ** 0.5
rf_r2 = r2_score(y_test, rf_predictions)

### Printing the evaluation metrics of both models

In [None]:
print("Linear Regression - RMSE:", linear_rmse, "R2:", linear_r2)
print("Best Linear Regression hyperparameters:", linear_grid_search.best_params_)

print("Random Forest Regression - RMSE:", rf_rmse, "R2:", rf_r2)
print("Best Random Forest hyperparameters:", rf_grid_search.best_params_)

Linear Regression - RMSE: 0.5193198963157538 R2: 0.036852363321188575
Best Linear Regression hyperparameters: {'linearregression__fit_intercept': True}
Random Forest Regression - RMSE: 0.06531829982154097 R2: 0.9847632380058123
Best Random Forest hyperparameters: {'randomforestregressor__max_depth': None, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 200}

In [None]:
method = 'Random Forest Regression'

r2 = 0.9847632380058123
rmse = 0.06531829982154097

bar_width = 0.35

bar_positions = np.arange(2)

plt.bar(bar_positions, [r2, rmse], bar_width, align='center')

plt.xlabel('Metrics')
plt.ylabel('Value')
plt.title('Comparison of R2 and RMSE - Random Forest Regression')
plt.xticks(bar_positions, ['R2', 'RMSE'])

plt.show()

In [None]:
method = 'Linear Regression'

r2 = 0.036852363321188575
rmse =  0.5193198963157538

bar_width = 0.35

bar_positions = np.arange(2)

plt.bar(bar_positions, [r2, rmse], bar_width, align='center')

plt.xlabel('Metrics')
plt.ylabel('Value')
plt.title('Comparison of R2 and RMSE - Linear Regression')
plt.xticks(bar_positions, ['R2', 'RMSE'])

plt.show()

In [None]:
from tabulate import tabulate

data = [
    ['Random Forest Regression', 0.9847632380058123, 0.06531829982154097],
    ['Linear Regression', 0.036852363321188575, 0.5193198963157538]
]

headers = ['Model', 'R2', 'RMSE']

colored_headers = [Back.CYAN+ Fore.BLACK + header + Style.RESET_ALL for header in headers]

table = tabulate(data, colored_headers, tablefmt='fancy_grid')
print(table)
