Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate a split sitemap (also fix robots.txt) (#4639) #4701

Merged
merged 1 commit into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/4638.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Generate a split sitemap @reebalazs
39 changes: 36 additions & 3 deletions src/express-middleware/sitemap.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
import express from 'express';
import { generateSitemap } from '@plone/volto/helpers';
import {
generateSitemap,
generateSitemapIndex,
SITEMAP_BATCH_SIZE,
} from '@plone/volto/helpers/Sitemap/Sitemap';

export const sitemap = function (req, res, next) {
generateSitemap(req).then((sitemap) => {
let start = 0;
let size = undefined;
const { batch: batchStr } = req.params;
if (batchStr !== undefined) {
const batch = parseInt(batchStr);
if (isNaN(batch) || batch === 0 || '' + batch !== batchStr) {
res.status(404);
// Some data, such as the internal API address, may be sensitive to be published
res.send(
`Invalid sitemap name, use sitemap.xml.gz, or batched sitemapN.xml.gz where N is a positive integer.`,
);
return;
}
start = SITEMAP_BATCH_SIZE * (batch - 1);
size = SITEMAP_BATCH_SIZE;
}
generateSitemap(req, start, size).then((sitemap) => {
if (Buffer.isBuffer(sitemap)) {
res.set('Content-Type', 'application/x-gzip');
res.set('Content-Disposition', 'attachment; filename="sitemap.xml.gz"');
res.set(
'Content-Disposition',
`attachment; filename="sitemap${batchStr || ''}.xml.gz"`,
);
res.send(sitemap);
} else {
// {"errno":-111, "code":"ECONNREFUSED", "host": ...}
Expand All @@ -16,10 +39,20 @@ export const sitemap = function (req, res, next) {
});
};

/**
 * Express handler that serves the sitemap index as `sitemap-index.xml`.
 *
 * `generateSitemapIndex` resolves with the XML string on success, but with the
 * backend response body or the error object when the backend request fails.
 * Only a string is sent as XML; anything else is answered with a 500 and a
 * short message, so that details of the failure are not published.
 * @function sitemapIndex
 * @param {Object} req Express request
 * @param {Object} res Express response
 * @param {Function} next Called with the error if index generation rejects
 * @returns {undefined}
 */
export const sitemapIndex = function (req, res, next) {
  generateSitemapIndex(req)
    .then((index) => {
      if (typeof index === 'string') {
        res.set('Content-Type', 'application/xml');
        res.set(
          'Content-Disposition',
          'attachment; filename="sitemap-index.xml"',
        );
        res.send(index);
        return;
      }
      // Not a string: the backend call failed. Expose only the error code,
      // never the raw object.
      const code = (index && index.code) ?? '-';
      res.status(500);
      res.send(`Sitemap index generation error: ${code}`);
    })
    // Do not leave the request hanging if generation rejects.
    .catch(next);
};

/**
 * Create the Express router exposing the sitemap endpoints.
 *
 * Registers, in this order, the full sitemap, the batched sitemaps
 * (`sitemapN.xml.gz`) and the sitemap index, each for all HTTP methods.
 * @returns {Object} Express router with its `id` set to 'sitemap.xml.gz'
 */
export default function () {
  const router = express.Router();

  // [path pattern, handler] pairs; registration order is kept as listed.
  const routes = [
    ['**/sitemap.xml.gz', sitemap],
    ['**/sitemap:batch.xml.gz', sitemap],
    ['**/sitemap-index.xml', sitemapIndex],
  ];
  for (const [pattern, handler] of routes) {
    router.all(pattern, handler);
  }

  router.id = 'sitemap.xml.gz';
  return router;
}
15 changes: 9 additions & 6 deletions src/helpers/Robots/Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy';
*/
export const generateRobots = (req) =>
new Promise((resolve) => {
//const url = `${req.protocol}://${req.get('Host')}`;
const request = superagent.get(
`${
config.settings.internalApiPath ?? config.settings.apiPath
}/robots.txt`,
);
const internalUrl =
config.settings.internalApiPath ?? config.settings.apiPath;
const request = superagent.get(`${internalUrl}/robots.txt`);
request.set('Accept', 'text/plain');
const authToken = req.universalCookies.get('auth_token');
if (authToken) {
Expand All @@ -31,6 +28,12 @@ export const generateRobots = (req) =>
if (error) {
resolve(text || error);
} else {
// Plone has probably returned the sitemap link with the internal url.
// If so, let's replace it with the current one.
const url = `${req.protocol}://${req.get('Host')}`;
text = text.replace(internalUrl, url);
// Replace the sitemap with the sitemap index.
text = text.replace('sitemap.xml.gz', 'sitemap-index.xml');
resolve(text);
}
});
Expand Down
46 changes: 44 additions & 2 deletions src/helpers/Sitemap/Sitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,23 @@ import { addHeadersFactory } from '@plone/volto/helpers/Proxy/Proxy';

import config from '@plone/volto/registry';

// Number of search results per batched sitemap (sitemapN.xml.gz); the sitemap
// index divides the total result count by this value to list the batches.
export const SITEMAP_BATCH_SIZE = 5000;

/**
* Generate sitemap
* @function generateSitemap
* @param {Object} _req Request object
* @return {string} Generated sitemap
*/
export const generateSitemap = (_req) =>
export const generateSitemap = (_req, start = 0, size = undefined) =>
new Promise((resolve) => {
const { settings } = config;
const APISUFIX = settings.legacyTraverse ? '' : '/++api++';
const apiPath = settings.internalApiPath ?? settings.apiPath;
const request = superagent.get(
`${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=100000000&use_site_search_settings=1`,
`${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_start=${start}&b_size=${
size !== undefined ? size : 100000000
}&use_site_search_settings=1`,
);
request.set('Accept', 'application/json');
request.use(addHeadersFactory(_req));
Expand All @@ -50,3 +54,41 @@ export const generateSitemap = (_req) =>
}
});
});

/**
 * Generate sitemap index
 *
 * Queries the backend `@search` endpoint with `b_size=0` and reads only
 * `items_total` from the response, then lists one `/sitemapN.xml.gz` entry
 * per SITEMAP_BATCH_SIZE results (N starts at 1).
 * @function generateSitemapIndex
 * @param {Object} _req Request object
 * @return {Promise<string|Object>} Resolves with the generated sitemap index
 * XML. If the backend request fails, resolves (never rejects) with the
 * response body, or with the error when there is no body.
 */
export const generateSitemapIndex = (_req) =>
  new Promise((resolve) => {
    const { settings } = config;
    const APISUFIX = settings.legacyTraverse ? '' : '/++api++';
    const apiPath = settings.internalApiPath ?? settings.apiPath;
    const request = superagent.get(
      `${apiPath}${APISUFIX}/@search?metadata_fields=modified&b_size=0&use_site_search_settings=1`,
    );
    request.set('Accept', 'application/json');
    // Apply the same request plugin generateSitemap uses, so the count query
    // and the batch queries are sent with the same headers.
    // NOTE(review): presumably forwards headers from the incoming request;
    // confirm against the Proxy helper.
    request.use(addHeadersFactory(_req));
    const authToken = _req.universalCookies.get('auth_token');
    if (authToken) {
      request.set('Authorization', `Bearer ${authToken}`);
    }
    request.end((error, { body } = {}) => {
      if (error) {
        resolve(body || error);
        return;
      }
      const batchCount = Math.ceil(body.items_total / SITEMAP_BATCH_SIZE);
      const items = Array.from({ length: batchCount }, (_, i) =>
        [
          ' <sitemap>',
          `<loc>${toPublicURL('/sitemap' + (i + 1) + '.xml.gz')}</loc>`,
          '</sitemap>',
        ].join('\n'),
      );
      const result = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
        `${items.join('\n')}\n</sitemapindex>`,
      ].join('\n');
      resolve(result);
    });
  });