# Determine the UTF8 storage requirements for all tables and string columns in the database in scope

The script below will determine the UTF8 storage requirements for all tables and string columns in the database in scope, and whether the defined data type length has to be changed when converting to UTF-8. The result will show the table and column names that map to CHAR, NCHAR, VARCHAR, and NVARCHAR data types (including UDTs), their current type size definition, actual byte size, and projected byte size once encoded into UTF-8. 

For example, for the column **EmailAddress** of table **Application.People**, the data type definition is **nvarchar(512)** and the data length for the largest string found in that column is 66 bytes. If changed to **varchar(512)** under a ***_UTF8*** collation, the data length would be reduced to 33 bytes. In this case, converting the column's collation and data type (without changing the size definition) will not cause data loss. 

However, if the projected byte size for the column using UTF-8 would be larger than the current data type definition, then the defined data type length would also have to be enlarged. For example, if a column were originally defined as **nvarchar(100)**<sup>1</sup> and the projected byte size for the column in UTF-8 were 120, then changing the column type to **varchar(100)**<sup>2</sup> would result in data loss. The column would have to be defined as at least **varchar(120)**.

<sup>1</sup> The length is expressed in byte-pairs, meaning the column is capable of storing up to 200 bytes of information  
<sup>2</sup> The length is expressed in bytes, meaning the column is capable of storing up to 100 bytes of information

In [7]:
-- Suppress the "(n rows affected)" messages for the rest of the session.
SET NOCOUNT ON;
GO

-- Database whose string columns will be analysed.
USE [WideWorldImporters];
GO

-- Remove any work table left over from a previous run in this session.
DROP TABLE IF EXISTS #tmpObjects;
GO

-- Work table: one row per string column to be measured.
--   ObjectName       quoted [schema].[table] name
--   ColumnName       quoted [column] name
--   ColumnType       name of the column's (possibly user-defined) type
--   DefinedTypeSize  declared size taken from sys.columns.max_length
--   ActualMaxBytes   largest DATALENGTH currently stored (filled in later)
--   UTF8BytesNeeded  largest DATALENGTH once encoded as UTF-8 (filled in later)
--   isdone           0 until the column has been measured, then 1
-- NOTE(review): ObjectName is sysname (128 characters); a quoted schema.table
-- string can exceed that for very long identifiers -- confirm this is acceptable.
CREATE TABLE #tmpObjects (
	ObjectName sysname NOT NULL,
	ColumnName sysname NOT NULL,
	ColumnType sysname NOT NULL,
	DefinedTypeSize smallint NOT NULL,
	ActualMaxBytes smallint NULL,
	UTF8BytesNeeded smallint NULL,
	[isdone] bit NOT NULL,
	-- The primary key is intentionally left unnamed: constraint names on temp
	-- tables must be unique across tempdb, so an explicit name makes the script
	-- fail when two sessions run it at the same time.
	-- IGNORE_DUP_KEY = ON discards rows with a duplicate key (with a warning)
	-- instead of failing the whole INSERT.
	PRIMARY KEY NONCLUSTERED (ObjectName, ColumnName)
		WITH (IGNORE_DUP_KEY = ON)
	);

-- Seed the work table with every visible, fixed-size string column of every
-- user table in the current database. The byte counts are left NULL and
-- isdone is set to 0; they are filled in by the measuring loop that follows.
INSERT INTO #tmpObjects
	(ObjectName, ColumnName, ColumnType, DefinedTypeSize, ActualMaxBytes, UTF8BytesNeeded, [isdone])
SELECT
	QUOTENAME(SS.[name]) + '.' + QUOTENAME(STbl.[name]),	-- [schema].[table]
	QUOTENAME(SC.[name]),									-- [column]
	ST.[name],												-- type name (UDT name for UDTs)
	SC.max_length,											-- declared size
	NULL,
	NULL,
	0
FROM sys.columns AS SC
INNER JOIN sys.types AS ST ON SC.user_type_id = ST.user_type_id
INNER JOIN sys.tables AS STbl ON STbl.[object_id] = SC.[object_id]
INNER JOIN sys.schemas AS SS ON STbl.[schema_id] = SS.[schema_id]
WHERE STbl.[type] = 'U'
	AND STbl.is_ms_shipped = 0
	-- NOTE(review): optional filter on temporal tables, disabled in the original; confirm intent before enabling.
	--AND STbl.temporal_type IN (0,1)
	-- Base types: 167 = varchar, 175 = char, 231 = nvarchar, 239 = nchar.
	-- Filtering on system_type_id also picks up user-defined types built on them.
	AND ST.system_type_id IN (167, 175, 231, 239)
	AND ST.[name] <> 'sysname'
	AND SC.is_hidden = 0
	-- max_length is -1 for (max) types, so this keeps only fixed-size definitions.
	AND SC.max_length > 0;

-- Measure every column listed in #tmpObjects: for each one, run a dynamic
-- query that returns the largest stored byte length and the largest byte
-- length the same data would need when encoded as UTF-8, then record both
-- values and mark the row as done.
DECLARE @OName sysname,
	@CName sysname,
	@CurrBytes smallint,
	@UTF8Bytes smallint,
	@sqlcmd NVARCHAR(4000),
	@params NVARCHAR(200),	-- roomy on purpose: the literal below is exactly 60 characters
	@cnt int,
	@maxcnt int;

SELECT @maxcnt = COUNT(*) FROM #tmpObjects;
SET @cnt = 0;
SET @params = N'@CurrBytesOut smallint OUTPUT, @UTF8BytesOut smallint OUTPUT';

WHILE @cnt < @maxcnt
BEGIN
	-- Pick the next unmeasured column; ORDER BY makes the processing order deterministic.
	SELECT TOP (1) @OName = ObjectName, @CName = ColumnName
	FROM #tmpObjects
	WHERE isdone = 0
	ORDER BY ObjectName, ColumnName;

	-- Reset the outputs so a failed dynamic statement cannot leave the
	-- previous column's values to be recorded against this one.
	SET @CurrBytes = NULL;
	SET @UTF8Bytes = NULL;

	-- @OName and @CName were built with QUOTENAME, so they are safe to concatenate.
	-- The UTF-8 projection goes through NVARCHAR(MAX) and applies the UTF-8
	-- collation BEFORE the cast to VARCHAR: casting to VARCHAR first would
	-- convert to the source collation's code page and replace characters outside
	-- it with '?', under-reporting the size. The (MAX) targets avoid silently
	-- truncating long values. NOLOCK avoids blocking on busy tables at the cost
	-- of possibly reading uncommitted data.
	SET @sqlcmd = N'SELECT @CurrBytesOut = MAX(DATALENGTH(' + @CName + N')), '
		+ N'@UTF8BytesOut = MAX(DATALENGTH(CAST(CAST(' + @CName + N' AS NVARCHAR(MAX)) COLLATE Latin1_General_100_CI_AI_SC_UTF8 AS VARCHAR(MAX)))) '
		+ N'FROM ' + @OName + N' WITH (NOLOCK)';

	EXEC sp_executesql @sqlcmd, @params, @CurrBytesOut = @CurrBytes OUTPUT, @UTF8BytesOut = @UTF8Bytes OUTPUT;

	-- Both values stay NULL when the table is empty or the column holds only NULLs.
	UPDATE #tmpObjects
	SET ActualMaxBytes = @CurrBytes, UTF8BytesNeeded = @UTF8Bytes, isdone = 1
	WHERE ObjectName = @OName AND ColumnName = @CName;

	SET @cnt = @cnt + 1;
END;

-- Final report: one row per analysed column. A column needs a larger size
-- definition when UTF8BytesNeeded exceeds the byte capacity of the intended
-- varchar/char definition. Explicit columns and ORDER BY keep the output stable.
SELECT
	ObjectName,
	ColumnName,
	ColumnType,
	DefinedTypeSize,
	ActualMaxBytes,
	UTF8BytesNeeded,
	[isdone]
FROM #tmpObjects
ORDER BY ObjectName, ColumnName;

ObjectName,ColumnName,ColumnType,DefinedTypeSize,ActualMaxBytes,UTF8BytesNeeded,isdone
[Application].[Cities],[CityName],nvarchar,100,70.0,35.0,1
[Application].[Cities_Archive],[CityName],nvarchar,100,32.0,16.0,1
[Application].[Countries],[Continent],nvarchar,60,46.0,23.0,1
[Application].[Countries],[CountryName],nvarchar,120,42.0,23.0,1
[Application].[Countries],[CountryType],nvarchar,40,30.0,15.0,1
[Application].[Countries],[FormalName],nvarchar,120,104.0,52.0,1
[Application].[Countries],[IsoAlpha3Code],nvarchar,6,6.0,3.0,1
[Application].[Countries],[Region],nvarchar,60,16.0,8.0,1
[Application].[Countries],[Subregion],nvarchar,60,50.0,25.0,1
[Application].[Countries_Archive],[Continent],nvarchar,60,26.0,13.0,1
