# How to Fix - UnicodeDecodeError: invalid start byte - during read_csv in Pandas

In [1]:
# Dump the raw file to the terminal to inspect its contents before loading it with pandas.
!cat '../data/csv/file_utf-16.csv'

��a , b , c 
 1 , 2 , 3 

## Step 2: Solution of UnicodeDecodeError: change read_csv encoding 

In [2]:
import pandas as pd
import traceback

# Reproduce the failure: read_csv decodes as UTF-8 by default, which this file is not.
try:
    df = pd.read_csv('../data/csv/file_utf-16.csv')
except UnicodeDecodeError:
    # Print only the innermost frame to keep the output short. No exit() call:
    # the session stays alive so the remaining cells still run.
    traceback.print_exc(limit=1)

Traceback (most recent call last):
  File "/tmp/ipykernel_18673/2588436354.py", line 6, in <module>
    df = pd.read_csv('../data/csv/file_utf-16.csv')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


In [3]:
# Pass the file's actual encoding explicitly so pandas no longer tries UTF-8.
utf16_csv_path = '../data/csv/file_utf-16.csv'
df = pd.read_csv(utf16_csv_path, encoding='utf-16')
df

Unnamed: 0,a,b,c
0,1,2,3


In [4]:
# Ask the `file` utility to identify the file's type and text encoding.
!file '../data/csv/file_utf-16.csv'

../data/csv/file_utf-16.csv: Little-endian UTF-16 Unicode text


In [5]:
# The repr of the text handle shows which encoding open() picked when none is given.
with open('../data/csv/file_utf-16.csv') as text_handle:
    print(text_handle)

<_io.TextIOWrapper name='../data/csv/file_utf-16.csv' mode='r' encoding='UTF-8'>


## Step 3: Solution of UnicodeDecodeError: skip encoding errors with encoding_errors='ignore'

In [6]:
from pathlib import Path
import pandas as pd

# Create a tiny CSV whose first byte (0xe4) cannot be decoded as UTF-8.
invalid_utf8_payload = b"\xe4\na\n1"
file = Path('../data/csv/file_utf-8.csv')
file.write_bytes(invalid_utf8_payload)

# With encoding_errors='ignore' the undecodable byte is dropped instead of raising.
df = pd.read_csv(
    file,
    encoding_errors='ignore',
)
df

Unnamed: 0,a
0,1


In [7]:
import pandas as pd
import traceback

# encoding_errors='strict' makes the undecodable byte raise instead of being dropped.
try:
    df = pd.read_csv(file, encoding_errors='strict')
except UnicodeDecodeError:
    # Print only the innermost frame to keep the output short. No exit() call:
    # the session stays alive so the remaining cells still run.
    traceback.print_exc(limit=1)

Traceback (most recent call last):
  File "/tmp/ipykernel_18673/2599847558.py", line 6, in <module>
    df = pd.read_csv(file, encoding_errors='strict')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 0: invalid continuation byte


In [8]:
import pandas as pd
import traceback

# Same read with no encoding arguments at all, to show the default behaviour.
try:
    df = pd.read_csv(file)
except UnicodeDecodeError:
    # Print only the innermost frame to keep the output short. No exit() call:
    # the session stays alive so the remaining cells still run.
    traceback.print_exc(limit=1)

Traceback (most recent call last):
  File "/tmp/ipykernel_18673/1676851375.py", line 6, in <module>
    df = pd.read_csv(file)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 0: invalid continuation byte


## Step 4: Solution of UnicodeDecodeError: fix encoding errors with unicode_escape

In [9]:
# Re-read the same file with the 'unicode_escape' codec instead of the default UTF-8.
# The result is stored in df but not displayed by this cell.
df = pd.read_csv(
    file,
    encoding='unicode_escape',
)

In [10]:
# In a str literal, \xe4 is the single character 'ä', so the displayed repr shows it unescaped.
"""\xe4\na\n1"""

'ä\na\n1'

In [11]:
# Encoding with 'unicode-escape' yields bytes in which 'ä' and the newlines are
# spelled out as backslash escape sequences rather than stored as raw characters.
"""\xe4\na\n1""".encode('unicode-escape')

b'\\xe4\\na\\n1'