<a href="https://colab.research.google.com/github/shake/colab-Llama-2-ipynb/blob/main/YT_Multilingual_LLaMA2_vs_other_models_Chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q datasets sentencepiece

# LLaMA 2, Multilingual Models & Fine Tuning

In [3]:
# import 密钥，token
%%capture
from google.colab import userdata
hf_token = userdata.get('huggingface')
!git config --global credential.helper store
!huggingface-cli login --token $hf_token --add-to-git-credential

## LLaMA2 7B Chat


In [4]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:

# Load the LLaMA 2 chat tokenizer using the locally stored Hugging Face credentials.
# `token=True` replaces the deprecated `use_auth_token=True` argument.
tokenizer = AutoTokenizer.from_pretrained("chenshake/Llama-2-7b-chat-hf",
                                          token=True,
                                          )

# Model loading is left disabled here; kept as a reference for how to load the
# weights (optionally quantised) when they are needed.
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
#                                              device_map='auto',
#                                              torch_dtype=torch.float16,
#                                              token=True,
#                                             #  load_in_8bit=True,
#                                             #  load_in_4bit=True
#                                              )

In [6]:
tokenizer.vocab_size

32000

In [7]:
tokenizer.tokenize('This is a tokenizer test')

['▁This', '▁is', '▁a', '▁token', 'izer', '▁test']

In [8]:
tokenizer('This is a tokenizer test')

{'input_ids': [1, 910, 338, 263, 5993, 3950, 1243], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenizer.tokenize('this is a Tokenizer test')

['▁this', '▁is', '▁a', '▁Token', 'izer', '▁test']

In [10]:
tokenizer('this is a Tokenizer test')

{'input_ids': [1, 445, 338, 263, 25159, 3950, 1243], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokenizer.decode([1, 910, 338, 263, 5993, 3950, 1243])

'<s> This is a tokenizer test'

# LLaMA 2 tokenizer

Is LLaMA 2 good to fine-tune for a particular language?


### English

In [13]:
tokenizer.tokenize('This is a tokenizer test')

['▁This', '▁is', '▁a', '▁token', 'izer', '▁test']

In [14]:
tokenizer.tokenize('My name is Sam')

['▁My', '▁name', '▁is', '▁Sam']

### French

mon nom est Sam

In [15]:
tokenizer.tokenize('mon nom est Sam')

['▁mon', '▁nom', '▁est', '▁Sam']

### Thai

In [16]:
# Tokenize the Thai sentence once and display the pieces with their count (14 tokens).
thai_tokens = tokenizer.tokenize('ผมชื่อแซม')
thai_tokens, len(thai_tokens)

(['▁',
  '<0xE0>',
  '<0xB8>',
  '<0x9C>',
  'ม',
  'ช',
  'ื',
  '่',
  'อ',
  'แ',
  '<0xE0>',
  '<0xB8>',
  '<0x8B>',
  'ม'],
 14)

### Greek

Το όνομα μου είναι Σαμ

In [17]:
# Tokenize the Greek sentence once and display the pieces with their count.
greek_tokens = tokenizer.tokenize('Το όνομα μου είναι Σαμ')
greek_tokens, len(greek_tokens)

(['▁',
  'Τ',
  'ο',
  '▁',
  'ό',
  'ν',
  'ο',
  'μ',
  'α',
  '▁',
  'μ',
  'ο',
  'υ',
  '▁',
  'ε',
  'ί',
  'ν',
  'α',
  'ι',
  '▁',
  'Σ',
  'α',
  'μ'],
 23)

### Spanish

In [18]:
# Tokenize the Spanish sentence once and display the pieces with their count.
spanish_tokens = tokenizer.tokenize('Me llamo Sam')
spanish_tokens, len(spanish_tokens)

(['▁Me', '▁llam', 'o', '▁Sam'], 4)

### Chinese

我的名字叫山姆

In [19]:
# Tokenize the Chinese sentence once and display the pieces with their count.
chinese_tokens = tokenizer.tokenize('我的名字叫山姆')
chinese_tokens, len(chinese_tokens)

(['▁',
  '我',
  '的',
  '名',
  '字',
  '<0xE5>',
  '<0x8F>',
  '<0xAB>',
  '山',
  '<0xE5>',
  '<0xA7>',
  '<0x86>'],
 12)

# Bloom Tokenizer

In [20]:
bloom_tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom')

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [21]:
bloom_tokenizer.vocab_size

250680

### English

In [22]:
bloom_tokenizer.tokenize('This is a tokenizer test')

['This', 'Ġis', 'Ġa', 'Ġtoken', 'izer', 'Ġtest']

In [None]:
bloom_tokenizer.tokenize('My name is Sam')

['My', 'Ġname', 'Ġis', 'ĠSam']

### French

mon nom est Sam

In [None]:
bloom_tokenizer.tokenize('mon nom est Sam')

['mon', 'Ġnom', 'Ġest', 'ĠSam']

### Thai

In [None]:
bloom_tokenizer.tokenize('ผมชื่อแซม')

['à¸', 'ľ', 'à¸¡', 'à¸Ĭ', 'à¸·', 'à¹Ī', 'à¸Ń', 'à¹ģ', 'à¸', 'ĭ', 'à¸¡']

### Greek

Το όνομα μου είναι Σαμ

In [None]:
bloom_tokenizer.tokenize('Το όνομα μου είναι Σαμ')

['Î¤',
 'Î¿',
 'ĠÏ',
 'Į',
 'Î½',
 'Î¿',
 'Î¼Î±',
 'ĠÎ¼',
 'Î¿Ïħ',
 'ĠÎµ',
 'Î¯Î½',
 'Î±Î¹',
 'ĠÎ£',
 'Î±',
 'Î¼']

### Spanish

In [None]:
bloom_tokenizer.tokenize('Me llamo Sam')

['Me', 'Ġll', 'amo', 'ĠSam']

### Chinese

我的名字叫山姆

In [None]:
bloom_tokenizer.tokenize('我的名字叫山姆')

['æĪĳçļĦ', 'åĲįåŃĹ', 'åı«', 'å±±å§Ĩ']

# ChatGLM2-6B

In [None]:
# Load the ChatGLM2-6B tokenizer.
# NOTE(review): trust_remote_code=True runs tokenizer code downloaded from the
# model repository (tokenization_chatglm.py, per the download log below);
# consider pinning a revision so a changed file is not picked up silently.
glm2_tokenizer = AutoTokenizer.from_pretrained('THUDM/chatglm2-6b', trust_remote_code=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading (…)enization_chatglm.py:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

In [None]:
glm2_tokenizer.vocab_size

64794

### English

In [None]:
glm2_tokenizer.tokenize('This is a tokenizer test')

['▁This', '▁is', '▁a', '▁token', 'izer', '▁test']

In [None]:
glm2_tokenizer.tokenize('My name is Sam')

['▁My', '▁name', '▁is', '▁Sam']

### French

mon nom est Sam

In [None]:
glm2_tokenizer.tokenize('mon nom est Sam')

['▁mon', '▁nom', '▁est', '▁Sam']

### Thai

In [None]:
glm2_tokenizer.tokenize('ผมชื่อแซม')

['▁', 'ผ', 'ม', 'ช', 'ื', '่', 'อ', 'แ', '<0xE0>', '<0xB8>', '<0x8B>', 'ม']

### Greek

Το όνομα μου είναι Σαμ

In [None]:
glm2_tokenizer.tokenize('Το όνομα μου είναι Σαμ')

['▁',
 'Τ',
 'ο',
 '▁',
 'ό',
 'ν',
 'ο',
 'μ',
 'α',
 '▁μ',
 'ο',
 'υ',
 '▁',
 'ε',
 'ί',
 'ν',
 'α',
 'ι',
 '▁',
 'Σ',
 'α',
 'μ']

### Spanish

In [None]:
glm2_tokenizer.tokenize('Me llamo Sam')

['▁Me', '▁l', 'lam', 'o', '▁Sam']

### Chinese

我的名字叫山姆

In [None]:
glm2_tokenizer.tokenize('我的名字叫山姆')

['▁我的', '名字', '叫', '山', '姆']

# MT5

In [None]:
mt5_tokenizer = AutoTokenizer.from_pretrained('google/mt5-base')

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly.


In [None]:
mt5_tokenizer.vocab_size

250100

### English

In [None]:
mt5_tokenizer.tokenize('This is a tokenizer test')

['▁This', '▁is', '▁', 'a', '▁', 'token', 'izer', '▁test']

In [None]:
mt5_tokenizer.tokenize('My name is Sam')

['▁My', '▁name', '▁is', '▁Sam']

### French

mon nom est Sam

In [None]:
mt5_tokenizer.tokenize('mon nom est Sam')

['▁mon', '▁nom', '▁est', '▁Sam']

### Thai

In [None]:
mt5_tokenizer.tokenize('ผมชื่อแซม')

['▁ผม', 'ชื่อ', 'แซ', 'ม']

### Greek

Το όνομα μου είναι Σαμ

In [None]:
mt5_tokenizer.tokenize('Το όνομα μου είναι Σαμ')

['▁Το', '▁', 'ό', 'νομα', '▁μου', '▁είναι', '▁Σα', 'μ']

### Spanish

In [None]:
mt5_tokenizer.tokenize('Me llamo Sam')

['▁Me', '▁llam', 'o', '▁Sam']

### Chinese

我的名字叫山姆

In [None]:
mt5_tokenizer.tokenize('我的名字叫山姆')

['▁', '我', '的名字', '叫', '山', '姆']

# togethercomputer/RedPajama-INCITE-7B-Base

In [None]:
redpajama_incite_tokenizer = AutoTokenizer.from_pretrained('togethercomputer/RedPajama-INCITE-7B-Base')

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
redpajama_incite_tokenizer.vocab_size

50254

### English

In [None]:
redpajama_incite_tokenizer.tokenize('This is a tokenizer test')

['This', 'Ġis', 'Ġa', 'Ġtoken', 'izer', 'Ġtest']

In [None]:
redpajama_incite_tokenizer.tokenize('My name is Sam')

['My', 'Ġname', 'Ġis', 'ĠSam']

### French

mon nom est Sam

In [None]:
redpajama_incite_tokenizer.tokenize('mon nom est Sam')

['mon', 'Ġnom', 'Ġest', 'ĠSam']

### Thai

In [None]:
redpajama_incite_tokenizer.tokenize('ผมชื่อแซม') # 13 tokens

['à¸', 'ľ', 'à¸¡', 'à¸', 'Ĭ', 'à¸', '·', 'à¹Ī', 'à¸Ń', 'à¹ģ', 'à¸', 'ĭ', 'à¸¡']

### Greek

Το όνομα μου είναι Σαμ

In [None]:
redpajama_incite_tokenizer.tokenize('Το όνομα μου είναι Σαμ')

['Î',
 '¤',
 'Î¿',
 'ĠÏĮ',
 'Î½Î¿',
 'Î¼Î±',
 'ĠÎ¼',
 'Î¿Ïħ',
 'ĠÎµÎ¯Î½Î±Î¹',
 'ĠÎ£',
 'Î±',
 'Î¼']

### Spanish

In [None]:
redpajama_incite_tokenizer.tokenize('Me llamo Sam')

['Me', 'Ġllam', 'o', 'ĠSam']

### Chinese

我的名字叫山姆

In [None]:
redpajama_incite_tokenizer.tokenize('我的名字叫山姆')

['æĪĳ', 'çļĦ', 'åĲį', 'åŃĹ', 'åı', '«', 'å±', '±', 'å§', 'Ĩ']