In [None]:
import tensorflow as tf
import pathlib
from tensorflow.keras import preprocessing

In [None]:
# Archive with the Stack Overflow questions used throughout this notebook.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

# Download the archive (reusing a cached copy when one exists) and unpack it.
dataset = tf.keras.utils.get_file(
    fname='stack_overflow_16k.tar.gz',
    origin=data_url,
    untar=True,  # extract the tarball after downloading
    cache_subdir='',  # sub-directory of the cache to store the file in
    cache_dir='stack_overflow')

# The folder that contains the returned path is treated as the dataset root.
dataset_dir = pathlib.Path(dataset).parent

In [None]:
# Show the entries directly under the dataset root.
[*dataset_dir.iterdir()]

[PosixPath('/tmp/.keras/test'),
 PosixPath('/tmp/.keras/stack_overflow_16k.tar.gz.tar.gz'),
 PosixPath('/tmp/.keras/README.md'),
 PosixPath('/tmp/.keras/train')]

In [None]:
# Path of the training split; the listing below shows one entry per class folder.
train_dir = dataset_dir.joinpath('train')
[*train_dir.iterdir()]

[PosixPath('/tmp/.keras/train/python'),
 PosixPath('/tmp/.keras/train/csharp'),
 PosixPath('/tmp/.keras/train/javascript'),
 PosixPath('/tmp/.keras/train/java')]

In [None]:
# Pick a single document from the python class to inspect its raw text.
sample_file = train_dir/'python/1755.txt'
# Decode explicitly as UTF-8 (assumed encoding of the dataset files) so the
# result does not depend on the platform's default locale encoding.
with open(sample_file, encoding='utf-8') as f:
  print(f.read())

why does this blank program print true x=true.def stupid():.    x=false.stupid().print x



In [None]:
# Batch size and seed, shared by every dataset loader in this notebook.
batch_size, seed = 32, 88

# text_dataset_from_directory expects one sub-folder per class:
#
#   main_directory/
#   ...class_a/
#   ......a_text_1.txt
#   ......a_text_2.txt
#   ...class_b/
#   ......b_text_1.txt
#   ......b_text_2.txt
#
# Parameter notes:
#   directory:        where the data lives
#   labels:           with 'inferred', the folder names are used as labels
#   label_mode:       one of int, categorical, binary
#   class_names:      explicit label names, or leave them to be inferred
#   batch_size:       defaults to 32
#   max_length:       maximum length of a text string
#   shuffle:          whether to shuffle the data
#   seed:             random seed
#   validation_split: fraction between 0 and 1 to hold out; commonly 0.2, 0.3 or 0.4
#   subset:           which part to return, only "training" or "validation"
#   follow_links:     defaults to False; whether to visit sub-directories through links

# Load the training portion (80%) of the files under train_dir.
raw_train_ds = preprocessing.text_dataset_from_directory(
    directory=train_dir,
    validation_split=0.2,
    subset='training',  # this dataset is the training part of the split
    seed=seed,
    batch_size=batch_size)


Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [None]:
# Preview up to ten examples (first 100 bytes of each question) from one training batch.
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor to NumPy once instead of on every loop iteration.
  batch_texts = text_batch.numpy()
  batch_labels = label_batch.numpy()
  # Slicing caps the preview at ten items without failing on a smaller batch.
  for question, label in zip(batch_texts[:10], batch_labels[:10]):
    print("Question: ", question[:100], '...')
    print("Label:", label)

Question:  b'can object method have own propertis? for function i can make this:..uniqueinteger.counter = 0;..fun' ...
Label: 2
Question:  b'"getting cards in hand blank i am currently writing a modified version of gin rummy. my problem is i' ...
Label: 1
Question:  b'"blank.net excel reading and writing i have two times in the format..""hours:mins:secs"" and and i w' ...
Label: 0
Question:  b'"writing column entry just one below another in blank i have imported a file containing 8 columns, e' ...
Label: 3
Question:  b'"extract id from json string i have the following json string. what is the best way to extract the i' ...
Label: 0
Question:  b'"ifs and arraylists is there a way to do an if statement where one number is compared to an entire a' ...
Label: 1
Question:  b'"clarification on filter() function; how it works so, i am self-learning html, css, blank. i was goi' ...
Label: 2
Question:  b'"blank - \'too many values to unpack\', and \'list has no attribute \'iteritems\'\' i am tr

In [None]:
# Show which class name each integer label stands for.
class_names = raw_train_ds.class_names
for index, class_name in enumerate(class_names):
  print("Label", index, "corresponds to", class_name)

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python


In [None]:
# Load the held-out 20% of train_dir; the split fraction and seed are the same
# values used for the training loader.
raw_val_ds = preprocessing.text_dataset_from_directory(
    directory=train_dir,
    validation_split=0.2,
    subset='validation',  # this dataset is the validation part of the split
    seed=seed,
    batch_size=batch_size)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [None]:
# Preview up to ten examples (first 50 bytes of each question) from one validation batch.
for text_batch, label_batch in raw_val_ds.take(1):
  # Convert each tensor to NumPy once instead of on every loop iteration.
  batch_texts = text_batch.numpy()
  batch_labels = label_batch.numpy()
  # Slicing caps the preview at ten items without failing on a smaller batch.
  for question, label in zip(batch_texts[:10], batch_labels[:10]):
    print("Question: ", question[:50], '...')
    print("Label:", label)

Question:  b'"i have trouble with static main method and inputt' ...
Label: 1
Question:  b'"having trouble with this quadratic equation class' ...
Label: 1
Question:  b'"blank error: import module not found i am trying ' ...
Label: 3
Question:  b'"swap the letters of the input string swap the cas' ...
Label: 3
Question:  b'"how to address instance of anonymous class from i' ...
Label: 1
Question:  b'"checking for multiple values in object what would' ...
Label: 2
Question:  b'"blank: no overload for \'listview_itemdrag\' matche' ...
Label: 0
Question:  b'"add new columns to comma delimited text file blan' ...
Label: 3
Question:  b'"blank making a txt file .  possible duplicate:.  ' ...
Label: 1
Question:  b'"lib/class which can execute methods written as st' ...
Label: 1
