In [1]:
import numpy as np
import re
import itertools
from collections import Counter
import tensorflow as tf


In [11]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order: replace every character other than letters, digits and
    ( ) , ! ? ' ` with a space; split the contraction suffixes 's, 've, n't,
    're, 'd and 'll off the preceding word; surround , ! ( ) ? with spaces;
    collapse runs of whitespace; strip and lower-case.

    Unlike the upstream version, parentheses and question marks are emitted
    as plain tokens. Upstream used backslash-escaped replacement text, which
    the re module copies through verbatim, so a stray backslash ended up in
    front of each of those tokens.

    Args:
        string: Raw sentence (str).

    Returns:
        The cleaned, lower-cased sentence with tokens separated by single
        spaces.
    """
    # Every character outside the allowed set becomes a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Insert a space before contraction suffixes so they become their own tokens.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Surround punctuation with spaces. The replacement text must NOT be
    # escaped: only the pattern is a regex, the replacement is literal text.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse two or more whitespace characters into a single space.
    string = re.sub(r"\s{2,}", " ", string)
    # Drop leading/trailing whitespace, then lower-case.
    return string.strip().lower()

# Toy corpus: three positive and two negative review snippets.
positive_examples = ['the rock is destined to be the 21st century',
                    'effective but too-tepid biopic',
                    'if you sometimes like to go to the movies to have fun']
positive_examples = [s.strip() for s in positive_examples]
negative_examples=['the film provides some great insight into the neurotic mindset',
                  'take care of my cat offers a refreshingly']
negative_examples = [s.strip() for s in negative_examples]
# Positives come first; the label cell builds its rows in the same order.
x_text = positive_examples + negative_examples
# Raw sentences, before cleaning.
print(x_text)
x_text = [clean_str(sent) for sent in x_text]
# Cleaned sentences. The saved output of this cell shows both lists, but the
# second print was missing from the code, so a re-run did not reproduce it.
print(x_text)

['the rock is destined to be the 21st century', 'effective but too-tepid biopic', 'if you sometimes like to go to the movies to have fun', 'the film provides some great insight into the neurotic mindset', 'take care of my cat offers a refreshingly']
['the rock is destined to be the 21st century', 'effective but too tepid biopic', 'if you sometimes like to go to the movies to have fun', 'the film provides some great insight into the neurotic mindset', 'take care of my cat offers a refreshingly']


In [25]:
# One-hot labels: [0, 1] for every positive example, [1, 0] for every negative one.
positive_labels = [[0, 1] for _ in range(len(positive_examples))]
negative_labels = [[1, 0] for _ in range(len(negative_examples))]
print(positive_labels)
# Stack the two label lists row-wise (positives first) into one 2-D array.
y = np.concatenate((positive_labels, negative_labels), axis=0)
print(y)

[[0, 1], [0, 1], [0, 1]]
[[0 1]
 [0 1]
 [0 1]
 [1 0]
 [1 0]]


In [23]:
# np.concatenate demo: joining along rows (axis 0) and along columns (axis 1).
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
# axis defaults to 0, so b is appended as a new row.
c = np.concatenate((a, b))
# b is 1x2; transposing it to 2x1 lets it be appended as a new column.
d = np.concatenate((a, b.transpose()), axis=1)
# A list of arrays works just like a tuple.
e = np.concatenate([a, b], axis=0)
print(c, d, e, sep="\n")

[[1 2]
 [3 4]
 [5 6]]
[[1 2 5]
 [3 4 6]]
[[1 2]
 [3 4]
 [5 6]]


In [26]:
# Run clean_str on one sample sentence (it contains a contraction, a comma and
# a trailing "!" followed by a space) and show the result.
string1 = (
    "the movie's ripe , enrapturing beauty will tempt those "
    "willing to probe its inscrutable mysteries! "
)
string2 = clean_str(string1)
print(string2)

the movie 's ripe , enrapturing beauty will tempt those willing to probe its inscrutable mysteries !


In [33]:
# Number of space-separated tokens in each sentence of x_text.
document_len = list(map(len, (sentence.split(" ") for sentence in x_text)))
print(document_len)
# Length of the longest sentence.
max_len = max(document_len)
print(max_len)

[9, 5, 12, 10, 8]
12


In [36]:
from tensorflow.contrib import learn

In [57]:
# Vocabulary processor configured with max_len as the document length.
vocab = learn.preprocessing.VocabularyProcessor(max_document_length=max_len)
# fit_transform yields one id sequence per document; collect them in a list ...
word_id_rows = list(vocab.fit_transform(x_text))
# ... and convert that list into a single NumPy array.
x = np.array(word_id_rows)
print(x)

[[ 1  2  3  4  5  6  1  7  8  0  0  0]
 [ 9 10 11 12 13  0  0  0  0  0  0  0]
 [14 15 16 17  5 18  5  1 19  5 20 21]
 [ 1 22 23 24 25 26 27  1 28 29  0  0]
 [30 31 32 33 34 35 36 37  0  0  0  0]]


In [41]:
import random

# random.shuffle reorders the list in place, so the same list object is
# printed after each of two successive shuffles.
list1 = [20, 16, 10, 5]
for attempt in range(2):
    random.shuffle(list1)
    print("随机排序列表 : ", list1)

随机排序列表 :  [20, 16, 10, 5]
随机排序列表 :  [10, 20, 5, 16]


In [47]:
import numpy as np
from numpy.random import rand

# Seed NumPy's global generator, then draw five uniform samples from it.
np.random.seed(seed=3)
a = rand(5)
print("first:", a)

first: [0.5507979  0.70814782 0.29090474 0.51082761 0.89294695]


In [49]:
# Re-seeding with the same value restarts the sequence, so these five draws
# repeat the values printed for "first:".
np.random.seed(seed=3)
a = rand(5)
print("second:", a)

second: [0.5507979  0.70814782 0.29090474 0.51082761 0.89294695]


In [50]:
# Random ordering of the integers 0-4. No seed is set in this cell, so the
# result depends on the current state of NumPy's global generator.
b = np.random.permutation(5)
print(b)

[0 4 1 3 2]


In [51]:
# Random ordering of the elements of a list; the result is printed as a NumPy
# array. No seed is set in this cell either.
c = np.random.permutation([1,2,3,4,5,6,7,8,9,0])
print(c)

[4 9 3 2 0 5 1 7 8 6]


In [52]:
# One-argument form: integers from 0 up to, but not including, 3.
aa = np.arange(3)
print(aa)

[0 1 2]


In [53]:
# Two-argument form: integers from start up to, but not including, stop.
bb = np.arange(start=3, stop=9)
print(bb)

[3 4 5 6 7 8]


In [54]:
# Three-argument form: start at 1, stop before 10, step by 2.
cc = np.arange(start=1, stop=10, step=2)
print(cc)

[1 3 5 7 9]


In [60]:
# Show the data in its original order, followed by a separator line.
print(x, y, "______", sep="\n")
# Fixed seed so the permutation below is the same on every run.
np.random.seed(seed=10)
# One permutation of the row indices, applied to both arrays, keeps each row
# of x paired with its row of y.
shuffle_indices1 = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices1]
y_shuffled = y[shuffle_indices1]
print(x_shuffled, y_shuffled, sep="\n")

[[ 1  2  3  4  5  6  1  7  8  0  0  0]
 [ 9 10 11 12 13  0  0  0  0  0  0  0]
 [14 15 16 17  5 18  5  1 19  5 20 21]
 [ 1 22 23 24 25 26 27  1 28 29  0  0]
 [30 31 32 33 34 35 36 37  0  0  0  0]]
[[0 1]
 [0 1]
 [0 1]
 [1 0]
 [1 0]]
______
[[14 15 16 17  5 18  5  1 19  5 20 21]
 [ 1 22 23 24 25 26 27  1 28 29  0  0]
 [ 1  2  3  4  5  6  1  7  8  0  0  0]
 [30 31 32 33 34 35 36 37  0  0  0  0]
 [ 9 10 11 12 13  0  0  0  0  0  0  0]]
[[0 1]
 [1 0]
 [0 1]
 [1 0]
 [0 1]]


In [63]:
import tensorflow as tf

# Define a 6x6 float32 uniform-sampling op with bounds -0.5 and 0.5, then
# evaluate it inside a session and print the sampled matrix.
uniform_op = tf.random_uniform(shape=(6, 6), minval=-0.5, maxval=0.5, dtype=tf.float32)
with tf.Session() as sess:
    print(sess.run(uniform_op))

[[-0.08788133  0.22054613 -0.03172755 -0.13382006  0.26269603  0.35440767]
 [ 0.09093058  0.12063205 -0.3468814  -0.00612164 -0.47694886  0.31444407]
 [-0.35818386  0.23495948  0.4968568  -0.13550162 -0.29231262 -0.49774373]
 [-0.36301184  0.15901637  0.02170622 -0.29145586 -0.2537732   0.20505822]
 [ 0.06371689  0.07469189 -0.08913267  0.01038587 -0.28714263  0.34437072]
 [-0.43197167 -0.15391958  0.3788265  -0.10389376 -0.47904682  0.3731209 ]]


In [2]:
# enumerate() walks an iterable (list, tuple, string, ...) and yields
# (index, item) pairs, giving each element together with its position.
data1 = ['how', 'are', 'you']
print([(position, word) for position, word in enumerate(data1)])

[(0, 'how'), (1, 'are'), (2, 'you')]


In [8]:
# reshape demo: the same eight values arranged as 1-D, 2x4 and 2x2x2 arrays.
arr1 = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(arr1)
# Function form of reshape, taking the target shape as a tuple.
arr2 = np.reshape(arr1, (2, 4))
print(arr2)
# Method form, with the shape passed as a single tuple.
arr3 = arr1.reshape((2, 2, 2))
print(arr3)

[1 2 3 4 5 6 7 8]
[[1 2 3 4]
 [5 6 7 8]]
[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]


SyntaxError: invalid syntax (<ipython-input-10-cd8e3c46e6c7>, line 1)

DuplicateFlagError: The flag 'dev_sample_percentage' is defined twice. First from train.py, Second from train.py.  Description from first occurrence: Percentage of the training data to use for validation