In [1]:
import numpy as np 
import pandas as pd 
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,Flatten
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [2]:
# Read the train and test splits with identical parser settings.
# lineterminator='\n' makes the C parser treat only a bare newline as the end of a record.
df_train, df_test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [3]:
# Encode the string sentiment labels as integers (Negative -> 0, Positive -> 1).
# Series.map turns every value missing from the dict into NaN, so mapping the
# already-encoded 0/1 column a second time would wipe out all labels. The dtype
# guard makes re-running this cell a no-op instead.
if not pd.api.types.is_numeric_dtype(df_train['label']):
    df_train['label'] = df_train['label'].map({'Negative':0,'Positive':1})
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


In [4]:
def cleaner(word):
  """Normalise one token: strip punctuation/digits, then lowercase it.

  The substitutions are applied strictly in the order listed below, each one
  operating on the result of the previous one. `word` must be a `str`; the
  cleaned, lower-cased string is returned.
  """
  ordered_rules = (
    (r'\#\.', ''),     # the literal two-character sequence "#."
    (r'\n', ''),       # embedded newlines
    (r',', ''),        # commas
    (r'\-', ' '),      # hyphens become spaces
    (r'\.', ''),       # full stops
    (r'\\', ' '),      # backslashes become spaces
    (r'\\x\.+', ''),   # cannot match any more: dots and backslashes are already gone
    (r'\d', ''),       # digits
    (r'^_.', ''),      # a leading underscore together with the character after it
    (r'_', ' '),       # remaining underscores become spaces
    (r'^ ', ''),       # at most ONE leading space
    (r' $', ''),       # at most ONE trailing space
    (r'\?', ''),       # question marks
  )
  cleaned = word
  for pattern, replacement in ordered_rules:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

def array_cleaner(array):
  """Run `cleaner` over every space-separated token of every sentence.

  `array` is any iterable of strings. Each sentence is split on single spaces,
  every piece is cleaned, and the pieces are glued back together with a space
  placed *before* each one, so every returned sentence starts with a space.
  Returns a new list with one cleaned string per input sentence.
  """
  return [
    ''.join(' ' + cleaner(token) for token in sentence.split(' '))
    for sentence in array
  ]

In [5]:
# Raw review texts for both splits; labels exist only for the training split.
X_train, X_test = df_train['review'], df_test['review']
y_train = df_train['label']

In [6]:
# Replace the raw review Series with plain lists of cleaned strings.
X_train, X_test = array_cleaner(X_train), array_cleaner(X_test)

# Sanity check: the train texts and the labels must have the same length.
print(len(X_train), len(X_test), len(y_train), sep='\n')

6328
2712
6328


In [7]:
# Labels as an int8 NumPy vector.
# (The former bare `y_train[:6]` expression sat in the middle of the cell, where it
# is evaluated and discarded without being displayed, so it has been removed.)
y_train = np.array(y_train).astype('int8')

# Both cleaned text lists are concatenated so that a single tokenizer vocabulary
# covers train and test; `lentrain` remembers where the training part ends so the
# two can be separated again after tokenisation.
X_all = X_train + X_test
lentrain = len(X_train)

In [8]:
maxlen = 120          # every review is padded / truncated to this many token ids
max_features = 5000   # vocabulary cap; also the Embedding input dimension in get_model()

tokenizer = Tokenizer(
    # `num_words` is the current name of this argument (`nb_words` is the deprecated
    # Keras 1 spelling): only the most frequent words are kept when texts are
    # converted to sequences.
    num_words=max_features,
    # Characters stripped from the texts before splitting (this is the Keras default set).
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    # Lower-case all text before tokenising.
    lower=True,
    # Token separator; a single space is the default.
    split=' '
)
# NOTE(review): the vocabulary is fitted on train + test texts together (see X_all).
tokenizer.fit_on_texts(X_all)
X = tokenizer.texts_to_sequences(X_all)
X = pad_sequences(X, maxlen=maxlen)
print(X.shape)

(9040, 120)


In [9]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

# Mini-batch size, passed to both model.fit and model.predict further down.
batch_size = 32

def get_model():
    """Build and compile the GRU sentiment classifier.

    Architecture: token ids of length `maxlen` -> 100-d embedding over
    `max_features` ids -> CuDNNGRU(64) returning the full sequence -> average
    and max pooling over time, concatenated -> single sigmoid unit.
    Reads the module-level `maxlen` and `max_features`. Returns the model
    compiled with binary cross-entropy, Adam, and accuracy as the metric.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, 100)(token_ids)
    gru_states = CuDNNGRU(64, return_sequences=True)(embedded)

    # Summarise the per-timestep GRU outputs two ways and stack the summaries.
    pooled = concatenate([
        GlobalAveragePooling1D()(gru_states),
        GlobalMaxPooling1D()(gru_states),
    ])
    probability = Dense(1, activation="sigmoid")(pooled)

    net = Model(inputs=token_ids, outputs=probability)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net
model = get_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the table.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 120)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 120, 100)     500000      input_1[0][0]                    
__________________________________________________________________________________________________
cu_dnngru_1 (CuDNNGRU)          (None, 120, 64)      31872       embedding_1[0][0]                
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 64)           0           cu_dnngru_1[0][0]                
_____________________________________

In [10]:
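# NOTE: disabled LSTM baseline, kept for reference only. It is written against the
# Keras 1 API (`dropout_U` / `dropout_W` on LSTM, `dropout=` on Embedding), so those
# arguments must be updated before it can run under Keras 2.
# embed_dim = 128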
# lstm_out = 256
# batch_size = 32

# model = Sequential()
# model.add(Embedding(2000,embed_dim, input_length=X.shape[1],dropout = 0.2))
# model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2,return_sequences=True))
# model.add(Flatten())
# model.add(Dense(2,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# # model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics = ['accuracy'])
# print(model.summary())

In [11]:
# Undo the earlier concatenation: the first `lentrain` padded rows are the training
# reviews, everything after that is the test set.
X_train, X_test = X[:lentrain], X[lentrain:]

In [12]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        `(x_val, y_val)` pair: the inputs handed to `model.predict` and the true
        labels handed to `roc_auc_score`. It is unpacked into exactly two values,
        so the empty default raises `ValueError`; a pair must be supplied.
    interval : int
        The score is computed when the 0-based epoch index is a multiple of this.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not `Callback`) in super(): `super(Callback, self)` starts
        # the lookup *after* Callback and therefore never runs Callback.__init__.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.x_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, log=None):
        """Score the validation set; `log` is accepted from Keras but not used."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is 0-based, hence the +1 in the printed message.
            print('\n ROC_AUC - epoch:%d - score:%.6f \n' % (epoch+1, score))
#             print(y_pred[:6])
# train_test_split returns (X_fit, X_val, y_fit, y_val), so despite their names:
#   x_train5 -> training features      y_train5 -> validation features
#   x_label5 -> training labels        y_label5 -> validation labels
x_train5, y_train5, x_label5, y_label5 = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=2019,
)
# The callback is scored on the validation features / validation labels pair.
RocAuc = RocAucEvaluation(interval=1, validation_data=(y_train5, y_label5))

In [13]:
# Fit on the training part of the split (x_train5 / x_label5) and validate on the
# held-out part (y_train5 / y_label5); RocAuc prints the ROC AUC after the epoch.
hist = model.fit(
    x=x_train5,
    y=x_label5,
    batch_size=batch_size,
    epochs=1,
    validation_data=(y_train5, y_label5),
    callbacks=[RocAuc],
    verbose=True,
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 5062 samples, validate on 1266 samples
Epoch 1/1

 ROC_AUC - epoch:1 - score:0.818062 



In [14]:
# Sigmoid outputs for the test reviews: one row per review, a single column.
y_lstm = model.predict(x=X_test, batch_size=batch_size)

In [15]:
# Number of predictions, followed by a peek at the first seven scores.
print(y_lstm.shape[0])
y_lstm[:7, 0]

2712


array([0.18183276, 0.56328124, 0.8693909 , 0.59946   , 0.19375345,
       0.86305207, 0.17830741], dtype=float32)

In [16]:
# Submission table: the test IDs next to the predicted score (first and only output column).
lstm_output = pd.DataFrame({"ID": df_test["ID"], "Pred": y_lstm[:, 0]})
lstm_output.to_csv(
    'lstm_new.csv',
    index=False,
    quoting=3,  # 3 is the value of csv.QUOTE_NONE: never wrap fields in quotes
)