In [58]:
import pandas as pd
import numpy as np

In [59]:
train_df = pd.read_csv('train.csv',usecols=['text','sentiment'])
train_df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [60]:
test_df = pd.read_csv('test.csv',usecols=['text','sentiment'])
test_df.text

0       Last session of the day  http://twitpic.com/67ezh
1        Shanghai is also really exciting (precisely -...
2       Recession hit Veronique Branquinho, she has to...
3                                             happy bday!
4                  http://twitpic.com/4w75p - I like it!!
                              ...                        
3529    its at 3 am, im very tired but i can`t sleep  ...
3530    All alone in this old house again.  Thanks for...
3531     I know what you mean. My little dog is sinkin...
3532    _sutra what is your next youtube video gonna b...
3533     http://twitpic.com/4woj2 - omgssh  ang cute n...
Name: text, Length: 3534, dtype: object

In [61]:
sentiment = {'neutral':0,'positive':1,'negative':2}
train_df['senti_cat'] = train_df.sentiment.apply(lambda x : sentiment[x])
train_df.head()

Unnamed: 0,text,sentiment,senti_cat
0,"I`d have responded, if I were going",neutral,0
1,Sooo SAD I will miss you here in San Diego!!!,negative,2
2,my boss is bullying me...,negative,2
3,what interview! leave me alone,negative,2
4,"Sons of ****, why couldn`t they put them on t...",negative,2


In [62]:
test_df['senti_cat'] = test_df.sentiment.apply(lambda x : sentiment[x])
test_df.head()

Unnamed: 0,text,sentiment,senti_cat
0,Last session of the day http://twitpic.com/67ezh,neutral,0
1,Shanghai is also really exciting (precisely -...,positive,1
2,"Recession hit Veronique Branquinho, she has to...",negative,2
3,happy bday!,positive,1
4,http://twitpic.com/4w75p - I like it!!,positive,1


In [63]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import string
from nltk.stem import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
def pre_process(tweet):
  tweet = tweet.strip().lower()
  tweet = re.sub(r'http[s]?://\S+','',tweet)  #/https?\:(\\\\|\/\/)(www.)?/,'' re.sub('http[s]?://\S+', '', text)
  tweet = re.sub(r'(\w)\1+', r'\1',tweet)
  tweet = re.sub(r'[!.?]+','',tweet)
  p_tweet = []
  for t in tweet.split():
    if t not in string.punctuation and t not in stop_words:
      p_tweet.append(ps.stem(t))
  return p_tweet


In [65]:
#test
tweet = 'Sooooo higghhhh'
print(pre_process(tweet))

['high']


In [66]:
vocab = {'__PAD__':0,'__UNK__':1}
for row in train_df.text.to_list():
  process = pre_process(str(row))
  for word in process:
    if word not in vocab:
      vocab[word] = len(vocab)

print(len(vocab))

27461


In [67]:
#test
print(vocab['like'])

58


In [68]:
def text_to_tensor(tweet, unk):
  process = pre_process(str(tweet))
  tensor = []
  for word in process:
    if word in vocab:
      tensor.append(vocab.get(word,unk))
  return tensor

In [69]:
#test
tweet = 'its at 3 am, im very tired but i can`t sleep'
print(text_to_tensor(tweet,vocab['__UNK__']))

[591, 2283, 153, 421, 119, 169]


In [70]:
!pip install -q -U trax

In [71]:
import random
import trax
import trax.fastmath.numpy as fastnp
from trax import layers as tl
random.seed(32)
#trax.supervised.trainer_lib.init_random_number_generators(32)

In [72]:
def datagenerator(data,y,batch_size,vocabdict,UNK = "__UNK__",shuffle = True):
  index = 0
  len_index_lines = len(data)
  index_lines = list(range(len_index_lines))
  if shuffle:
    random.shuffle(index_lines)
  
  X = []
  Y = []
  while True:
    if index >= len_index_lines:
      index = 0
    if shuffle:
      random.shuffle(index_lines)
    x = data[index_lines[index]]
    X.append(text_to_tensor(x,vocabdict[UNK]))
    y_ = y[index_lines[index]]
    Y.append(y_)
    index += 1
    
    if batch_size == len(X):
      max_len = 0 
      for i in range(batch_size):
        if max_len < len(X[i]):
          max_len = len(X[i])
      pad_X = []
      for i in range(batch_size):
        padded = X[i] + [0]*(max_len-len(X[i]))
        pad_X.append(padded)
      
      final_X = fastnp.array(pad_X)
      final_y = fastnp.array(Y)
      final_mask = fastnp.ones_like(final_y)

      yield final_X,final_y,final_mask
      X = []
      Y = []

In [73]:
# divide the data
limit = int(len(train_df)*0.8)
print(int(len(train_df)*0.8))
train_x = list(train_df.text[:limit])
train_y = list(train_df.senti_cat[:limit])
#train_y = list(tl.one_hot(fastnp.array(temp_y),3))
eval_x = list(train_df.text[limit:])
eval_y = list(train_df.senti_cat[limit:])
#eval_y = list(tl.one_hot(fastnp.array(eval_temp_y),3))
print(train_x[0:5])
print(train_y[0:5])

21984
[' I`d have responded, if I were going', ' Sooo SAD I will miss you here in San Diego!!!', 'my boss is bullying me...', ' what interview! leave me alone', ' Sons of ****, why couldn`t they put them on the releases we already bought']
[0, 2, 2, 2, 2]


In [74]:
#test
x, y,z = next(datagenerator(eval_x,eval_y,4,vocab))
print(x)
print(y)
print(z)

[[ 2322   645  2181   428   364   658    13  6875   519   306]
 [  274   733    54   472  4579  2856 24779  3043     0     0]
 [ 2778   169   324   269  1324     0     0     0     0     0]
 [  571 25470  7511  2299  1490  1321     0     0     0     0]]
[1 0 2 0]
[1 1 1 1]


In [75]:
import itertools
batch_size = 16

train_generator = itertools.cycle(datagenerator(train_x,train_y,batch_size,vocab))

eval_generator = itertools.cycle(datagenerator(eval_x,eval_y,batch_size,vocab))



In [76]:
def tweetModel(vocab_size=27461,d_model=50,n_layers=3):
  model = tl.Serial(tl.Embedding(vocab_size,d_model),tl.Mean(axis=1),tl.Dense(n_units = n_layers),tl.LogSoftmax())
  return model

In [77]:
display(tweetModel())

Serial[
  Embedding_27461_50
  Mean
  Dense_3
  LogSoftmax
]

In [78]:
from trax.supervised import training

def train_model(model,train_generator,eval_generator,n_steps = 1,outputdir = '/model'):
  train_task = training.TrainTask(train_generator,loss_layer = tl.CrossEntropyLoss(), optimizer = trax.optimizers.Adam(0.01),n_steps_per_checkpoint=100)

  eval_task = training.EvalTask(eval_generator,metrics = [tl.CrossEntropyLoss(),tl.Accuracy()],n_eval_batches=10)

  train_loop = training.Loop(model,train_task,eval_tasks=[eval_task],output_dir=outputdir)
  train_loop.run(n_steps)
  return train_loop

In [None]:
train_len = int(len(train_df)*0.8)
n_steps = train_len/batch_size
train_loop = train_model(tweetModel(),train_generator,eval_generator,13740)  # 10 * n_steps

In [80]:
model = tweetModel()
model.init_from_file('/model/model.pkl.gz')
#model.init_from_file('model.pkl.gz')

In [86]:
test_x = list(test_df.text)
test_y = list(test_df.senti_cat)

In [87]:
def compute_accuracy(test_generator,model):
  test_x,label,mask = test_generator
  ypred = model(test_x)
  y_pred = np.argmax(ypred,axis = -1)
  batchac = np.sum(y_pred == label)
  total = len(label) 
  return batchac/total

In [92]:
test_gen = next(datagenerator(test_x,test_y,len(test_y),vocab,shuffle=False))
acc = compute_accuracy(test_gen,model)
print(acc)

3534
0.60780984


In [89]:
# prediction
sent = {0:'neutral',1:'positive',2:'negative'}
def predict(test_gen,model):
  test_x,label,mask = test_generator
  ypred = model(test_x)
  y_pred = np.argmax(ypred,axis = -1)
  print(y_pred)
  print("Predicted   Actual")
  for pred,truth in zip(list(y_pred),list(label)):
    print(sent[pred]," ",sent[truth])
  


In [94]:
test_gen = next(datagenerator(test_x,test_y,4,vocab,shuffle=False))
predict(test_gen,model)

[0 1 2 1]
Predicted   Actual
neutral   neutral
positive   positive
negative   negative
positive   positive


In [105]:
text  = "My heart goes out to the Malaysian people. This is such a tragedy. Words can't express how sad it is. I wish we could just have peace. #MH17"
inputs = np.array(text_to_tensor(x,vocab['__UNK__']))
inputs = inputs[None,:]
pred = model(inputs)
pred_sent = np.argmax(pred,axis=-1)
print(sent[pred_sent[0]])

negative
