In [49]:
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras import layers
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from sklearn import preprocessing

In [50]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    stops = stopwords.words('english')
    #print(stops)
    porter = PorterStemmer()
    for word in sentence.split():
        if word in stops:
            sentence = sentence.replace(word, '')
        sentence = sentence.replace(word, porter.stem(word))
    return sentence.lower()

In [51]:
nltk.download('stopwords')
df = pd.read_csv('GEN-sarc-notsarc.csv')
df=df.append(pd.read_csv('HYP-sarc-notsarc.csv'))
df=df.append(pd.read_csv('RQ-sarc-notsarc.csv'))
df['Message'] = df['text'].apply(preprocess_text)
df.head(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df=df.append(pd.read_csv('HYP-sarc-notsarc.csv'))
  df=df.append(pd.read_csv('RQ-sarc-notsarc.csv'))


Unnamed: 0,class,id,text,Message
0,notsarc,1,"If that's true, then Freedom of Speech is doom...",if true freedom speech doom harass subjec...
1,notsarc,2,Neener neener - is it time to go in from the p...,neener neener time go playground yet
2,notsarc,3,"Just like the plastic gun fear, the armour pie...",just like plastic gun fear armour pierc bull...
3,notsarc,4,So geology is a religion because we weren't he...,so geolog religion ren see rock form
4,notsarc,5,Well done Monty. Mark that up as your first ev...,well done monti mark first ever honest ac...
5,notsarc,6,But the whole process was done in the courts u...,but whole process de court usg process se...
6,notsarc,7,so we would penalize financially those who wan...,would penal financi want kid
7,notsarc,8,"So, IOW, you are saying that those of us who a...",so iow say us intellig enough realiz ...
8,notsarc,9,your probably referring to the mexican basin c...,probabl refer mexic b crater ms extct prov...
9,notsarc,10,"Not in matters of atheism. Tell me, when scien...",not matter am tell scienc dcov sothg seem...


In [52]:
mes = []
for i in df['Message']:
    mes.append(i.split())
print(mes[:2])

[['if', 'true', 'freedom', 'speech', 'doom', 'harass', 'subject', 'now', 'claim', 'book', 'like', 'harass', 'ban'], ['neener', 'neener', 'time', 'go', 'playground', 'yet']]


In [16]:
word2vec_model = Word2Vec(mes, vector_size=500, window=3, min_count=1, workers=16)
print(word2vec_model)

Word2Vec<vocab=25613, vector_size=500, alpha=0.025>


In [17]:
token = Tokenizer(25613)
token.fit_on_texts(df['Message'])
text = token.texts_to_sequences(df['Message'])
text = pad_sequences(text, 75)
print(text[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   44   94  606  619 3565 3566  308  440
    77  188    8 3566  328]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0 8745
  8745   50   22 4924  102]]


In [18]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['class'])
y = to_categorical(y)
y[:5]


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(np.array(text), y, test_size=0.2, stratify=y,random_state=42)

In [20]:

import keras
keras_model = keras.models.Sequential()
keras_model.add(keras.layers.Embedding(input_dim=len(word2vec_model.wv.key_to_index), output_dim=500, input_length=75, weights=[word2vec_model.wv.vectors],  trainable=True))

keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(200))
keras_model.add(Activation('relu'))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.fit(x_train, y_train, batch_size=16, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3c5570ceb0>

In [21]:
# immediate task transfer learning

In [53]:
imdb_df = pd.read_csv('IMDB Dataset.csv')
imdb_df['Message'] = imdb_df['review'].apply(preprocess_text)
imdb_df.head(10)

Unnamed: 0,review,sentiment,Message
0,One of the other reviewers has mentioned that ...,positive,e review nti watch oz epod hook y r...
1,A wonderful little production. <br /><br />The...,positive,a wder ltl product br br film techniqu unas...
2,I thought this was a wonderful way to spend ti...,positive,i thought wer way spen time o hot summer w...
3,Basically there's a family where a little boy ...,negative,bsic fmily littl boy jke thk zombi clos...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time mey vuy stunng fil...
5,"Probably my all-time favorite movie, a story o...",positive,probabl time fav movi sri selfless sacrific...
6,I sure would like to see a resurrection of a u...,positive,i sure would like see resurrect date seahun...
7,"This show was an amazing, fresh & innovative i...",negative,thi s amaz fresh nov idea first air fir...
8,Encouraged by the positive comments about this...,negative,encourag pos comnt film look ward wch ...
9,If you like original gut wrenching laughter yo...,positive,if like igin gut wrench laughter like movi...


In [23]:
mes = []
for i in df['Message']:
    mes.append(i.split())
for i in imdb_df['Message']:
    mes.append(i.split())
print(mes[:2])
word2vec_model1 = Word2Vec(mes, vector_size=500, window=3, min_count=1, workers=16)
print(word2vec_model1)

[['if', 'true', 'freedom', 'speech', 'doom', 'harass', 'subject', 'now', 'claim', 'book', 'like', 'harass', 'ban'], ['neener', 'neener', 'time', 'go', 'playground', 'yet']]
Word2Vec<vocab=156204, vector_size=500, alpha=0.025>


In [54]:
token1 = Tokenizer(156204)
token1.fit_on_texts(df['Message'].append(imdb_df['Message']))
text1 = token1.texts_to_sequences(imdb_df['Message'])
text1 = pad_sequences(text1, 75)
print(text1[:2])
le = preprocessing.LabelEncoder()
y1 = le.fit_transform(imdb_df['sentiment'])
y1 = to_categorical(y1)
y1[:2]

  token1.fit_on_texts(df['Message'].append(imdb_df['Message']))


[[  152   291 12580  6396   262    19  1220    19  2424  2841   470   167
     33   849    75   169  3087  2361    16  2929    63    16  2058    84
    481  2362  2841   115 32964   221   521  1500   478   374   478   844
   3252  2159   254 21861 18693   935   658    19   207    91 21159   681
    799 18693    32   114   741 32965   581    23   384   720  1033   741
    334   593  2841   145   158  7892  7973  1000    12    28    19   180
     22  4996   232]
 [  560  1758   115 70894   368  2558    40   299   288    18 18276   590
   2251   881  1215  6915  2117    43   412    84 50923   369  1008   251
   2901   356     7   128  2901   204     6    97     1     1  2077   179
    105   441     8   273   582  5098  2159   266    78  4543  1018 25507
   7891  1788    13  8376   120    62  2821 23547   630     8    24 27730
  10772 41169   112   630    23  2454     8 41169 41170 11404     7  2345
    258    43   113]]


array([[0., 1.],
       [0., 1.]], dtype=float32)

In [25]:
# for i in range(len(y1)):
#     if y1[i][0]==0:
#         y1[i][0]=1
#         y1[i][1]=0
#     else:
#         y1[i][1]=0
#         y1[i][0]=1

In [55]:
x_train, x_test, y_train, y_test = train_test_split(np.array(text1), y1, test_size=0.2, stratify=y1, random_state=42)

In [56]:

keras_model = keras.models.Sequential()
keras_model.add(keras.layers.Embedding(input_dim=len(word2vec_model.wv.key_to_index), output_dim=500, input_length=75, weights=[word2vec_model.wv.vectors],  trainable=True))

keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(200))
keras_model.add(Activation('relu'))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(2))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
keras_model.fit(x_train, y_train, batch_size=512, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3c549c93a0>

In [57]:
import pickle

filename = 'finalized_model.sav'
pickle.dump(keras_model, open(filename, 'wb'))

In [58]:
text= token1.texts_to_sequences(df['Message'])
text = pad_sequences(text, 75)
print(text[:2])
le = preprocessing.LabelEncoder()
y1 = le.fit_transform(df['class'])
y1 = to_categorical(y1)
y1[:2]

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0    99   194  2911  1647  3809 15971   504   569  1026   151
     11 15971  3565]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0 68859 68859    30
     47 14688   155]]


array([[1., 0.],
       [1., 0.]], dtype=float32)

In [59]:
for i in range(len(y1)):
    
    y1[i][0], y1[i][1] = y1[i][1], y1[i][0]

In [60]:
y1[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [45]:
x_train, x_test, y_train, y_test = train_test_split(np.array(text), y1, test_size=0.2, stratify=y1, random_state=42)

In [63]:
loaded_model = pickle.load(open("finalized_model.sav", "rb"))

In [64]:
loaded_model.fit(text, y, batch_size=512, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3b4c8181c0>