In [1]:
import pandas as pd
import numpy as np

**DATASET PREPROCESSING**

In [49]:
df=pd.read_csv("/content/dataset.csv")
df.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [50]:
df['humor'] = df['humor'].map({True: 1, False: 0})

In [51]:
df

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",0
1,Watch: darvish gave hitter whiplash with slow ...,0
2,What do you call a turtle without its shell? d...,1
3,5 reasons the 2016 election feels so personal,0
4,"Pasco police shot mexican migrant from behind,...",0
...,...,...
199995,Conor maynard seamlessly fits old-school r&b h...,0
199996,How to you make holy water? you boil the hell ...,1
199997,How many optometrists does it take to screw in...,1
199998,Mcdonald's will officially kick off all-day br...,0


In [52]:
df.shape

(200000, 2)

In [53]:
df.isnull().sum()

text     0
humor    0
dtype: int64

In [54]:
df.dropna()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",0
1,Watch: darvish gave hitter whiplash with slow ...,0
2,What do you call a turtle without its shell? d...,1
3,5 reasons the 2016 election feels so personal,0
4,"Pasco police shot mexican migrant from behind,...",0
...,...,...
199995,Conor maynard seamlessly fits old-school r&b h...,0
199996,How to you make holy water? you boil the hell ...,1
199997,How many optometrists does it take to screw in...,1
199998,Mcdonald's will officially kick off all-day br...,0


In [55]:
X=df.drop('humor',axis=1)
y=df['humor']

In [56]:
y.shape

(200000,)

**TEXT PREPROCESSING**

In [57]:
import tensorflow as tf

In [58]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [59]:
voc_size=5000

In [60]:
import nltk
import re
from nltk.corpus import stopwords

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(X)):
    review = re.sub('[^a-zA-Z]', ' ', X['text'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [63]:
corpus

['joe biden rule bid guy run',
 'watch darvish gave hitter whiplash slow pitch',
 'call turtl without shell dead',
 'reason elect feel person',
 'pasco polic shot mexican migrant behind new autopsi show',
 'martha stewart tweet hideou food photo twitter respond accordingli',
 'pokemon master favorit kind pasta wartortellini',
 'nativ american hate rain april bring mayflow',
 'obama climat chang legaci impress imperfect vulner',
 'famili tree cactu prick',
 'donald trump found someth mysteri rudi giuliani',
 'donald trump ted cruz love affair relationship',
 'want know athlet chose takeakne look broken justic system',
 'music candi similar throw away rapper',
 'famou coupl help stay healthi fit',
 'studi find strong link zika guillain barr syndrom',
 'alec baldwin wife hilaria welcom anoth babi boy',
 'trump say iran compli nuclear deal remain danger threat',
 'kim kardashian babi name realiti star discuss k name possibl video',
 'end year relationship fine relationship p',
 'oscar nomi

**CONVERTING TEXT TO VECTOR**

In [64]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[1700, 2962, 1936, 1902, 4026, 2512],
 [3417, 4027, 4470, 4220, 3430, 4391, 3817],
 [3046, 4362, 4384, 4136, 817],
 [2149, 1840, 2652, 2322],
 [4773, 2884, 3104, 747, 3511, 4798, 3225, 2373, 4309],
 [2901, 3603, 3330, 596, 3771, 1819, 1571, 369, 4496],
 [684, 1864, 343, 3679, 3129, 252],
 [4911, 3328, 2373, 3109, 2695, 4606, 2587],
 [4988, 499, 2960, 484, 3825, 154, 4614],
 [1930, 3356, 3501, 1582],
 [3791, 4852, 4335, 3690, 3987, 4804, 157],
 [3791, 4852, 627, 4065, 4637, 3706, 3712],
 [3569, 1380, 112, 1973, 1040, 1297, 2295, 3221, 3855],
 [4269, 4015, 4583, 2113, 596, 737],
 [2139, 4096, 1431, 3962, 2113, 3155],
 [779, 2284, 1444, 4449, 3065, 4324, 4001, 2812],
 [1143, 4140, 1007, 4787, 4551, 678, 49, 1758],
 [4852, 1840, 805, 4791, 99, 628, 472, 234, 486],
 [1810, 4391, 49, 688, 1896, 3199, 1483, 3450, 688, 1799, 2362],
 [1856, 4432, 3712, 2302, 3712, 1170],
 [4812, 3420, 1297, 1525],
 [422, 964, 3321, 422, 4309, 1295, 286],
 [3271, 1460, 394, 415, 2021, 499, 991, 1298, 628],
 [30

In [65]:
onehot_repr[1]

[3417, 4027, 4470, 4220, 3430, 4391, 3817]

**EMBEDDING LAYER AND LSTM**

In [66]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[1700 2962 1936 ...    0    0    0]
 [3417 4027 4470 ...    0    0    0]
 [3046 4362 4384 ...    0    0    0]
 ...
 [4821 1358 3392 ...    0    0    0]
 [3858 4057 3804 ...    0    0    0]
 [1234 2939 1571 ...    0    0    0]]


In [67]:
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
len(embedded_docs),y.shape

(200000, (200000,))

In [69]:
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [70]:
X_final.shape,y_final.shape

((200000, 20), (200000,))

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

**TRAINING THE MODEL**

In [72]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fea07b21f90>

In [73]:
y_pred=model.predict(X_test)
y_pred=np.where(y_pred > 0.6, 1,0)



In [74]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87     32985
           1       0.88      0.85      0.86     33015

    accuracy                           0.87     66000
   macro avg       0.87      0.87      0.87     66000
weighted avg       0.87      0.87      0.87     66000



**SAVING THE MODEL AND MAKING PREDICTION**

In [83]:
import pickle
pickle.dump(model,open('humorPred.pkl','wb'))

In [84]:
pickled_model=pickle.load(open('humorPred.pkl','rb'))

In [85]:
sample="i am a good boy"
ps = PorterStemmer()
A = []
review = re.sub('[^a-zA-Z]', ' ', sample)
review = review.lower()
review = review.split()
review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
review = ' '.join(review)
A.append(review)

sample_repr=[one_hot(words,voc_size)for words in A]
sent_length=20
sample_repr=pad_sequences(sample_repr,padding='post',maxlen=sent_length)
print(sample_repr)

[[4895 1758    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


In [86]:
ans=pickled_model.predict(sample_repr)
ans = True if ans > 0 else False
ans



True