In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [3]:
def parse_dataset():
    clickbait = []
    non_clickbait = []
    with open("data/clickbait_data", 'rt') as data_in:
        for line in data_in:
            if line.strip():
                clickbait.append(line.strip())
                
                
    with open("data/non_clickbait_data", 'rt') as data_in:
        for line in data_in:
            if line.strip():
                non_clickbait.append(line.strip())

    return clickbait, non_clickbait

clickbait, non_clickbait = parse_dataset()

In [4]:
def preprocess_titles(titles): 
    return list(map(lambda x: x.lower(), titles))
    
def create_dataframe(clickbait=clickbait, non_clickbait=non_clickbait):
    cb_df = pd.DataFrame({'clickbait': np.ones(len(clickbait)), 'title': preprocess_titles(clickbait)})
    n_cb_df = pd.DataFrame({'clickbait': np.zeros(len(non_clickbait)), 'title': preprocess_titles(non_clickbait)})
    return pd.concat([cb_df, n_cb_df], ignore_index=True)

titles = create_dataframe()

In [6]:
idf_tokenizer = TfidfVectorizer(max_features=30000, stop_words='english').fit(titles['title'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(titles['title'], titles['clickbait'],
                                                    stratify=titles['clickbait'], 
                                                    test_size=0.25)

In [13]:
train_tokenized = idf_tokenizer.transform(X_train)
test_tokenized = idf_tokenizer.transform(X_test)

print(train_tokenized.shape)
print(test_tokenized.shape)

(24000, 22492)
(8000, 22492)


In [36]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=train_tokenized.shape[1]))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 128)               2879104   
_________________________________________________________________
dense_13 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total para

In [37]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(patience=5)

model.fit(train_tokenized, y_train, validation_split=0.3, epochs=50, batch_size=100, callbacks=[early_stopping])

Train on 16800 samples, validate on 7200 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150


<keras.callbacks.History at 0x7f99401a27f0>

In [38]:
pred = model.predict_classes(test_tokenized)

In [39]:
print(classification_report(y_pred=pred, y_true=y_test))
print(confusion_matrix(y_test, pred))

             precision    recall  f1-score   support

        0.0       0.96      0.94      0.95      4000
        1.0       0.94      0.96      0.95      4000

avg / total       0.95      0.95      0.95      8000

[[3742  258]
 [ 158 3842]]
