In [6]:
import pandas as pd

# Load the headlines; change the path here if the CSV is stored somewhere else.
# The 'clickbait' label column is converted to booleans in the same step.
dataset = pd.read_csv('clickbait_data.csv').astype({'clickbait': bool})

# Show the first rows as a quick sanity check
print(dataset.head())

                                            headline  clickbait
0                                 Should I Get Bings       True
1      Which TV Female Friend Group Do You Belong In       True
2  The New "Star Wars: The Force Awakens" Trailer...       True
3  This Vine Of New York On "Celebrity Big Brothe...       True
4  A Couple Did A Stunning Photo Shoot With Their...       True


In [7]:
import re

def add_number_columns(headline: str) -> pd.Series:
    """
    Describe where digits occur in a headline.

    Args:
        headline: The headline text to inspect.

    Returns:
        A three-element boolean Series, in this order:
        no digits anywhere, digits at the very start, digits present but not at the start.
    """
    contains_digits = re.search(r'\d+', headline) is not None
    leads_with_digits = re.match(r'^\d+', headline) is not None

    flags = [
        not contains_digits,
        leads_with_digits,
        contains_digits and not leads_with_digits,
    ]
    return pd.Series(flags)

# Compute the three number flags for every headline and store them as new columns
number_columns = ['NoNumber', 'NumberStart', 'NumberMiddle']
dataset[number_columns] = dataset['headline'].apply(add_number_columns)

In [8]:
def add_special_character_columns(headline: str) -> pd.Series:
    """
    Flag the presence of the special characters '-', '=', "'", and '.' in a headline.

    The original note on this function states that these four were the most frequent
    special characters in clickbait and non-clickbait headlines.

    Args:
        headline: The headline text to inspect.

    Returns:
        A four-element boolean Series, one flag per character in the order listed above.
    """
    tracked_characters = ('-', '=', "'", '.')
    return pd.Series([character in headline for character in tracked_characters])

# Compute the four special-character flags for every headline and store them as new columns
special_character_columns = ['HasMinus', 'HasEquals', 'HasApostrophe', 'HasPeriod']
dataset[special_character_columns] = dataset['headline'].apply(add_special_character_columns)

In [9]:
import nltk

# Download required resources
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('universal_tagset')

def _tags_only(words):
    """Return just the POS tags for a list of word tokens, dropping the words themselves."""
    return [tag for _, tag in nltk.pos_tag(words)]

# Split every headline into word tokens, then reduce each token list to its POS tags
tokens = dataset['headline'].apply(nltk.word_tokenize)
pos_tags = tokens.apply(_tags_only)

# Map POS tags (as returned by nltk.pos_tag) to human-readable descriptions.
# The descriptions are not just labels: they become the names of the boolean feature
# columns built from this mapping, and some are referenced by name later in the
# notebook (e.g. 'cardinal digit'), so changing a description renames a column.
# Tags that are not listed here (punctuation tags, for instance) are ignored downstream.
pos_tags_dict = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': 'adjective',
    'JJR': 'adjective, comparative',
    'JJS': 'adjective, superlative',
    'LS': 'list marker',
    'MD': 'modal',
    'NN': 'noun, singular',
    'NNS': 'noun plural',
    'NNP': 'proper noun, singular',
    'NNPS': 'proper noun, plural',
    'PDT': 'predeterminer',
    'POS': 'possessive ending',
    'PRP': 'personal pronoun',
    'PRP$': 'possessive pronoun',
    'RB': 'adverb',
    'RBR': 'adverb, comparative',
    'RBS': 'adverb, superlative',
    'RP': 'particle',
    'TO': 'to',
    'UH': 'interjection',
    'VB': 'verb, base form',
    'VBD': 'verb, past tense',
    'VBG': 'verb, gerund/present participle',
    'VBN': 'verb, past participle',
    'VBP': 'verb, sing. present, non-3d',
    'VBZ': 'verb, 3rd person sing. present',
    'WDT': 'wh-determiner',
    'WP': 'wh-pronoun',
    'WP$': 'possessive wh-pronoun',
    'WRB': 'wh-adverb'
}

# Build one boolean column per POS description: True when the headline contains that tag.
#
# Each row is derived from the headline's set of tags and the frame is created in one go,
# aligned to `dataset.index`. The previous version wrote cells through `.at[i, ...]` with an
# enumerate() counter; that counter is a position, not an index label, so rows were only
# addressed correctly while `dataset` kept its default 0..n-1 index.
pos_descriptions = list(pos_tags_dict.values())
pos_columns = pd.DataFrame(
    [
        {description: tag in headline_tags for tag, description in pos_tags_dict.items()}
        for headline_tags in map(set, pos_tags)
    ],
    index=dataset.index,
    columns=pos_descriptions,  # fixes column order and keeps the columns even with no rows
).astype(bool)

# Add the POS tag columns to the original dataset.
# Columns left over from an earlier run of this cell are dropped first so that re-running
# the cell does not produce duplicate column names.
dataset = pd.concat(
    [dataset.drop(columns=pos_descriptions, errors='ignore'), pos_columns],
    axis=1,
)

In [10]:
# Number of highest-ranked features to keep (the code always kept 6; earlier comments
# mentioning "top 20" and "top 5" were inaccurate)
N_TOP_FEATURES = 6

# Correlate every feature column with every other; 'headline' is free text and is excluded
correlation_matrix = dataset.drop(columns=['headline']).corr()

# Rank the features by the absolute value of their correlation with the 'clickbait' label
correlation_with_clickbait = correlation_matrix['clickbait'].abs().sort_values(ascending=False)

# The label correlates perfectly with itself, so remove it from the ranking
correlation_with_clickbait = correlation_with_clickbait.drop('clickbait')

# Names of the N_TOP_FEATURES strongest features
top_features = correlation_with_clickbait.index[:N_TOP_FEATURES]
# Legacy name kept so existing references keep working; it holds N_TOP_FEATURES names, not 20
top_20_features = top_features

# Keep only 'headline', 'clickbait' and the selected features.
# NOTE(review): the ranking is computed on all rows, including those that are later held
# out for validation and testing.
dataset = dataset[['headline', 'clickbait'] + list(top_features)]

In [11]:
# Preview the first rows of the reduced dataset: headline, label and the selected features
dataset.head()

Unnamed: 0,headline,clickbait,personal pronoun,NumberStart,"noun, singular",determiner,NoNumber,cardinal digit
0,Should I Get Bings,True,True,False,False,False,True,False
1,Which TV Female Friend Group Do You Belong In,True,True,False,True,False,True,False
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,True,False,True,True,True,False
3,"This Vine Of New York On ""Celebrity Big Brothe...",True,False,False,False,True,True,False
4,A Couple Did A Stunning Photo Shoot With Their...,True,True,False,True,True,True,False


In [12]:
import numpy as np
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, Dense, Input, Concatenate, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Parameters
max_words = 10000      # upper bound on the vocabulary size used by the tokenizer
embedding_dim = 100    # length of each word-embedding vector

# Create the tokenizer and fit it on the headlines
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(dataset['headline'])

# Turn the headlines into integer sequences padded to a common length
sequences = tokenizer.texts_to_sequences(dataset['headline'])
padded = pad_sequences(sequences)

# Additional boolean features, cast to float.
# This stays an unscaled DataFrame: scaling happens after the split (see below), and fitting
# the scaler on a DataFrame lets it remember the column names it was trained with.
extra_features = dataset[['personal pronoun', 'NumberStart', 'noun, singular',
                          'determiner', 'NoNumber', 'cardinal digit']].astype(float)

# Labels
labels = dataset['clickbait']

# Split into training (70%) and a temporary hold-out set (30%)
X_text_train, X_text_temp, X_extra_train, X_extra_temp, y_train, y_temp = train_test_split(
    padded, extra_features, labels, test_size=0.3, random_state=42
)

# Split the hold-out set evenly into validation (15%) and test (15%)
X_text_val, X_text_test, X_extra_val, X_extra_test, y_val, y_test = train_test_split(
    X_text_temp, X_extra_temp, y_temp, test_size=0.5, random_state=42
)

# Standardise the extra features.
# The scaler is fit on the training rows only and then applied to validation and test, so
# no statistics from held-out data leak into preprocessing. (Previously it was fit on the
# full dataset before splitting.)
scaler = StandardScaler()
X_extra_train = scaler.fit_transform(X_extra_train)
X_extra_val = scaler.transform(X_extra_val)
X_extra_test = scaler.transform(X_extra_test)

# **Define the model with two inputs**
# Input 1: text data (padded integer sequences)
input_text = Input(shape=(padded.shape[1],), name="text_input")
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim)(input_text)
# Two parallel branches read the same embedding:
# a 1D convolution followed by global max pooling ...
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
# ... and an LSTM over the embedded sequence
lstm_layer = LSTM(128)(embedding_layer)

# Input 2: additional numeric features
input_extra = Input(shape=(extra_features.shape[1],), name="extra_input")
extra_dense = Dense(32, activation='relu')(input_extra)

# Merge both text branches with the extra-feature branch
concatenated = Concatenate()([pooling_layer, lstm_layer, extra_dense])
dense_layer = Dense(64, activation='relu')(concatenated)
dropout_layer = Dropout(0.3)(dense_layer)
# Single sigmoid unit: the output is read as the probability of the headline being clickbait
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Build and compile the model
model = Model(inputs=[input_text, input_extra], outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with an explicit validation set.
# NOTE(review): in the recorded run below, validation loss rises after the first epoch
# while training loss keeps falling — consider fewer epochs or early stopping.
model.fit(
    [X_text_train, X_extra_train], y_train,
    epochs=7,
    batch_size=32,
    validation_data=([X_text_val, X_extra_val], y_val)
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([X_text_test, X_extra_test], y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Classification report for the test set.
# Predicted probabilities above 0.5 are counted as clickbait (label 1).
predictions = model.predict([X_text_test, X_extra_test])
predicted_labels = (predictions > 0.5).astype(int).flatten()
print(classification_report(y_test, predicted_labels, target_names=['Non-Clickbait', 'Clickbait']))



Epoch 1/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.9193 - loss: 0.2077 - val_accuracy: 0.9798 - val_loss: 0.0541
Epoch 2/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9925 - loss: 0.0233 - val_accuracy: 0.9796 - val_loss: 0.0560
Epoch 3/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9979 - loss: 0.0058 - val_accuracy: 0.9787 - val_loss: 0.0933
Epoch 4/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9995 - loss: 0.0015 - val_accuracy: 0.9792 - val_loss: 0.0991
Epoch 5/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9998 - loss: 4.8460e-04 - val_accuracy: 0.9794 - val_loss: 0.1221
Epoch 6/7
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9994 - loss: 0.0017 - val_accuracy: 0.9760 - val_loss: 0.1143
Epoch 7/7
[1m700/700[0m [3

In [13]:
def predict_clickbait(headline):
    """
    Print the model's clickbait probability for a single headline.

    Rebuilds the six extra features the model was trained with (POS-tag presence and the
    two number flags), converts the text to a padded integer sequence of the training
    length, and passes both inputs to the trained model. Uses the notebook-level
    `pos_tags_dict`, `tokenizer`, `padded`, `scaler` and `model`.

    Args:
        headline: The headline text to score.

    Returns:
        None. The probability is printed as a percentage.
    """
    # Feature columns, in the order the scaler and the model expect them
    feature_names = ['personal pronoun', 'NumberStart', 'noun, singular',
                     'determiner', 'NoNumber', 'cardinal digit']

    # Descriptions of all known POS tags that occur in the headline
    tagged_words = nltk.pos_tag(nltk.word_tokenize(headline))
    found_descriptions = {
        pos_tags_dict[tag] for _, tag in tagged_words if tag in pos_tags_dict
    }

    # POS-based flags first; the two number flags are switched on by the regex checks below
    feature_flags = {name: name in found_descriptions for name in feature_names}
    if re.match(r'^\d+', headline):
        feature_flags['NumberStart'] = True
    if re.search(r'\d+', headline) is None:
        feature_flags['NoNumber'] = True

    # Single-row frame with the training column names, scaled like the training data
    feature_row = pd.DataFrame(feature_flags, index=[0])[feature_names].astype(float)
    scaled_features = scaler.transform(feature_row)

    # Encode the text and pad it to the sequence length used during training
    encoded_headline = tokenizer.texts_to_sequences([headline])
    padded_headline = pad_sequences(encoded_headline, maxlen=padded.shape[1])

    # Predict the probability of clickbait and report it as a percentage
    prediction = model.predict([padded_headline, scaled_features])
    print(f'The model predicts a {prediction[0][0] * 100:.2f}% chance that the headline is clickbait.')

In [20]:

# Try the model on a few sample headlines, mixing news-style and clickbait-style wording
sample_headlines = [
    "10 people dead after a plane crash in the ocean",
    "You won't believe what happened next!",
    "Introduction to Python programming",
    "Why you should never eat at this restaurant again",
]
for sample_headline in sample_headlines:
    predict_clickbait(sample_headline)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The model predicts a 16.38% chance that the headline is clickbait.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
The model predicts a 100.00% chance that the headline is clickbait.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
The model predicts a 2.96% chance that the headline is clickbait.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
The model predicts a 100.00% chance that the headline is clickbait.
