In [1]:
%pip install spacy scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import spacy 
import pandas as pd 
import numpy as np 

# Load the spaCy pipeline that the rest of the notebook uses to embed headlines.
# NOTE(review): "en_core_web_md" is a separate model package and is not pulled
# in by the `%pip install spacy ...` cell above — confirm it is installed
# (e.g. `python -m spacy download en_core_web_md`) before a fresh Run All.
nlp = spacy.load("en_core_web_md")

In [4]:
# Toy corpus: five plain news headlines followed by five clickbait headlines.
normal_headlines = [
    "Stock market rises by 2% today",
    "President to visit Europe next week",
    "New study shows benefits of drinking water",
    "The weather forecast for Sunday",
    "Local team wins the championship",
]
clickbait_headlines = [
    "You won't believe what happened next!",
    "10 things they don't want you to know",
    "This one trick will make you a millionaire",
    "Doctors hate him! See how he did it",
    "SHOCKING revelation about the royal family",
]

# Labels are generated from the list lengths so they always line up
# positionally with the headlines (all "Normal" rows first, then "Clickbait").
data = {
    'headline': normal_headlines + clickbait_headlines,
    'label': (
        ["Normal"] * len(normal_headlines)
        + ["Clickbait"] * len(clickbait_headlines)
    ),
}

df = pd.DataFrame(data)
df

Unnamed: 0,headline,label
0,Stock market rises by 2% today,Normal
1,President to visit Europe next week,Normal
2,New study shows benefits of drinking water,Normal
3,The weather forecast for Sunday,Normal
4,Local team wins the championship,Normal
5,You won't believe what happened next!,Clickbait
6,10 things they don't want you to know,Clickbait
7,This one trick will make you a millionaire,Clickbait
8,Doctors hate him! See how he did it,Clickbait
9,SHOCKING revelation about the royal family,Clickbait


In [5]:
def get_vector(text):
    """Return the spaCy document vector for ``text``.

    The text is passed through the notebook-level ``nlp`` pipeline and the
    resulting ``Doc.vector`` is returned as-is.
    """
    return nlp(text).vector


# Embed each headline once and keep the vector next to its text and label.
headline_vectors = df['headline'].map(get_vector)
df['vector'] = headline_vectors

# Stack the per-headline vectors row-wise into a single 2-D feature matrix;
# the labels stay as the original pandas column.
X = np.stack(df['vector'].tolist())
y = df['label']

print("Shape of Input Matrix:", X.shape)

Shape of Input Matrix: (10, 300)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 

# Hold out 20% of the headlines for a sanity check. `stratify=y` preserves the
# class proportions of `y` in both splits; on a dataset this small an
# unstratified shuffle can leave a split with a skewed class mix depending on
# the seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Default SVC (RBF kernel) trained on the headline vectors.
model = SVC()
model.fit(X_train, y_train)

# Actually use the held-out rows (they were previously created and ignored).
# With so few test headlines this is only a smoke check, not a reliable
# estimate of real-world accuracy.
print(
    f"Held-out accuracy on {len(y_test)} headlines: "
    f"{model.score(X_test, y_test):.2f}"
)

# Keep the fitted estimator as the cell's displayed value, as before.
model

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [7]:
def predict_clickbait(headline):
    """Classify one headline with the notebook-level trained ``model``.

    The headline is embedded with ``get_vector`` and the model's predicted
    label for that single sample is returned.
    """
    # `predict` expects a 2-D (n_samples, n_features) array, so the single
    # vector is reshaped into a one-row matrix before prediction.
    features = get_vector(headline).reshape(1, -1)
    return model.predict(features)[0]

In [8]:
h1 = "Scientists discover new species of frog"
h2 = "This secret ingredient will change your life forever"

# Run each unseen headline through the classifier and print its verdict.
for headline in (h1, h2):
    print(f"'{headline}' -> {predict_clickbait(headline)}")

'Scientists discover new species of frog' -> Normal
'This secret ingredient will change your life forever' -> Clickbait
