In [190]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


In [188]:
# Read in clickbait data, set target as 1 (positive for clickbait)
clickbait = pd.read_csv('clickbait_data.gz', compression='gzip', header=None, delimiter='\t', quotechar='"', names=['message', 'target'])
clickbait['target'] = 1

# Read in non-clickbait data, set target as 0 (negative for clickbait)
non_clickbait = pd.read_csv('non_clickbait_data.gz', compression='gzip', header=None, delimiter='\t', quotechar='"', names=['message', 'target'])
non_clickbait['target'] = 0

# Concatenate clickbait and non-clickbait into one dataframe
data = pd.concat([clickbait, non_clickbait])

# Set up X and y and perform train test split
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['target'], test_size=0.33)

# Create a count vectorizer (bag of words)
# Fit and transform vectorizer to text
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', token_pattern='[^\W\d_]+')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Verify
#display(vectorizer.get_feature_names_out())
#display(X_train.shape)


array(['aaa', 'aaevpc', 'aaron', ..., 'zurich', 'zykina', 'złoty'],
      dtype=object)

(21440, 18354)

In [186]:
# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn_param_grid = { 'n_neighbors': range(1, 4) }
knn_model = GridSearchCV(knn, knn_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
knn_model.fit(X_train, y_train)
print("Best parameters for KNN: ", knn_model.best_params_)
print("K-Nearest Neighbors Training Score: ", knn_model.score(X_train, y_train))
print("K-Nearest Neighbors Testing Score: ", knn_model.score(X_test, y_test))


Best parameters for KNN:  {'n_neighbors': 2}
K-Nearest Neighbors Training Score:  0.9979011194029851
K-Nearest Neighbors Testing Score:  0.8794507575757575


In [185]:
print("K-Nearest Neighbors Cross Validation Score: ", knn_model.cv_results_['mean_test_score'][1])
print("K-Nearest Neighbors Cross Validation Standard Deviation: ", knn_model.cv_results_['std_test_score'][1])

K-Nearest Neighbors Cross Validation Score:  0.8783582089552239
K-Nearest Neighbors Cross Validation Standard Deviation:  0.015047020588175428


In [178]:
# Naive Bayes
mnb = MultinomialNB()
mnb_param_grid = { 'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1] }
mnb_model = GridSearchCV(mnb, mnb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
mnb_model.fit(X_train, y_train)
print("Best parameters for Naive Bayes: ", mnb_model.best_params_)
print("Naive Bayes Training Score: ", mnb_model.score(X_train, y_train))
print("Naive Bayes Testing Score: ", mnb_model.score(X_test, y_test))

Best parameters for Naive Bayes:  {'alpha': 0.5}
Naive Bayes Training Score:  0.9819029850746268
Naive Bayes Testing Score:  0.9585227272727272


In [189]:
print("Naive Bayes Cross Validation Score: ", mnb_model.cv_results_['mean_test_score'][4])
print("Naive Bayes Cross Validation Standard Deviation: ", mnb_model.cv_results_['std_test_score'][4])

Naive Bayes Cross Validation Score:  0.9559235074626866
Naive Bayes Cross Validation Standard Deviation:  0.001978843604066819


In [179]:
# Multilayer Perceptron
mlp = MLPClassifier()
mlp_param_grid = { 'hidden_layer_sizes': [[40, 20], [30, 15], [20, 10], [10, 5]]}
mlp_model = GridSearchCV(mlp, mlp_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
mlp_model.fit(X_train, y_train)
print("Best parameters for Multilayer Perceptron: ", mlp_model.best_params_)
print("Multilayer Perceptron Training Score: ", mlp_model.score(X_train, y_train))
print("Multilayer Perceptron Testing Score: ", mlp_model.score(X_test, y_test))

Best parameters for Multilayer Perceptron:  {'hidden_layer_sizes': [30, 15]}
Multilayer Perceptron Training Score:  1.0
Multilayer Perceptron Testing Score:  0.9543560606060606


In [183]:
print("Multilayer Perceptron Cross Validation Score: ", mlp_model.cv_results_['mean_test_score'][1])
print("Multilayer Perceptron Cross Validation Standard Deviation: ", mlp_model.cv_results_['std_test_score'][1])

Multilayer Perceptron Cross Validation Score:  0.950419776119403
Multilayer Perceptron Cross Validation Standard Deviation:  0.0033066365437996307


Report
I used a Tfidf vectorizer to represent the data in order to adjust the weights of words that were used more/less frequently. I chose accuracy as a metric to show how often the prediction was equal to the label. Accuracy is particularly useful in this case because the data is evenly split. Also, because the data is evenly split, simply guessing clickbait every time would result in 50% accuracy. We expect each of the classifiers to do well above 50% in order to be useful.
For the K-Nearest Neighbors classifier, cross validation chose 2 neighbors as its best parameters. K-Nearest Neighbor scored nearly perfect on training data, but there was a significant drop in performance on the testing data, which scored 87.95%. This is consistent with the cv model's score and standard deviation (87.83% +- 1.5% std dv). (The testing score of KNN improved by roughly 26% using a Tfidf vectorizer instead of a count vectorizer.) Because of the discrepancy between training and testing scores, K-Nearest Neighbors would probably not be the best choice in classifying clickbait.
For the Multinomial Naive Bayes classifier, cross validation chose 0.5 as its best alpha parameter. Multinomial Naive Bayes scored well on both training and testing at 98.19% and 95.85% respectively. Testing scores are also very close to the cv model's score and standard deviation (95.59% +- 0.2% std dv).
For the Multilayer Perceptron classifier, cross validation chose hidden layer size: [30, 15] as its best parameter. It scored perfectly on the training data and did well on the testing data with 95.44% accuracy. The test scores are very close to the cv model's score and standard deviation (95.04% +- 0.33%).
The Multinomial Naive Bayes and Multilayer Perceptron had similar scores on the testing data. It's possible their performance could be even more improved by further editing the vocabulary. Both would be useful as a plugin for social media, web browsers, or news sites to prevent unwanted ads.