In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [47]:
# Load our data into two parallel tuples: headline text and its label.
# Each non-empty line of clickbait.txt is expected to be "<headline>\t<label>".
# NOTE(review): the file is assumed to be UTF-8 — confirm. The encoding is
# spelled out so the notebook does not depend on the platform default.
with open("clickbait.txt", encoding="utf-8") as f:
    rows = [
        # Split on the LAST tab only, so a tab inside a headline cannot
        # produce a third field and break the two-way unpack below.
        line.rsplit("\t", 1)
        for line in f.read().strip().split("\n")
        if line.strip()  # skip blank lines instead of emitting a 1-field row
    ]
# zip(*rows) transposes [[headline, label], ...] into (headlines, labels).
headlines, labels = zip(*rows)

In [48]:
# Peek at the first five headlines to sanity-check the parsing
headlines[:5]

("Egypt's top envoy in Iraq confirmed killed",
 'Carter: Race relations in Palestine are worse than apartheid',
 'After Years Of Dutiful Service, The Shiba Who Ran A Tobacco Shop Retires',
 'In Books on Two Powerbrokers, Hints of the Future',
 'These Horrifyingly Satisfying Photos Of "Baby Foot" Will Haunt You')

In [49]:
# Matching labels for those headlines; they are kept as the strings read from
# the file ('0' / '1'). Presumably '1' marks clickbait — TODO confirm.
labels[:5]

('0', '0', '1', '0', '1')

In [50]:
# How big is our dataset? (number of headline/label pairs loaded)
len(headlines)

10000

In [51]:
# Break dataset into train and test sets: first 80% train, remaining 20% test.
# The split point is derived from the dataset size rather than a hard-coded
# 8000, so the ratio still holds if the data file grows or shrinks. For a
# 10,000-row dataset this is exactly 8000 / 2000.
# NOTE(review): the split is positional (no shuffling), so it relies on the
# file not being sorted by label — confirm.
n_train = len(headlines) * 4 // 5

train_headlines, test_headlines = headlines[:n_train], headlines[n_train:]
train_labels, test_labels = labels[:n_train], labels[n_train:]

In [52]:
# Create a vectorizer and classifier.
# TfidfVectorizer turns raw headline text into TF-IDF weighted term vectors;
# LinearSVC is the linear support-vector classifier trained on them.
# random_state is pinned so any data shuffling done by the solver is the same
# on every Restart & Run All, keeping the fitted model reproducible.
vectorizer = TfidfVectorizer()
svm = LinearSVC(random_state=42)

In [53]:
# Transform our text data into numerical vectors.
# fit_transform is called on the training headlines only; the test headlines
# go through transform, so they are encoded with the vectorizer state fitted
# on the training set and never influence it.
train_vectors = vectorizer.fit_transform(train_headlines)
test_vectors = vectorizer.transform(test_headlines)

In [54]:
# Train the classifier on the training vectors/labels, then predict a label
# for every held-out test vector. The labels are strings, so the predictions
# come back as strings as well.
svm.fit(train_vectors, train_labels)
predictions = svm.predict(test_vectors)

In [55]:
# First five held-out headlines, for comparing by eye with the predictions
test_headlines[:5]

('The Earliest I\'ve Said "I Love You"',
 "Stop What You're Doing And Worship These Matt Bomer Pictures",
 '23 Of The Funniest "Nancy Drew" Game Memes',
 'Policeman killed in football-related violence in Italy',
 'Do You Remember Which Disney Star Sang These Lyrics')

In [56]:
# Predicted labels for the first five test headlines
predictions[:5]

array(['1', '1', '1', '0', '1'],
      dtype='<U1')

In [57]:
# True labels for the same five test headlines
test_labels[:5]

('1', '1', '1', '0', '1')

In [58]:
# Fraction of test headlines whose predicted label matches the true label
accuracy_score(test_labels, predictions)

0.96099999999999997

In [59]:
# Classify two hand-written headlines that are not in the dataset, reusing the
# already-fitted vectorizer so they are encoded in the same feature space the
# classifier was trained on.
new_headlines = [
    "10 Cities That Every Hipster Will Be Moving To Soon",
    'Vice President Mike Pence Leaves NFL Game Saying Players Showed "Disrespect" Of Anthem, Flag',
]
new_vectors = vectorizer.transform(new_headlines)
new_predictions = svm.predict(new_vectors)

In [60]:
# Predicted label for each of the new headlines, in the order they were given
new_predictions

array(['1', '0'],
      dtype='<U1')