In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk   
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator
import os


In [2]:
# Directory holding the clickbait dataset, relative to the notebook's working directory.
# The trailing slash matters: the load cell builds the CSV path by plain string concatenation.
DATA_DIR = "../input/clickbait-dataset/"

In [3]:
# Load the labelled headlines. The path is built with os.path.join so the
# result is correct whether or not DATA_DIR ends with a path separator.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'clickbait_data.csv'))

# Show the first rows as a quick sanity check of the columns.
train_df.head()

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [4]:
# Class balance: (rows, columns) of the subset with clickbait == 0, then of the subset with clickbait == 1.
tuple(train_df.loc[train_df['clickbait'] == label].shape for label in (0, 1))

((16001, 2), (15999, 2))

In [5]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# Preprocessing

## Tokenizing

In [6]:
# NLTK tweet tokenizer configured to lower-case tokens, strip @handles and
# shorten exaggerated character repetitions.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

# Replace each raw headline string with its list of tokens.
# NOTE: the column is overwritten in place, so re-running this cell would feed
# the token lists back into the tokenizer; reload train_df before re-running.
train_df['headline'] = train_df['headline'].map(tokenizer.tokenize)

train_df.head()

Unnamed: 0,headline,clickbait
0,"[should, i, get, bings]",1
1,"[which, tv, female, friend, group, do, you, be...",1
2,"[the, new, "", star, wars, :, the, force, awake...",1
3,"[this, vine, of, new, york, on, "", celebrity, ...",1
4,"[a, couple, did, a, stunning, photo, shoot, wi...",1


## Removing stop words

In [13]:
# English stop-word list from NLTK (the 'stopwords' corpus must be available locally),
# held as a set so the per-token membership test in clean_words is a hash lookup.
stopwords_english = set(stopwords.words('english'))
def clean_words(headline):
    """Return the tokens of ``headline`` that are not English stop words.

    Each token is lower-cased only for the lookup in ``stopwords_english``;
    tokens that survive are returned unchanged and in their original order.
    Nothing else is filtered, so punctuation tokens pass through.
    """
    kept = []
    for token in headline:
        if token.lower() in stopwords_english:
            continue
        kept.append(token)
    return kept

In [14]:
# Drop stop words from every tokenized headline (overwrites the column in place).
train_df['headline'] = train_df['headline'].map(clean_words)

In [15]:
# Preview the first rows to inspect the headlines after stop-word filtering.
train_df.head()

Unnamed: 0,headline,clickbait
0,"[get, bings]",1
1,"[tv, female, friend, group, belong]",1
2,"[new, "", star, wars, :, force, awakens, "", tra...",1
3,"[vine, new, york, "", celebrity, big, brother, ...",1
4,"[couple, stunning, photo, shoot, baby, learnin...",1


## Stemming

In [16]:
# Single shared Porter stemmer instance, used by words_stems for every token.
stemmer = PorterStemmer()
def words_stems(headline):
    """Return a new list with each token of ``headline`` passed through ``stemmer.stem``.

    The order and number of tokens are preserved; the input list is not modified.
    """
    stems = []
    for token in headline:
        stems.append(stemmer.stem(token))
    return stems

In [18]:
# Stem every token of every headline (overwrites the column in place).
train_df['headline'] = train_df['headline'].map(words_stems)

In [19]:
# Preview the first rows to inspect the headlines after stemming.
train_df.head()

Unnamed: 0,headline,clickbait
0,"[get, bing]",1
1,"[tv, femal, friend, group, belong]",1
2,"[new, "", star, war, :, forc, awaken, "", traile...",1
3,"[vine, new, york, "", celebr, big, brother, "", ...",1
4,"[coupl, stun, photo, shoot, babi, learn, inope...",1
