In [170]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import os
import re
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
import statsmodels.api as sm
%matplotlib inline

In [3]:
# Read the GloVe text file into a list of raw lines.
# The encoding is pinned to UTF-8 because open()'s default encoding is
# platform-dependent and word-vector vocabularies may contain non-ASCII tokens.
with open('/Users/skylark/Desktop/Misc/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    glove = f.readlines()

In [8]:
# Build a token -> vector lookup from the raw GloVe lines.
# Each line is split once: the first field is the token, the remaining fields are
# parsed as float32 so the dictionary holds numeric vectors rather than string arrays.
glove_weights = {}
for line in glove:
    fields = line.split()
    if not fields:
        # skip blank lines instead of failing on the missing token field
        continue
    glove_weights[fields[0]] = np.asarray(fields[1:], dtype=np.float32)

In [10]:
# Load the Reddit worldnews CSV into a DataFrame and preview its first rows.
worldnews = pd.read_csv('/Users/skylark/Desktop/Misc/world news/reddit_worldnews_start_to_2016-11-22.csv')
worldnews.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [21]:
# Classification label: the over_18 column cast to int32.
target = worldnews['over_18'].astype('int32')

In [40]:
# Working frame that starts with only the raw title text.
x = pd.DataFrame({'title': worldnews['title']})

In [56]:
# pat1: runs of two or more word characters (single-character runs are not matched)
# pat2: any single character that is not a decimal digit
pat1, pat2 = (re.compile(regex) for regex in (r'\w{2,}', r'\D'))

In [59]:
# Lower-case each title, keep only the tokens matched by pat1 and join them with single spaces.
x['updated_title'] = x['title'].map(lambda title: ' '.join(pat1.findall(title.lower())))

In [60]:
# Rebuild each cleaned title from the characters matched by pat2, concatenated with no separator.
x['updated_title'] = x['updated_title'].map(lambda title: ''.join(pat2.findall(title.lower())))

### Text analytics - sklearn

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

In [67]:
# English stop words from NLTK's stopwords corpus; `analyser` filters tokens against this collection.
stop_words = stopwords.words('english')

In [78]:
# Shared Porter stemmer and Treebank tokenizer instances, reused by later cells.
stemmer = PorterStemmer()
toknzr = TreebankWordTokenizer()

In [82]:
def analyser(text):
    """Tokenise `text`, drop tokens found in `stop_words`, and stem what remains.

    The stop-word check is applied to the raw token, before stemming.
    Returns the list of stemmed tokens in their original order.
    """
    kept_stems = []
    for token in toknzr.tokenize(text):
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return kept_stems

In [88]:
# TF-IDF vectoriser that uses `analyser` as its analyzer callable and keeps at most 500 features.
tfidf = TfidfVectorizer(analyzer=analyser, max_features=500)

In [92]:
# Fit the vectoriser on the cleaned titles.
tfidf.fit(x.updated_title)

TfidfVectorizer(analyzer=<function analyser at 0x1e7721320>, binary=False,
                decode_error='strict', dtype=<class 'numpy.float64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=500, min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [95]:
# Feature names of the fitted vectoriser, used as DataFrame column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it and fall
# back to the legacy method on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = list(tfidf.get_feature_names_out())
else:
    features = tfidf.get_feature_names()

In [96]:
# Transform the cleaned titles with the fitted TF-IDF vectoriser.
# Note: despite its name, `vectorizer` holds the transformed matrix, not the vectoriser object.
vectorizer = tfidf.transform(x.updated_title)

In [100]:
# Densify the transformed matrix into a DataFrame with one column per feature name.
# Despite the variable name, the values come from tfidf.transform (TF-IDF weights), not one-hot flags.
# NOTE(review): .todense() materialises every row x feature cell; may be memory-heavy on the full dataset — confirm.
text_one_hot = pd.DataFrame(vectorizer.todense(), columns=features)

In [168]:
# Preview the first rows of the feature table.
text_one_hot.head()

Unnamed: 0,abus,accord,accus,across,action,activist,afghan,afghanistan,africa,african,...,win,without,woman,women,work,worker,world,would,year,yemen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
# NOTE(review): no `stratify` argument is passed — if over_18 is a rare class, consider stratify=target; confirm class balance.
train_x, test_x, train_y, test_y = train_test_split(text_one_hot, target, test_size=0.3, random_state=100)

In [172]:
from sklearn.naive_bayes import BernoulliNB

In [173]:
# Fit a Bernoulli naive Bayes classifier with default settings on the training split.
bnb = BernoulliNB()
bnb.fit(train_x, train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [174]:
# Score the fitted classifier on the held-out split.
# NOTE(review): with an imbalanced label a high score may only reflect the majority class — confirm with per-class metrics.
bnb.score(test_x, test_y)

0.9987693999515614

### Word2Vec

In [135]:
# Tokenise every cleaned title; the result is a list holding one token list per title.
words = list(map(toknzr.tokenize, x.updated_title.to_list()))

In [141]:
# Train a Word2Vec model on the tokenised titles using the library's default hyper-parameters.
# Note: the name `model` is rebound to Keras networks further down the notebook.
model = Word2Vec(sentences=words)

In [142]:
# Word-vector lookup object of the trained Word2Vec model.
wordvec = model.wv

In [154]:
# Vector arithmetic on the learned embeddings: king - man + woman.
# Presumably raises if any of the three words is missing from the trained vocabulary — verify.
vec = wordvec['king'] - wordvec['man'] + wordvec['woman']

In [164]:
# Load pretrained vectors stored in binary word2vec format; limit=10000 restricts how many vectors are read.
model_pretrained = KeyedVectors.load_word2vec_format('/Users/skylark/Desktop/Misc/pretrained embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=10000)

In [191]:
# Check the dimensionality of a pretrained vector.
len(model_pretrained['the'])

300

In [223]:
from keras import callbacks
from keras.preprocessing.text import Tokenizer
from keras import models, layers, regularizers, optimizers
from keras.preprocessing.sequence import pad_sequences

In [176]:
# Keras text tokenizer configured with num_words=10000.
tokenizer = Tokenizer(num_words=10000)

In [177]:
# Build the tokenizer's vocabulary from the cleaned titles.
tokenizer.fit_on_texts(x.updated_title)

In [178]:
# Convert each cleaned title into a sequence of integer word indices.
sequences = tokenizer.texts_to_sequences(x.updated_title)

In [183]:
# Pad/truncate every sequence to length 100 (matches input_length=100 of the Embedding layers).
# Note: this rebinds `sequences`, so re-running the cell pads already-padded data.
sequences = pad_sequences(sequences, maxlen=100)

In [196]:
# Word -> integer index mapping learned by the tokenizer; used to place pretrained vectors in the embedding matrices.
word_idx = tokenizer.word_index

In [185]:
# Same 70/30 split settings as before, now applied to the padded sequences.
# This rebinds train_x/test_x/train_y/test_y, replacing the TF-IDF split.
train_x, test_x, train_y, test_y = train_test_split(sequences, target, test_size=0.3, random_state=100)

In [224]:
# Callbacks passed to both model.fit calls: checkpoint the model to disk and stop early with patience 2.
# NOTE(review): both training runs write to the same checkpoint path, so the later run overwrites the earlier file — confirm intended.
# NOTE(review): the fit calls use epochs=1, so patience=2 presumably never triggers — confirm.
call_backs = [callbacks.ModelCheckpoint('/Users/skylark/Desktop/Misc/world news/model.h5'),
             callbacks.EarlyStopping(patience=2)]

In [229]:
# Start an empty Keras Sequential network; rebinds `model`, which previously held the Word2Vec model.
model = models.Sequential()

In [230]:
# Layer stack: 300-d embedding -> two bidirectional LSTMs -> two ReLU dense layers -> sigmoid output.
for layer in (
    layers.Embedding(10000, 300, input_length=100),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(128)),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model.add(layer)
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_10 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_11 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 129       
Total params: 3,474,945
Trainable params: 3,474,945
Non-trainable params: 0
____________________________________________

In [219]:
# Zero-initialised weight table: 10000 rows by 300 columns.
embedding_matrix = np.zeros(shape=(10000, 300))

In [220]:
# Copy the pretrained vector of every tokenizer word whose index fits in the matrix.
# Words absent from the pretrained vocabulary keep their all-zero row. An explicit
# membership test replaces the former blanket `except Exception: pass`, so genuine
# problems (e.g. a vector of the wrong length) now surface instead of being hidden.
num_rows = embedding_matrix.shape[0]
for word, i in word_idx.items():
    if i < num_rows and word in model_pretrained:
        embedding_matrix[i] = model_pretrained[word]

In [221]:
# Load the pretrained matrix into the first layer (the Embedding) and exclude it from training.
# NOTE(review): the saved execution counters show this cell as In [221], earlier than the cells
# that create and build the network (In [229], In [230]) and the fit (In [231]); the recorded
# summary also lists zero non-trainable parameters. The saved training output may therefore come
# from a network whose embedding was neither loaded nor frozen — re-run top to bottom to confirm.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [231]:
# Compile with RMSprop (2e-4), binary cross-entropy loss and accuracy, then train for one epoch,
# holding out 30% of the training split for validation.
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='binary_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y, batch_size=128, epochs=1, validation_split=0.3, callbacks=call_backs)

Train on 249525 samples, validate on 106940 samples
Epoch 1/1


In [232]:
# Evaluate on the held-out split; the model was compiled with one loss and metrics=['acc'].
model.evaluate(test_x, test_y)



[0.00643130465752787, 0.9992668628692627]

### GloVe

In [239]:
# Weight table for the GloVe run: 10000 rows by 100 columns, zero-initialised.
embedding_matrix_glove = np.zeros((10000, 100))
# Copy the GloVe vector of every tokenizer word whose index fits in the matrix.
# Words without a GloVe entry keep their all-zero row. An explicit lookup replaces the
# former blanket `except Exception: pass`, so malformed vectors raise instead of being
# silently skipped.
for word, i in word_idx.items():
    if i < embedding_matrix_glove.shape[0]:
        glove_vector = glove_weights.get(word)
        if glove_vector is not None:
            # convert explicitly so string-typed entries are parsed as numbers
            embedding_matrix_glove[i] = np.asarray(glove_vector, dtype=embedding_matrix_glove.dtype)

In [241]:
# Start a new empty Sequential network for the GloVe run; rebinds `model` again.
model = models.Sequential()

In [242]:
# Layer stack: 100-d embedding -> two bidirectional LSTMs -> two ReLU dense layers -> sigmoid output.
for layer in (
    layers.Embedding(10000, 100, input_length=100),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(128)),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
):
    model.add(layer)
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 100, 128)          84480     
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 256)               263168    
_________________________________________________________________
dense_13 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_14 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 129       
Total params: 1,372,545
Trainable params: 1,372,545
Non-trainable params: 0
____________________________________________

In [243]:
# Load the GloVe matrix into the first layer (the Embedding) and exclude it from training.
model.layers[0].set_weights([embedding_matrix_glove])
model.layers[0].trainable = False

In [244]:
# Compile with RMSprop (2e-4), binary cross-entropy loss and accuracy, then train for one epoch,
# holding out 30% of the training split for validation.
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='binary_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y, batch_size=128, epochs=1, validation_split=0.3, callbacks=call_backs)

Train on 249525 samples, validate on 106940 samples
Epoch 1/1


In [245]:
# Evaluate the GloVe-initialised network on the held-out split.
# NOTE(review): the recorded accuracy is identical to the previous run's, which may indicate
# both networks predict a single class — confirm with a confusion matrix.
model.evaluate(test_x, test_y)



[0.005637606642458421, 0.9992668628692627]