In [1]:
import numpy as np 
import pandas as pd 
import tqdm
import string 

In [2]:
train = pd.read_csv('news_train.csv')
test = pd.read_csv('news_test.csv') 
submission = pd.read_csv('sample_submission.csv') 

In [3]:
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
from sklearn.svm import LinearSVC 
from sklearn.calibration import CalibratedClassifierCV 

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes

In [5]:
import glob
import pandas as pd
from tensorflow import keras
import numpy as np
import os 
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pylab as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, BatchNormalization, concatenate, Input, ConvLSTM2D, Reshape, Conv3D, Flatten, LSTM, GRU, Dense,Dropout, Add
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.models import Sequential, load_model
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import re 

import nltk # for stopwords 
from nltk.corpus import stopwords
import gensim # for Word2Vec embeddings 
from sklearn.feature_extraction.text import CountVectorizer

from konlpy.tag import Mecab, Hannanum

### Extract Meta Features

In [6]:
mecab = Mecab()

In [7]:
train['title_tokenized'] = train['title'].apply(lambda x: mecab.morphs(x))

In [8]:
train['content_tokenized'] = train['content'].apply(lambda x: mecab.morphs(x))

In [9]:
test['title_tokenized'] = test['title'].apply(lambda x: mecab.morphs(x))
test['content_tokenized'] = test['content'].apply(lambda x: mecab.morphs(x))

In [10]:
# number of words in title
train['num_words_title'] = train['title_tokenized'].apply(lambda x: len(x)) 
test['num_words_title'] = test['title_tokenized'].apply(lambda x: len(x))   

# number of words in content 
train['num_words_content'] = train['content_tokenized'].apply(lambda x: len(x)) 
test['num_words_content'] = test['content_tokenized'].apply(lambda x: len(x)) 

In [11]:
# number of unique words in title 
train['title_num_unique'] = train['title_tokenized'].apply(lambda x: len(set(x))) 
test['title_num_unique'] = test['title_tokenized'].apply(lambda x: len(set(x))) 

# number of unique words in content 
train['content_num_unique'] = train['content_tokenized'].apply(lambda x: len(set(x))) 
test['content_num_unique'] = test['content_tokenized'].apply(lambda x: len(set(x))) 

In [12]:
# number of punctuations in text 
train['num_punctuations_title'] = train['title_tokenized'].apply(lambda x: len([c for c in x if c in string.punctuation]))
test['num_punctuations_title'] = test['title_tokenized'].apply(lambda x: len([c for c in x if c in string.punctuation]))

train['num_punctuations_content'] = train['content_tokenized'].apply(lambda x: len([c for c in x if c in string.punctuation]))
test['num_punctuations_content'] = test['content_tokenized'].apply(lambda x: len([c for c in x if c in string.punctuation]))

In [13]:
# [ 또는 (로 시작하는지에 대한 여부 
train["title_startswith_["]=train.title.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(")) 
train["content_startswith_["]=train.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") ) 
test["title_startswith_["]=test.title.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(")) 
test["content_startswith_["]=test.content.apply(lambda x : str(x).startswith("[" ) or str(x).startswith("(") ) 


# ] 또는 )로 시작하는지에 대한 여부 
train["title_endswith_]"]=train.title.apply(lambda x : str(x).endswith("]" ) or str(x).endswith(")"))
train["content_endswith_]"]=train.content.apply(lambda x : str(x).endswith("]" ) or str(x).endswith(")") )
test["title_endswith_]"]=test.title.apply(lambda x : str(x).endswith("]" ) or str(x).endswith(")"))
test["content_endswith_]"]=test.content.apply(lambda x : str(x).endswith("]" ) or str(x).endswith(")") )


# ' 로 시작하는지에 대한 여부 
train["title_startswith_quote"]=train.title.apply(lambda x : str(x).startswith('"'))
train["content_startswith_quote"]=train.content.apply(lambda x : str(x).startswith('"'))
test["title_startswith_quote"]=test.title.apply(lambda x : str(x).startswith('"'))
test["content_startswith_quote"]=test.content.apply(lambda x : str(x).startswith('"'))


# '로 끝나는지에 대한 여부
train["title_endswith_quote"]=train.title.apply(lambda x : str(x).endswith('"'))
train["content_endswith_quote"]=train.content.apply(lambda x : str(x).endswith('"'))
test["title_endswith_quote"]=test.title.apply(lambda x : str(x).endswith('"'))
test["content_endswith_quote"]=test.content.apply(lambda x : str(x).endswith('"'))


# 숫자로 시작하는지에 대한 여부 
train["title_startswith_number"]=train.title.apply(lambda x : str(x)[0].isdigit())
train["content_startswith_number"]=train.content.apply(lambda x : str(x)[0].isdigit())
test["title_startswith_number"]=test.title.apply(lambda x : str(x)[0].isdigit())
test["content_startswith_number"]=test.content.apply(lambda x : str(x)[0].isdigit())


# 숫자로 끝나는지에 대한 여부 
train["title_endswith_number"]=train.title.apply(lambda x : str(x)[-1].isdigit())
train["content_endswith_number"]=train.content.apply(lambda x : str(x)[-1].isdigit())
test["title_endswith_number"]=test.title.apply(lambda x : str(x)[-1].isdigit())
test["content_endswith_number"]=test.content.apply(lambda x : str(x)[-1].isdigit())



In [14]:
# title length 
train["title_length"] = train['title'].apply(lambda x : len(x))
test["title_length"] = test['title'].apply(lambda x : len(x))

In [15]:
# content length 
train['content_length'] = train['content'].apply(lambda x: len(x))
test['content_length'] = test['content'].apply(lambda x: len(x))

In [16]:
# average length of word in title 
train['title_mean_length'] = train['title_tokenized'].apply(lambda x: np.mean([len(w) for w in x])) 
test['title_mean_length'] = test['title_tokenized'].apply(lambda x: np.mean([len(w) for w in x])) 

In [17]:
train.shape, test.shape

((118745, 29), (142565, 29))

### Extract Text Based Features

#### Extract TF-IDF 1-3 ngram features, then apply truncated SVD to incorporate into our dataframe

In [18]:
def mecab_tokenizer(text):  
    tokens_mecab = mecab.morphs(text) 
    return tokens_mecab

In [19]:
# used for singular value decomposition  
tfidf_vec = TfidfVectorizer(tokenizer = mecab_tokenizer, ngram_range = (1,3)) 
full_tfidf_title = tfidf_vec.fit_transform(train['title'].values.tolist() + test['title'].values.tolist())
train_tfidf_title = tfidf_vec.transform(train['title'].values.tolist()) 
test_tfidf_title = tfidf_vec.transform(test['title'].values.tolist())

In [20]:
full_tfidf_title.shape, train_tfidf_title.shape, test_tfidf_title.shape 

((261310, 170537), (118745, 170537), (142565, 170537))

In [21]:
# used for singular value decomposition  
tfidf_vec = TfidfVectorizer(tokenizer = mecab_tokenizer, ngram_range = (1,3)) 
full_tfidf_content = tfidf_vec.fit_transform(train['content'].values.tolist() + test['content'].values.tolist())
train_tfidf_content = tfidf_vec.transform(train['content'].values.tolist()) 
test_tfidf_content = tfidf_vec.transform(test['content'].values.tolist())

In [22]:
full_tfidf_content.shape, train_tfidf_content.shape, test_tfidf_content.shape 

((261310, 2465227), (118745, 2465227), (142565, 2465227))

In [23]:
svd_obj = TruncatedSVD(n_components = 20, algorithm = 'arpack') 
svd_obj.fit(full_tfidf_title)
train_svd_title = pd.DataFrame(svd_obj.transform(train_tfidf_title)) 
test_svd_title = pd.DataFrame(svd_obj.transform(test_tfidf_title))

In [24]:
train_svd_title.columns = ['svd_word_title_' + str(i) for i in range(20)] 
test_svd_title.columns = ['svd_word_title_' + str(i) for i in range(20)] 

train = pd.concat([train, train_svd_title], axis = 1) 
test = pd.concat([test, test_svd_title], axis = 1)  

# del full_tfidf_title, train_tfidf_title, test_tfidf_title, train_svd_title, test_svd_title

In [25]:
svd_obj = TruncatedSVD(n_components = 20, algorithm = 'arpack') 
svd_obj.fit(full_tfidf_content) 
train_svd_content = pd.DataFrame(svd_obj.transform(train_tfidf_content)) 
test_svd_content = pd.DataFrame(svd_obj.transform(test_tfidf_content)) 

In [26]:
train_svd_content.columns = ['svd_word_content_' + str(i) for i in range(20)] 
test_svd_content.columns = ['svd_word_content_' + str(i) for i in range(20)] 

train = pd.concat([train, train_svd_content], axis = 1) 
test = pd.concat([test, test_svd_content], axis = 1)  

# del full_tfidf_content, train_tfidf_content, test_tfidf_content, train_svd_content, test_svd_content

In [27]:
# checking the dataframe shape 
train.shape, test.shape 

((118745, 69), (142565, 69))

In [28]:
# delete unnecessary dataframes 
del train_svd_title, test_svd_title, train_svd_content, test_svd_content, full_tfidf_title, full_tfidf_content

#### Extract Count vectorizer 1-3 ngram, then fit MNB and SVC to add to our dataframe 

In [29]:
cvec_full = CountVectorizer(tokenizer = mecab_tokenizer, ngram_range = (1,3)) 
cvec_full.fit(train['title'].values.tolist() + test['title'].values.tolist()) 
train_cvec_title = cvec_full.transform(train['title'].values.tolist()) 
test_cvec_title = cvec_full.transform(test['title'].values.tolist()) 



In [30]:
train_cvec_title.shape, test_cvec_title.shape

((118745, 170537), (142565, 170537))

In [31]:
cvec_full_content = CountVectorizer(tokenizer = mecab_tokenizer, ngram_range = (1,3)) 
cvec_full_content.fit(train['content'].values.tolist() + test['content'].values.tolist()) 
train_cvec_content = cvec_full_content.transform(train['content'].values.tolist()) 
test_cvec_content = cvec_full_content.transform(test['content'].values.tolist()) 

In [32]:
train_cvec_content.shape, test_cvec_content.shape 

((118745, 2465227), (142565, 2465227))

In [33]:
def runMNB(train_x, train_y, test_x, test_y, test_x2):  
    model = MultinomialNB()
    model.fit(train_x, train_y)
    pred_test_y = model.predict_proba(test_x) 
    pred_test_y2 = model.predict_proba(test_x2) 
    return pred_test_y, pred_test_y2, model

In [34]:
y_train = train['info'] 
y_train = np.asarray(y_train)
y_train.shape

(118745,)

### MNB for tifidf title matrix

In [35]:
cv_scores = [] 
pred_full_test = 0 
pred_train = np.zeros([train.shape[0],2]) 
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 990101) 
for train_idx, val_idx in kf.split(train, y_train):
    train_x, val_x = train_tfidf_title[train_idx], train_tfidf_title[val_idx] 
    train_y, val_y = y_train[train_idx], y_train[val_idx] 
    pred_val_y, pred_test_y, model = runMNB(train_x, train_y, val_x, val_y, test_tfidf_title) 
    pred_full_test = pred_full_test + pred_test_y 
    pred_train[val_idx,:] = pred_val_y   
pred_full_test /= 5.0 


In [36]:
train['nb_tfidf_real_title'] = pred_train[:,0] 
train['nb_tfidf_fake_title'] = pred_train[:,1] 
test['nb_tfidf_real_title'] = pred_full_test[:,0] 
test['nb_tfidf_fake_title'] = pred_full_test[:,1]

### MNB for tfidif content matrix

In [37]:
cv_scores = [] 
pred_full_test = 0 
pred_train = np.zeros([train.shape[0],2]) 
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 990101) 
for train_idx, val_idx in kf.split(train, y_train):
    train_x, val_x = train_tfidf_content[train_idx], train_tfidf_content[val_idx] 
    train_y, val_y = y_train[train_idx], y_train[val_idx] 
    pred_val_y, pred_test_y, model = runMNB(train_x, train_y, val_x, val_y, test_tfidf_content) 
    pred_full_test = pred_full_test + pred_test_y 
    pred_train[val_idx,:] = pred_val_y   
pred_full_test /= 5.0 


In [38]:
train['nb_tfidf_real_content'] = pred_train[:,0] 
train['nb_tfidf_fake_content'] = pred_train[:,1] 
test['nb_tfidf_real_content'] = pred_full_test[:,0]  
test['nb_tfidf_fake_content'] = pred_full_test[:,1]

In [39]:
train.shape, test.shape

((118745, 73), (142565, 73))

### MNB for cvec title

In [40]:
cv_scores = [] 
pred_full_test = 0 
pred_train = np.zeros([train.shape[0],2]) 
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 990101) 
for train_idx, val_idx in kf.split(train, y_train):
    train_x, val_x = train_cvec_title[train_idx], train_cvec_title[val_idx] 
    train_y, val_y = y_train[train_idx], y_train[val_idx] 
    pred_val_y, pred_test_y, model = runMNB(train_x, train_y, val_x, val_y, test_cvec_title) 
    pred_full_test = pred_full_test + pred_test_y 
    pred_train[val_idx,:] = pred_val_y   
pred_full_test /= 5.0 


In [41]:
train['nb_cvec_real_title'] = pred_train[:,0] 
train['nb_cvec_fake_title'] = pred_train[:,1] 
test['nb_cvec_real_title'] = pred_full_test[:,0]  
test['nb_cvec_fake_title'] = pred_full_test[:,1]

### MNB for cvec content 

In [42]:
cv_scores = [] 
pred_full_test = 0 
pred_train = np.zeros([train.shape[0],2]) 
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 990101) 
for train_idx, val_idx in kf.split(train, y_train):
    train_x, val_x = train_cvec_content[train_idx], train_cvec_content[val_idx] 
    train_y, val_y = y_train[train_idx], y_train[val_idx] 
    pred_val_y, pred_test_y, model = runMNB(train_x, train_y, val_x, val_y, test_cvec_content) 
    pred_full_test = pred_full_test + pred_test_y 
    pred_train[val_idx,:] = pred_val_y   
pred_full_test /= 5.0 


In [43]:
train['nb_cvec_real_content'] = pred_train[:,0] 
train['nb_cvec_fake_content'] = pred_train[:,1] 
test['nb_cvec_real_content'] = pred_full_test[:,0]  
test['nb_cvec_fake_content'] = pred_full_test[:,1] 

In [44]:
print("Final shape of the dataframe") 
train.shape, test.shape

Final shape of the dataframe


((118745, 77), (142565, 77))

### Train LightGBM Model 

We will begin training at this step. We use the light gbm model for now as it is fast and also pretty powerful. 

In [46]:
x_train = train.drop(['n_id','info','title','content','title_tokenized','content_tokenized'], axis = 1) 
y_train = train['info'] 

x_test = test.drop(['n_id','id','title','content','title_tokenized','content_tokenized'], axis = 1) 

In [47]:
x_train = np.asarray(x_train) 
y_train = np.asarray(y_train) 
x_test = np.asarray(x_test) 

x_train.shape, y_train.shape, x_test.shape 

((118745, 71), (118745,), (142565, 71))

In [48]:
# save dataframe 
np.save('x_train_latest.npy',x_train) 
np.save('y_train_latest.npy',y_train) 
np.save('x_test_latest.npy',x_test)

In [49]:
k = 5 
models = [] 
kfold = StratifiedKFold(n_splits = k, shuffle = True, random_state = 990101) 
for n_fold, (train_idx, val_idx) in enumerate(kfold.split(x_train, y_train)): 
    train_x, val_x = x_train[train_idx], x_train[val_idx]
    train_y, val_y = y_train[train_idx], y_train[val_idx] 
    
    params = {'learning_rate': 0.01,
              'max_depth': 16, 
              'objective': 'binary',
              'metric': 'binary_logloss',
              'is_training_metric': True,
              'num_leaves': 128,
              'feature_fraction': 0.9,
              'bagging_fraction': 0.75, 
              'bagging_freq': 5,
              'seed': 960418} 
    
    train_ds = lgbm.Dataset(train_x, label = train_y) 
    val_ds = lgbm.Dataset(val_x, label = val_y) 
    model = lgbm.train(params, train_ds, 1000, val_ds, verbose_eval = 10, early_stopping_rounds = 100) 
    models.append(model) 

[LightGBM] [Info] Number of positive: 37546, number of negative: 57450
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13493
[LightGBM] [Info] Number of data points in the train set: 94996, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.395238 -> initscore=-0.425348
[LightGBM] [Info] Start training from score -0.425348
Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.583261
[20]	valid_0's binary_logloss: 0.511219
[30]	valid_0's binary_logloss: 0.450686
[40]	valid_0's binary_logloss: 0.399565
[50]	valid_0's binary_logloss: 0.355451
[60]	valid_0's binary_logloss: 0.317338
[70]	valid_0's binary_logloss: 0.284186
[80]	valid_0's binary_logloss: 0.255177
[90]	valid_0's binary_logloss: 0.229712
[100]	valid_0's binary_logloss: 0.207269
[110]	valid_0's binary_logloss: 0.187406
[120]	valid_0's binary_logloss: 0.169756


[670]	valid_0's binary_logloss: 0.0113049
[680]	valid_0's binary_logloss: 0.01119
[690]	valid_0's binary_logloss: 0.0111126
[700]	valid_0's binary_logloss: 0.0110243
[710]	valid_0's binary_logloss: 0.0109368
[720]	valid_0's binary_logloss: 0.0108497
[730]	valid_0's binary_logloss: 0.0107712
[740]	valid_0's binary_logloss: 0.0107122
[750]	valid_0's binary_logloss: 0.0106566
[760]	valid_0's binary_logloss: 0.0105969
[770]	valid_0's binary_logloss: 0.0105451
[780]	valid_0's binary_logloss: 0.0104796
[790]	valid_0's binary_logloss: 0.0104215
[800]	valid_0's binary_logloss: 0.0103993
[810]	valid_0's binary_logloss: 0.0103787
[820]	valid_0's binary_logloss: 0.0103234
[830]	valid_0's binary_logloss: 0.0102729
[840]	valid_0's binary_logloss: 0.0102306
[850]	valid_0's binary_logloss: 0.010212
[860]	valid_0's binary_logloss: 0.0101782
[870]	valid_0's binary_logloss: 0.0101578
[880]	valid_0's binary_logloss: 0.010137
[890]	valid_0's binary_logloss: 0.0100807
[900]	valid_0's binary_logloss: 0.0100

[310]	valid_0's binary_logloss: 0.0391982
[320]	valid_0's binary_logloss: 0.0370112
[330]	valid_0's binary_logloss: 0.0350093
[340]	valid_0's binary_logloss: 0.0331881
[350]	valid_0's binary_logloss: 0.0315374
[360]	valid_0's binary_logloss: 0.0300673
[370]	valid_0's binary_logloss: 0.0287043
[380]	valid_0's binary_logloss: 0.0274822
[390]	valid_0's binary_logloss: 0.0263705
[400]	valid_0's binary_logloss: 0.0253763
[410]	valid_0's binary_logloss: 0.0244338
[420]	valid_0's binary_logloss: 0.0234847
[430]	valid_0's binary_logloss: 0.022652
[440]	valid_0's binary_logloss: 0.0219171
[450]	valid_0's binary_logloss: 0.0212138
[460]	valid_0's binary_logloss: 0.0206162
[470]	valid_0's binary_logloss: 0.0200923
[480]	valid_0's binary_logloss: 0.0195468
[490]	valid_0's binary_logloss: 0.0190835
[500]	valid_0's binary_logloss: 0.0186559
[510]	valid_0's binary_logloss: 0.0182476
[520]	valid_0's binary_logloss: 0.0178784
[530]	valid_0's binary_logloss: 0.0175164
[540]	valid_0's binary_logloss: 0.0

In [50]:
pred1 = models[0].predict(x_test) 
pred2 = models[1].predict(x_test) 
pred3 = models[2].predict(x_test) 
pred4 = models[3].predict(x_test) 
pred5 = models[4].predict(x_test) 

In [51]:
pred_avg = (pred1 + pred2 + pred3 + pred4 + pred5)/5.0  

In [52]:
class_pred = np.where(pred_avg > 0.5, 1, 0).reshape(-1) 

In [54]:
submission['info'] = class_pred

In [57]:
submission.to_csv('lightgbm_71_features.csv', index = False) 