In [1]:
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from nltk.stem import WordNetLemmatizer

# Tensorflow libraries
# Tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import text, sequence
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers

import tensorflow_hub as hub


# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score

from gensim.models import Word2Vec # Word2Vec module
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, remove_stopwords, strip_numeric, stem_text


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-content-detection/sample submission.csv
/kaggle/input/fake-news-content-detection/train.csv
/kaggle/input/fake-news-content-detection/test.csv


## Load Dataset

In [2]:
train_data = pd.read_csv("/kaggle/input/fake-news-content-detection/train.csv")
test_data = pd.read_csv("/kaggle/input/fake-news-content-detection/test.csv")
submission_data = pd.read_csv("/kaggle/input/fake-news-content-detection/sample submission.csv")

In [3]:
# Sample data from training data
train_data.sample(3)


Unnamed: 0,Labels,Text,Text_Tag
6674,0,"Because of Democratic tax hikes, New Jersey en...","jobs,taxes"
3891,3,Says the 2011-13 state budget eliminates the s...,state-budget
8783,4,"Says that President Obama said, Nobody made th...","military,veterans"


In [4]:
# Dataset information
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Labels    10240 non-null  int64 
 1   Text      10240 non-null  object
 2   Text_Tag  10238 non-null  object
dtypes: int64(1), object(2)
memory usage: 240.1+ KB


In [5]:
train_data[train_data.duplicated(['Text'])]

Unnamed: 0,Labels,Text,Text_Tag
1014,2,On abortion,"abortion,candidates-biography"
1814,1,On support for gay marriage.,"civil-rights,families,gays-and-lesbians,marriage"
1846,1,"Obama says Iran is a 'tiny' country, 'doesn't ...",foreign-policy
2697,1,On repealing the 17th Amendment,"debates,elections,states"
2846,3,"Four balanced budgets in a row, with no new ta...","job-accomplishments,jobs,state-budget,state-fi..."
3256,1,On a cap-and-trade plan.,"cap-and-trade,climate-change,environment"
4386,1,On the Trans-Pacific Partnership.,trade
4839,2,During Sherrod Browns past decade as a D.C. po...,"economy,job-accomplishments,jobs"
4940,1,On changing the rules for filibusters on presi...,congressional-rules
6759,2,On torture.,"human-rights,terrorism"


In [6]:
train_data = train_data.drop_duplicates(['Text'])

In [7]:
train_data.sample(3)

Unnamed: 0,Labels,Text,Text_Tag
1101,2,"The wages of about 60,562 state and local gove...",economy
2263,1,Says he got lobbied by a woman with late-stage...,"health-care,public-health"
6334,2,We cut taxes 24 times.,taxes


## Split data into Train and Test

In [8]:
train_data['NewsText'] = train_data['Text_Tag'].astype(str) +" "+ train_data['Text']
test_data['NewsText'] = test_data['Text_Tag'].astype(str) +" "+ test_data['Text']

In [9]:
# Stemmer object
wnl = WordNetLemmatizer()

class DataPreprocess:
    
    def __init__(self):
        self.filters = [strip_tags,
                       strip_numeric,
                       strip_punctuation,
                       lambda x: x.lower(),
                       lambda x: re.sub(r'\s+\w{1}\s+', '', x),
                       remove_stopwords]
    def __call__(self, doc):
        clean_words = self.__apply_filter(doc)
        return clean_words
    
    def __apply_filter(self, doc):
        try:
            cleanse_words = set(preprocess_string(doc, self.filters))
            filtered_words = set(wnl.lemmatize(w, 'v') for w in cleanse_words)
            return ' '.join(cleanse_words)
        except TypeError as te:
            raise(TypeError("Not a valid data {}".format(te)))

In [10]:
train_data['Processed'] = train_data['NewsText'].apply(DataPreprocess())
test_data['Processed'] = test_data['NewsText'].apply(DataPreprocess())

# train_data['Processed'] = train_data['Text'].apply(DataPreprocess())
# test_data['Processed'] = test_data['Text'].apply(DataPreprocess())

In [11]:
test_data['Processed']

0       immigration literally years thes border buildi...
1          wisconsin number jobs double pace year layoffs
2       vets record help says veterans military mccain...
3       message bonamici choice cut supportsplan medic...
4       hes byreporter scheme issues scott violate nod...
                              ...                        
1262    says education state history provides budget h...
1263          rights ive crime criminal civil day justice
1264    disarmament secretly history kennedy counter p...
1265    efficiency epa got strickland says new days en...
1266    lower taxes cut fund says talking going higher...
Name: Processed, Length: 1267, dtype: object

In [12]:
X = train_data['Processed']
y = train_data['Labels']

y_category = keras.utils.to_categorical(y, 6)

# Split data into Train and Holdout as 80:20 ratio
X_train, X_valid, y_train, y_valid = train_test_split(X, y_category, shuffle=True, test_size=0.33, random_state=111)

print("Train shape : {}, Holdout shape: {}".format(X_train.shape, X_valid.shape))

Train shape : (6849,), Holdout shape: (3374,)


## Compute class weights

In [13]:
def word_embedding(train, test, max_features, max_len=200):
    try:
        # Keras Tokenizer class object
        tokenizer = text.Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(train)
        
        train_data = tokenizer.texts_to_sequences(train)
        test_data = tokenizer.texts_to_sequences(test)
        
        # Get the max_len
        vocab_size = len(tokenizer.word_index) + 1
        
        # Padd the sequence based on the max-length
        x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
        x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')
        # Return train, test and vocab size
        return tokenizer, x_train, x_test, vocab_size
    except ValueError as ve:
        raise(ValueError("Error in word embedding {}".format(ve)))


In [14]:
max_features = 5000
max_len = 128
output_dim = len(np.unique(y))

# Test data
X_test = test_data['Processed']

tokenizer, x_pad_train, x_pad_valid, vocab_size = word_embedding(X_train, X_valid, max_features)

In [15]:
# Test data
X_test = test_data['Processed']

tokenizer.fit_on_sequences(X_test)

X_test_seq = tokenizer.texts_to_sequences(X_test)
x_pad_test = sequence.pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [16]:
def compute_classweights(target):
    """
    Computes the weights of the target values based on the samples
    :param target: Y-target variable
    :return: dictionary object
    """
    # compute class weights
    class_weights = class_weight.compute_class_weight('balanced',
                                                     np.unique(target),
                                                     target)
    
    # make the class weight list into dictionary
    weights = {}
    
    # enumerate the list
    for index, weight in enumerate(class_weights):
        weights[index] = weight
        
    return weights

# Get the class weights for the target variable
weights = compute_classweights(y)

In [17]:
weights

{0: 1.0307521677757612,
 1: 0.8574903539674551,
 2: 0.8078868342026236,
 3: 0.868859425463199,
 4: 2.0307906237584428,
 5: 1.017821585025886}

In [18]:
X_train.sample(3)

1641    medicaid message nursing raised machine times ...
5148    care exchanges signed people obamacare insuran...
7225                            stimulus economy economic
Name: Processed, dtype: object

In [19]:
def build_rnn(vocab_size, output_dim, max_len):
    # Building RNN model
    model = Sequential([
        keras.layers.Embedding(vocab_size,128,
                              input_length=max_len),
        keras.layers.BatchNormalization(),
#         keras.layers.Bidirectional(keras.layers.LSTM(128,return_sequences=True)),
        keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.002)),
        keras.layers.GlobalMaxPool1D(), # Remove flatten layer
        keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.002)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.002)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(output_dim, activation='softmax')
    ])

    return model

In [20]:
rnn_model = build_rnn(vocab_size, output_dim, max_len)

# Summary of the model
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 128)          1515520   
_________________________________________________________________
batch_normalization (BatchNo (None, 128, 128)          512       
_________________________________________________________________
dense (Dense)                (None, 128, 128)          16512     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2

In [21]:
# Compile the model
rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), 
                  loss=keras.losses.CategoricalCrossentropy(from_logits=True), 
                  metrics=[tf.metrics.AUC()])

In [22]:
history = rnn_model.fit(x_pad_train, 
                        y_train,
                        batch_size=512,
                        epochs=20,
                        verbose=1,
                        validation_data=(x_pad_valid, y_valid),
                       class_weight=weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
results = rnn_model.evaluate(x_pad_valid, y_valid)



In [24]:
y_preds = rnn_model.predict_proba(x_pad_test, batch_size=256)

In [25]:
y_preds[:,0]

array([0.12039722, 0.1146237 , 0.25214398, ..., 0.21181321, 0.16211513,
       0.14615408], dtype=float32)

In [26]:
final_df = pd.DataFrame({'0': y_preds[:,0],
                        '1': y_preds[:,1],
                        '2': y_preds[:,2],
                        '3': y_preds[:,3],
                        '4': y_preds[:,4],
                        '5': y_preds[:,5]}, index=test_data.index)

In [27]:
final_df

Unnamed: 0,0,1,2,3,4,5
0,0.120397,0.194573,0.123343,0.257846,0.104776,0.199065
1,0.114624,0.401864,0.266005,0.065945,0.104306,0.047255
2,0.252144,0.112122,0.107202,0.109996,0.279368,0.139168
3,0.166046,0.265240,0.096905,0.070708,0.305409,0.095692
4,0.216320,0.348999,0.106750,0.074505,0.100769,0.152656
...,...,...,...,...,...,...
1262,0.225529,0.113677,0.144991,0.203735,0.192256,0.119813
1263,0.182773,0.146543,0.097273,0.185975,0.119684,0.267753
1264,0.211813,0.145919,0.136529,0.205962,0.095236,0.204540
1265,0.162115,0.098561,0.081593,0.250268,0.247120,0.160343


In [29]:
final_df.to_csv("fake_news_ann_08.csv", index=False)