In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Activation, Flatten
from keras.models import Model
from keras.initializers import Constant
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from gensim.models.keyedvectors import KeyedVectors

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
df_real = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
df_fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

In [None]:
df_real.head()

In [None]:
df_fake.head()

In [None]:
df_real.isnull().sum()

In [None]:
df_fake.isnull().sum()

In [None]:
df_real.head()

In [None]:
df_real['text'][0]

In [None]:
df_real.shape

In [None]:
df_fake.shape

In [None]:
# Attach the class label to each frame before they are concatenated:
# 1 = article loaded from True.csv (real), 0 = article loaded from Fake.csv (fake).
# NOTE(review): the column is named 'sentiment' but it holds the real/fake label.
df_real['sentiment'] = 1
df_fake['sentiment'] = 0

In [None]:
df_real.shape

In [None]:
import re

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The two irregular forms ("won't", "can't") are rewritten first; the
    generic suffix rules are then applied one after another in a fixed order.

    Args:
        phrase: Text to expand.

    Returns:
        The text with every matched contraction spelled out.
    """
    # Order matters: irregular forms first, and "n't" before the bare "'t".
    rewrite_rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rewrite_rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

In [None]:
def get_cleaned_data(input_data, mode='df'):
    """Normalise article text before it is handed to the tokenizer.

    Steps, in order: lowercase, expand contractions, strip @handles / URLs /
    non-alphanumeric characters, drop digits, drop English stopwords, and
    lemmatize every remaining token.

    Args:
        input_data: A DataFrame with a 'text' column when ``mode == 'df'``;
            otherwise a single raw string.
        mode: 'df' (default) to clean a DataFrame; any other value wraps
            ``input_data`` in a one-row DataFrame first.

    Returns:
        A new DataFrame whose 'text' column holds the cleaned text. The
        DataFrame passed in by the caller is left untouched.
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every single word.
    stop = set(stopwords.words('english'))

    if mode != 'df':
        input_df = pd.DataFrame([input_data], columns=['text'])
    else:
        # Work on a copy so re-running a cell never re-cleans a mutated frame.
        input_df = input_data.copy()

    # Missing text becomes an empty string so the regex steps below cannot
    # receive NaN.
    text = input_df['text'].fillna('').astype(str).str.lower()

    text = text.apply(decontracted)

    # Remove @handles, URLs and every character that is neither alphanumeric
    # nor whitespace. All whitespace (including newlines) is preserved here so
    # that words on adjacent lines are not glued together.
    text = text.apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z\s])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))

    # Remove numbers.
    text = text.apply(lambda elem: re.sub(r"\d+", "", elem))

    # Drop stopwords, then lemmatize token by token. Passing the whole
    # document to the lemmatizer would treat it as one (unknown) word and
    # return it unchanged.
    text = text.apply(
        lambda doc: ' '.join(
            wordnet_lemmatizer.lemmatize(word)
            for word in doc.split()
            if word not in stop
        )
    )

    input_df['text'] = text
    return input_df

In [None]:
df_real = get_cleaned_data(df_real)
df_fake = get_cleaned_data(df_fake)

In [None]:
df_real.head()

In [None]:
df_real['text'][0]

In [None]:
data=pd.concat([df_real,df_fake],axis=0,ignore_index=True)

In [None]:
data.tail(10)

In [None]:
data.head()

In [None]:
data.isnull().sum()

In [None]:
data.shape

In [None]:
g=[]

In [None]:
# Copy every cleaned article into the plain Python list `g`.
g.extend(data['text'])

In [None]:
# Measure length in whitespace-separated tokens, the same unit that
# pad_sequences(maxlen=...) works in; len(s) alone would count characters.
# default=0 keeps the cell from raising when `g` is empty.
maxl = max((len(s.split()) for s in g), default=0)
print ('Maximum sequence length in the list of sentences:', maxl)

In [None]:
X=data['text']
Y=data['sentiment']

In [None]:
# Fit a Keras tokenizer on the cleaned article text held in X.
# NOTE(review): num_words=10000 presumably limits only the indices emitted by
# texts_to_sequences; word_index below appears to keep the full vocabulary —
# confirm against the Keras Tokenizer documentation.
tokenizer=Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)

# word -> integer index mapping learned from the corpus
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Padding Sequences

Because the longest article is very long, every sequence is padded or truncated to a fixed length of 2000.

In [None]:
# Build the padded matrix straight from the source column rather than from X,
# so this cell can be re-run after X has been overwritten with an array.
sequences = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(sequences, maxlen=2000)

In [None]:
# One-hot encode the label: column 0 = fake (label 0), column 1 = real (label 1).
# to_categorical always returns a float array, whereas get_dummies yields
# booleans on pandas >= 2.0 and ignores `columns` for Series input.
Y = to_categorical(data['sentiment'].values, num_classes=2)
Y

In [None]:
Y.shape

In [None]:
X.shape

# Splitting the Dataset..

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
X_train

# Loading the pretrained Google News word2vec embeddings

In [None]:
path='/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

In [None]:
 wv_from_bin = KeyedVectors.load_word2vec_format(path, binary=True, limit=500000) 
  #extracting word vectors from google news vector
 embeddings_index = {}
 for word, vector in zip(wv_from_bin.vocab, wv_from_bin.vectors):
      coefs = np.asarray(vector, dtype='float32')
      embeddings_index[word] = coefs

In [None]:
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
vocab_size = len(tokenizer.word_index) + 1

In [None]:
print(vocab_size)

In [None]:
# embedding_matrix = np.zeros((vocab_size, 300))
# for word, i in word_index.items():
#     try:
#         embedding_vector = embeddings_index[word]
#         embedding_matrix[i] = embedding_vector
#     except KeyError:
#         embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),300)

In [None]:
# One 300-dimensional row per tokenizer index (index 0 is never assigned).
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, 300))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        # No pretrained vector for this token: its row stays all zeros.
        continue
    embedding_matrix[row] = pretrained

In [None]:
embedding_matrix

# Model-->

Because the word2vec vectors have 300 dimensions, the embedding layer's output dimension is set to 300.

In [None]:
 model = Sequential()

#Non-trainable embeddidng layer
model.add(Embedding(vocab_size, output_dim=300, weights=[embedding_matrix], input_length=2000, trainable=False))
    
model.add(LSTM(units=128 , return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(units=64))
model.add(Dropout(0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(2, activation='sigmoid'))

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 5 epochs with batches of 256 samples.
# NOTE(review): the held-out test split is also used as validation data here,
# so no untouched evaluation set remains after training.
model.fit(X_train,Y_train,batch_size = 256 , validation_data = (X_test,Y_test) , epochs = 5)

In [None]:
# prediction = model.predict_classes(X_test)
# cf_matrix = confusion_matrix(Y_test,prediction)
# sns.heatmap(cf_matrix, annot=True, fmt='g', xticklabels = ['Fake','Real'] , yticklabels = ['Fake','Real'])

In [None]:
def get_pred_output(text_to_check):
    """Predict the class of a single article.

    The text goes through the same cleaning as the training data, is then
    tokenized and padded to 2000 tokens, and finally scored by the model.

    Args:
        text_to_check: Article text as a plain string.

    Returns:
        A length-1 integer array with the index of the highest-scoring output
        unit: 0 for fake, 1 for real (the labels given to df_fake / df_real).
    """
    # Training texts were passed through get_cleaned_data, so inference must
    # apply the identical preprocessing to stay consistent with the tokenizer.
    cleaned_text = get_cleaned_data(text_to_check, mode='text')['text'].iloc[0]

    sequences = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequences, maxlen=2000)

    # Sequential.predict_classes has been removed from current Keras releases;
    # argmax over predict() returns the same class indices.
    scores = model.predict(padded)
    predicted_val = np.argmax(scores, axis=-1)

    return predicted_val

In [None]:
unseen_real_data = """
Twenty-three more people have tested positive for COVID-19 in Tripura, taking the total number of cases in the state to 232.

The number of active cases stands at 65 while 165 people have recovered and have been discharged and two have migrated to other states.

Chief Minister Biplab Kumar Deb said, among the new cases, 18 people have come from Maharashtra by train.
"""

In [None]:
unseen_fake_data = """
Americans to fund killing babies in abortion that she has been caught trying to add taxpayer financing of abortions to the bill to combat the Coronavirus and provide economic stimulus to the nation as it deals with the COVD-19 outbreak.
Nancy Pelosi has a long history of promoting abortion and her first act after becoming Speaker in 2019 was pushing legislation to use tax money for abortions. So it’s no surprise she is trying to exploit the Coronavirus pandemic to push abortion funding again.
As The Daily Caller reports: House Speaker Nancy Pelosi sought to include a potential way to guarantee federal funding for abortion into the coronavirus economic stimulus plan, according to multiple senior White House officials.
Speaking to the Daily Caller, those officials alleged that while negotiating the stimulus with U.S. Treasury Secretary Steve Mnuchin, Pelosi tried to lobby for “several” provisions that stalled bipartisan commitment to the effort. One was a mandate for up to $1 billion to reimburse laboratory claims, which White House officials say would set a precedent of health spending without protections outlined in the Hyde Amendment.
LifeNews depends on the support of readers like you to combat the pro-abortion media. Please donate now.
“A New mandatory funding stream that does not have Hyde protections would be unprecedented,” one White House official explained. “Under the guise of protecting people, Speaker Pelosi is working to make sure taxpayer dollars are spent covering abortion—which is not only backwards, but goes against historical norms.”
A second White House official referred to the provision as a “slush fund” and yet another questioned “what the Hyde Amendment and abortion have to do with protecting Americans from coronavirus?”
Americans should insist to their members of Congress that we need a clean bill that provides aggressive action to help patients and spur the economy. Killing babies with our tax dollars is not the answer to the coronavirus and the situation should not be exploited for political gain.
"""

In [None]:
# Score the two hand-written samples that were never part of the dataset.
unseen_samples = (
    ('Unseen real data prediction {} ', unseen_real_data),
    ('Unseen fake data prediction {} ', unseen_fake_data),
)
for message, sample in unseen_samples:
    text_to_check = sample
    pred = get_pred_output(text_to_check)
    print(message.format(pred[0]))

In [None]:
data.iloc[1000:1500]

In [None]:
data.iloc[31000:31500]

In [None]:
# Score one real and one fake article taken from the combined dataset itself.
seen_samples = (
    ('Seen Real data prediction {} ', 1500),
    ('Seen Fake data prediction {} ', 31500),
)
for message, row in seen_samples:
    text_to_check = data.text[row]
    pred = get_pred_output(text_to_check)
    print(message.format(pred[0]))

# So our model is predicting quite well..

# Saving the trained model in HDF5 (.h5) format

In [None]:
model.save('final_lstm_model(word2vec).h5')