The dataset is used for a sentiment-analysis-style text-classification task: classifying news as fake or real, given the columns `text`, `title`, `subject` and `date`.
This notebook gives a solution for identifying fake and real news in a large dataset with the highest possible accuracy. The vocabulary of the dataset is over 150,000 words, which we try to reduce by removing stop words and applying a stemmer. Comments are added wherever they seemed necessary; a detailed explanation is in the README file.

In [None]:
#import all the essential libraries

import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer  # for stemming

In [None]:
# Read the real-news and fake-news CSV files into the `true` and `fake` DataFrames

true, fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fake-and-real-news-dataset/True.csv',
        '/kaggle/input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# Preview the first five rows of the real-news frame to inspect its columns
true.head(n=5)

In [None]:
# Label every row with its class in a new 'category' column: 1 = true news, 0 = fake news
fake = fake.assign(category=0)
true = true.assign(category=1)

In [None]:
# Preview the real-news frame again to confirm the 'category' column was added
true.head(n=5)

In [None]:
# Stack the true and fake frames into a single DataFrame `df`.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# index label would appear twice (once from `true`, once from `fake`).
df = pd.concat([true, fake], ignore_index=True)

In [None]:
# Display the combined DataFrame to confirm that both sources were concatenated
df


In [None]:
# Use the 'category' column as the prediction target, copied out as an int64 NumPy array
target = df['category'].to_numpy(dtype='int64', copy=True)

In [None]:
# seaborn draws the count plots used for exploring the data;
# this first chart shows how many rows fall into each category
import seaborn as sns
sns.countplot(data=df, x='category')

The data is fairly evenly distributed between true and fake news, which helps in building an unbiased model.

In [None]:
# Show how many articles each subject/topic contributes to the data
ax = sns.countplot(x='subject', data=df)
# Rotate the existing tick labels in place so the long subject names do not overlap.
# tick_params only changes the rotation (it does not re-set the label text) and
# returns None, so the cell no longer echoes a list of Text objects.
ax.tick_params(axis='x', rotation=90)


In [None]:
# Break each category's count down by subject to see which topics dominate fake vs. true news
sns.countplot(data=df, x='category', hue='subject')

**Procedure to clean the dataset:**
* Create a function `clean_word` that removes the punctuation marks `.`, `,` and `;`, lower-cases each word, keeps only purely alphabetic or purely numeric words that are not stopwords, and appends the stemmed word to `new`.
* Concatenate the `title` and `subject` columns onto `text` and drop the unnecessary columns.
* Find the longest sentence and pad the rest of the sentences with zeros to match the longest sentence's length.
* Feed the result into a vector that can be used to train and test a model.

In [None]:
# Build the Porter stemmer and the English stop-word set once, up front,
# so that clean_word can reuse them for every row

stemmer = PorterStemmer()
STOPWORDS = {*stopwords.words('english')}

def clean_word(word_list):
    """Return the stemmed, filtered form of a list of raw tokens.

    Each token has every '.', ',' and ';' removed and is lower-cased. It is
    kept only if the result is purely alphabetic or purely numeric and is
    not in the module-level STOPWORDS set; kept tokens are passed through
    the module-level ``stemmer``.

    Parameters
    ----------
    word_list : iterable of str
        Raw tokens of one document.

    Returns
    -------
    list of str
        Stems of the surviving tokens, in their original order.
    """
    # One translation table removes the three punctuation marks in a single pass.
    drop_punctuation = str.maketrans('', '', '.,;')
    normalised = (token.translate(drop_punctuation).lower() for token in word_list)
    # A token that passes isalpha()/isdigit() cannot contain whitespace,
    # so it is handed to the stemmer as-is.
    return [
        stemmer.stem(token)
        for token in normalised
        if (token.isalpha() or token.isdigit()) and token not in STOPWORDS
    ]

In [None]:
# Join 'text', 'title' and 'subject' (separated by single spaces) into one 'text' column,
# then drop the columns that are no longer needed for predicting the target
combined_text = df['text'] + " " + df['title'] + " " + df['subject']
df = df.assign(text=combined_text).drop(columns=['title', 'subject', 'date'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Pull the merged text out of the frame as a NumPy array of strings
text = df['text'].to_numpy()

# Tokenise: split every document on single space characters
text_arr = [document.split(' ') for document in text]

In [None]:

# Clean every tokenised document, then flatten all cleaned rows into one long
# list of stems (duplicates included) from which the vocabulary is derived
clean_text_array = [clean_word(tokens) for tokens in text_arr]
vocab = [stem for cleaned_row in clean_text_array for stem in cleaned_row]

In [None]:
# De-duplicate the stems to obtain the vocabulary, then show how many distinct stems remain
unique_stems = set(vocab)
vocabulary = list(unique_stems)
len(vocabulary)

In [None]:

# Fit a LabelEncoder on the vocabulary so every stem gets an integer id.
vectorizer = LabelEncoder()
vectorizer.fit(vocabulary)

# Build the stem -> id table once from the fitted classes. A dict lookup per
# word yields the same ids as vectorizer.transform(row), without paying the
# encoder's per-call overhead for every single row.
word_to_id = {word: idx for idx, word in enumerate(vectorizer.classes_)}

# Encode every cleaned row as a list of integer ids.
token_vector = [[word_to_id[word] for word in row] for row in clean_text_array]

# max_words is the length of the longest encoded row; every other row is
# later padded to this length. default=0 covers an empty corpus.
max_words = max((len(encoded_row) for encoded_row in token_vector), default=0)

In [None]:
max_words  # show the length of the longest encoded row (the padding width)

In [None]:
# Right-pad each encoded sentence with zeros to the length of the longest sentence.
# The matrix is preallocated as int64 so the ids stay integers even if a row is
# empty (np.pad on an empty list returns float64, which would upcast the whole
# matrix once the rows are stacked).
padded = np.zeros((len(token_vector), max_words), dtype=np.int64)
for row_idx, row in enumerate(token_vector):
    padded[row_idx, :len(row)] = row

In [None]:
# Collect all padded sentences into the 2-D example matrix used as model input

ex_vector = np.asarray(padded)

In [None]:
# Split the data 80:20 into train and test sets (data=ex_vector, target=target).
# random_state pins the shuffle so the split, and therefore every reported
# metric, is reproducible across Restart & Run All.

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(ex_vector, target, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf

Split the data into batches and shuffle the training set.

In [None]:

# Wrap the (features, labels) pairs of each split in a tf.data.Dataset
train_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((xtrain, ytrain), (xtest, ytest))
)

In [None]:

# Batch size and shuffle-buffer size for the input pipeline
BATCH_SIZE, SHUFFLE_BUFFER_SIZE = 64, 100

# Only the training set is shuffled; both sets are grouped into batches
test_dataset = test_dataset.batch(batch_size=BATCH_SIZE)
train_dataset = train_dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE).batch(batch_size=BATCH_SIZE)

In [None]:
# Sanity check: show the Python type of the batched training pipeline
type(train_dataset)

In [None]:
from tensorflow import keras  


In [None]:
embedding_dim = 16

# The Embedding layer needs one row per token id. The ids come from the
# LabelEncoder fitted on `vocabulary`, so they range over 0 .. len(vocabulary)-1.
# Deriving the size here keeps the model in sync with the data instead of
# relying on a hard-coded count.
vocab_size = len(vocabulary)

# Sequential model:
#   Embedding              -> one embedding_dim-sized vector per token id
#   GlobalAveragePooling1D -> averages the token vectors into one vector per sample
#   Dense(32, relu)        -> hidden layer
#   Dense(1, sigmoid)      -> probability that the article is true news (category 1)
model = keras.models.Sequential([
  keras.layers.Embedding(vocab_size, embedding_dim),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(32, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])


In [None]:
# Compile the model: Adam optimiser, binary cross-entropy loss, accuracy as the tracked metric
adam = keras.optimizers.Adam(learning_rate=0.001)
bce_loss = keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam, loss=bce_loss, metrics=['accuracy'])

In [None]:
# Train for five passes over the training batches, evaluating on the test batches after each pass
epochs = 5
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    verbose=2,
)

In [None]:
# Evaluate on each split in turn and print its accuracy as a percentage.
# model.evaluate returns [loss, accuracy]; index 1 is the accuracy.
accuracy_reports = (
    ("Accuracy of the model on Training Data is - ", xtrain, ytrain),
    ("Accuracy of the model on Testing Data is - ", xtest, ytest),
)
for message, features, labels in accuracy_reports:
    print(message, model.evaluate(features, labels)[1]*100)

In [None]:
# Plot training vs. validation accuracy per epoch.
# Labels and a legend are needed to tell the two curves apart.
fig, ax = plt.subplots()
ax.plot(range(epochs), history.history['accuracy'], label='Training')
ax.plot(range(epochs), history.history['val_accuracy'], label='Validation')
ax.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend();


In [None]:
# Plot training vs. validation loss per epoch.
# Labels and a legend are needed to tell the two curves apart.
fig, ax = plt.subplots()
ax.plot(range(epochs), history.history['loss'], label='Training')
ax.plot(range(epochs), history.history['val_loss'], label='Validation')
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax.legend();

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6+).
# For a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5: 1 = true news, 0 = fake news.
pred = (model.predict(xtest) > 0.5).astype("int32")

In [None]:
# Confusion matrix of the true test labels against the predicted labels
cm = confusion_matrix(y_true=ytest, y_pred=pred)
cm

In [None]:
# Wrap the raw matrix in a DataFrame with readable class names on both axes
class_names = ['Fake','Not Fake']
cm = pd.DataFrame(cm, index=list(class_names), columns=list(class_names))

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# `linewidths` is seaborn's documented parameter for the cell borders.
# confusion_matrix(ytest, pred) puts true labels on the rows and predictions
# on the columns, so the axes are labelled accordingly.
plt.figure(figsize = (10,10))
ax = sns.heatmap(cm, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='',
                 xticklabels=['Fake','Not Fake'], yticklabels=['Fake','Not Fake'])
ax.set(title='Confusion matrix', xlabel='Predicted label', ylabel='True label');


In [None]:
# (Keep-alive filler text removed: as code it raised NameError and broke Restart & Run All.)

In [None]:
# (Keep-alive filler text removed: it was typed only to stop the session from idling out
# and, being invalid Python, raised SyntaxError and broke Restart & Run All.)