# Introduction and import main libraries

Hello everyone. I started learning NLP from Hobson Lane's book "Natural Language Processing in Action" and François Chollet's "Deep Learning with Python" (2017), so I decided to take this dataset apart as practice.

In [None]:
from wordcloud import WordCloud
from keras.models import Sequential
from keras import layers
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import string
import seaborn as sns
import re
from nltk.corpus import stopwords
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import plotly.express as px
from itertools import chain
from collections import Counter

In [None]:
# Read the two CSV files of the dataset into `fake` and `real`, in that order.
fake, real = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)

In [None]:
# Tag each frame with its class (0 for `fake`, 1 for `real`), then stack them.
fake = fake.assign(target=0)
real = real.assign(target=1)
df = pd.concat((fake, real))
df.head()

In [None]:
# `original` holds the article body followed by a space and the headline.
df['original'] = df['text'].add(' ').add(df['title'])
df.head()

# Visualization part

Let's look at the distribution of our data, the missing values (there are no missing values), what the news outlets write about (nothing surprising), and the ratio of each subject category.

In [None]:
plt.figure(figsize=(15, 7))
sns.set(style="darkgrid")
# Name the column by keyword: newer seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='target', data=df)

In [None]:
# Heatmap of the boolean missing-value mask of `df`.
plt.figure(figsize=(15, 7))
sns.heatmap(df.isna());

In [None]:
plt.figure(figsize=(15, 7))
# value_counts() orders the classes by frequency, not by label value, so the
# slice labels are derived from its index instead of assuming a fixed order.
target_counts = df['target'].value_counts()
target_names = {0: 'fake news', 1: 'real news'}
labels = [target_names[target] for target in target_counts.index]
colors = ["SkyBlue", "PeachPuff"]
plt.pie(target_counts, labels=labels, colors=colors,
        autopct='%1.2f%%', shadow=True, startangle=140)
plt.show()

In [None]:
plt.figure(figsize=(15, 7))
# Name the column by keyword: newer seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='subject', data=fake)

In [None]:
plt.figure(figsize=(15, 7))
# Name the column by keyword: newer seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='subject', data=real)
plt.show()

In [None]:
plt.figure(figsize=(15, 7))
# Name the column by keyword: newer seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='subject', data=df)

In [None]:
plt.figure(figsize=(15, 15))
# value_counts() sorts subjects by how often they occur, so a hand-written
# label list ends up attached to the wrong slices. Take the labels from the
# same Series that supplies the slice sizes.
subject_counts = df['subject'].value_counts()
labels = list(subject_counts.index)
# One colour per slice; matplotlib cycles through the list if there are more
# slices than colours.
colors = ["SkyBlue", "PeachPuff", 'tomato', 'gray', 'lightyellow', 'pink',
          'lightgreen', 'plum']
plt.pie(subject_counts, labels=labels, colors=colors,
        autopct='%1.f%%', shadow=True, startangle=140)
plt.show()

In [None]:
# str(Series) is only the abbreviated repr (a handful of rows plus the
# "Name: ... dtype: ..." footer). Concatenate every title so the cloud is built
# from the whole column; Series.str.cat skips missing values.
fake_titles = fake['title'].str.cat(sep=' ')
wordcloud = WordCloud(background_color="black").generate(fake_titles)
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# str(Series) is only the abbreviated repr (a handful of rows plus the
# "Name: ... dtype: ..." footer). Concatenate every title so the cloud is built
# from the whole column; Series.str.cat skips missing values.
real_titles = real['title'].str.cat(sep=' ')
wordcloud = WordCloud(background_color="black").generate(real_titles)
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

We will also analyze the time series: mark the date on the X-axis, and group the number of news items published on a particular day on the y-axis. In my opinion, most of the fakes were published during the 2016 election.

In [None]:
# Parse the raw `date` strings before grouping so the x-axis is chronological
# rather than alphabetical. Values that cannot be parsed become NaT and are
# left out of the counts by groupby. Parsing element-wise lets every row be
# interpreted on its own in case the column mixes several date formats.
real_dates = real['date'].apply(lambda raw: pd.to_datetime(raw, errors='coerce'))
# Plot a separate frame instead of overwriting `real`, so the cell can be re-run.
real_per_day = real.groupby(real_dates)['target'].count().to_frame()
fig = px.line(real_per_day)
fig.show()

In [None]:
# Parse the raw `date` strings before grouping so the x-axis is chronological
# rather than alphabetical. Values that cannot be parsed become NaT and are
# left out of the counts by groupby. Parsing element-wise lets every row be
# interpreted on its own in case the column mixes several date formats.
fake_dates = fake['date'].apply(lambda raw: pd.to_datetime(raw, errors='coerce'))
# Plot a separate frame instead of overwriting `fake`, so the cell can be re-run.
fake_per_day = fake.groupby(fake_dates)['target'].count().to_frame()
fig = px.line(fake_per_day)
fig.show()

# Data preprocessing

In the next step, we need to preprocess the dataset. First of all, we convert all characters of the text to lowercase, then remove URLs, punctuation marks, and other non-word characters. (Note that the regular expressions used below keep digits.)

In [None]:
def data_preprocessing(data):
    """Normalize one document to lowercase, space-separated word tokens.

    Each step works on the output of the previous one:
    lowercase -> drop URLs -> replace non-word characters with spaces ->
    collapse repeated spaces -> trim the ends.

    Args:
        data: Raw text of a single document.

    Returns:
        str: The cleaned text. Digits and underscores are kept because the
        ``\\W`` pattern does not match them.
    """
    text = data.lower()
    # URLs are removed first, while their punctuation is still intact.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Newlines, tabs and punctuation are all non-word characters, so this
    # single substitution turns every one of them into a space.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

In [None]:
# apply() returns a new Series; assign it back so the cleaned text is kept.
df['original'] = df['original'].apply(data_preprocessing)

Now I want to tell you about Zipf's law and why we need to remove stop words. Zipf's law is an empirical law of the frequency distribution of natural language words: if all the words of a language are ordered in descending order of their frequency of use, then the frequency of the nth word in such a list will be approximately inversely proportional to its rank n. For example, the second most frequent word occurs about half as often as the first, the third about one third as often as the first, and so on. The top of such a list is dominated by function words that carry little meaning on their own (stop words), which is why we remove them.

In [None]:
# Count every whitespace-separated token across all documents.
# chain.from_iterable streams the per-row token lists instead of unpacking
# tens of thousands of lists as call arguments.
count_word = Counter(chain.from_iterable(df['original'].str.split()))

# most_common(50) yields the 50 most frequent (word, count) pairs directly,
# which avoids building a one-row frame with one column per distinct word.
most_frequency_words = pd.DataFrame(count_word.most_common(50),
                                    columns=['index', 'count'])

plt.figure(figsize=(15, 10))
sns.set(style="darkgrid")
sns.barplot(x="index", y='count', data=most_frequency_words)
plt.xticks(rotation=90)

In [None]:
# Print the list returned by NLTK's stopwords corpus for 'english'.
print(stopwords.words('english'))

In [None]:
# Stored as a set so that `word not in stop` is a constant-time lookup.
stop = set(stopwords.words('english'))
def remove_stopwords(data, stop_words=None):
    """Lowercase *data*, split it on whitespace and drop stop words.

    Args:
        data: Text of a single document.
        stop_words: Optional collection of lowercase words to discard.
            Defaults to the module-level ``stop`` set.

    Returns:
        list[str]: The remaining lowercase tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stop
    # Filter whole lowercase tokens; iterating over the raw string would
    # visit single characters rather than words.
    return [word for word in data.lower().split() if word not in stop_words]

In [None]:
# apply() returns a new Series, so store it back. remove_stopwords returns a
# list of tokens; they are re-joined so the column stays plain text for the
# later steps that call .split() on it.
df['original'] = df['original'].apply(lambda text: " ".join(remove_stopwords(text)))

Usually, texts contain different grammatical forms of the same word or the same root words. Lemmatization is used to reduce the encountered word forms to one (normal form).

Lemmatization is a process that uses morphological analysis and vocabulary to reduce a word to its canonical form-a lemma.

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatization(text):
    """Return *text* with every whitespace-separated token lemmatized.

    Each token is passed to ``lemmatizer.lemmatize`` and the results are
    joined back together with single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

In [None]:
# apply() returns a new Series; assign it back so the lemmatized text is kept.
df['original'] = df['original'].apply(lemmatization)

In [None]:
# Sizes used here and by the Embedding layers of the models.
max_length = 10000
embedding_dim = 16

y = df['target'].values
x = df['original'].values

# Fit the word -> integer index on the texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
word_to_index = tokenizer.word_index
vocab_size = len(word_to_index) + 1

# Convert each text to its index sequence and bring all of them to max_length.
x = pad_sequences(tokenizer.texts_to_sequences(x), maxlen=max_length)



In [None]:
from sklearn.model_selection import train_test_split

# 80% of the samples go to training; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

# Building models 

As advised by François Chollet, I used convolutional neural networks for text classification. The first model uses only convolutional layers. The second one looks more interesting: one strategy for combining the speed and light weight of convolutional networks with the order sensitivity of recurrent networks is to use a one-dimensional convolutional network to pre-process the data before passing it to the recurrent network. The convolutional part turns a long input sequence into a shorter sequence of higher-level features (i.e., it downsamples the sequence), and this sequence of extracted features is then fed to the recurrent part of the network. This technique is not often found in scientific articles and practical applications, perhaps because it is little known. However, it is quite effective and deserves to be used more widely.

Convolutional neural networks are used to expand the receptive field (the perception spot). That is, this is done in order to handle a broader context (longer patterns).

In [None]:
# Convolution-only classifier: embedding -> two Conv1D stages -> sigmoid unit.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    layers.Conv1D(32, 7, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(32, 7, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    batch_size=128,
)

In [None]:
# Per-epoch curves recorded during training of the first model.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.legend()

# Loss curves on a new figure.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.legend()

In [None]:
# Score the test-set predictions; the printed value is the ROC AUC.
predicted_value = model.predict(X_test)
accuracy_value = roc_auc_score(y_true=y_test, y_score=predicted_value)
print(accuracy_value)

In [None]:
# Predictions above 0.5 count as class 1, everything else as class 0.
prediction = [1 if score.item() > 0.5 else 0 for score in predicted_value]
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='g')

In [None]:
# Conv1D front-end followed by an LSTM and a sigmoid output unit.
model_2 = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    layers.Conv1D(32, 5, activation='relu'),
    layers.MaxPooling1D(3),
    layers.Conv1D(32, 5, activation='relu'),
    layers.LSTM(64),
    layers.Dense(1, activation='sigmoid'),
])
model_2.summary()
model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data.
history_2 = model_2.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    batch_size=128,
)

In [None]:
# Per-epoch curves recorded during training of the second model.
acc, val_acc, loss, val_loss = (
    history_2.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# Accuracy curves on the current figure.
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.legend()

# Loss curves on a new figure.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.legend()

In [None]:
# Score the test-set predictions; the printed value is the ROC AUC.
predicted_value = model_2.predict(X_test)
accuracy_value = roc_auc_score(y_true=y_test, y_score=predicted_value)
print(accuracy_value)

In [None]:
# Predictions above 0.5 count as class 1, everything else as class 0.
prediction = [1 if score.item() > 0.5 else 0 for score in predicted_value]
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='g')

If you liked this work, you can upvote it. Suggestions for improvement are also welcome :)