<a href="https://colab.research.google.com/github/sania-azhmee22/CSE470/blob/main/CSE422_lab_project_fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive

In [None]:
# Attach Google Drive to the Colab runtime; the value returned by mount() is kept in `dv`.
dv = drive.mount(
    "/content/drive"
)

Mounted at /content/drive


DATA PREPROCESSING

1. DATA LOADING

In [None]:
# data loading

import urllib
import json

def load_convert_data(url):
    """
    Download a JSON document and convert it into a pandas dataframe.

    url: address of the JSON resource (anything urllib.request.urlopen accepts).

    Returns the dataframe built by pd.DataFrame.from_dict from the decoded JSON.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # Use a distinct name for the response so the `url` argument is not shadowed.
    with urllib.request.urlopen(url) as response:
        payload = json.loads(response.read().decode("utf-8"))

    return pd.DataFrame.from_dict(payload)

Given:

There are 4 files:

1. Training set of real news.

2. Testing set of real news.

3. Training set of fake news.

4. Testing set of fake news.

In [None]:
# Real news data
# Download the train and test splits of the real-news set, in that order.
real_train, real_test = (
    load_convert_data(source_url)
    for source_url in (
        "https://storage.googleapis.com/public-resources/dataset/real_train.json",
        "https://storage.googleapis.com/public-resources/dataset/real_test.json",
    )
)

In [None]:
# Fake news data
# Download the train and test splits of the fake-news set, in that order.
fake_train, fake_test = (
    load_convert_data(source_url)
    for source_url in (
        "https://storage.googleapis.com/public-resources/dataset/fake_train.json",
        "https://storage.googleapis.com/public-resources/dataset/fake_test.json",
    )
)

In [None]:
real_train.head()

Unnamed: 0,url,title,text
0,https://www.thetimes.co.uk/edition/scotland/sc...,Scots GPs told not to meet fever patients as f...,Scots GPs told not to meet fever patients as f...
1,https://www.bbc.com/news/world-africa-52103799,Coronavirus : Fighting al - Shabab propaganda ...,Coronavirus: Fighting al-Shabab propaganda in ...
2,https://www.thetimes.co.uk/edition/business/en...,Engineer fears China virus impact,Engineer fears China virus impact\nA British e...
3,https://www.theguardian.com/world/live/2020/fe...,Coronavirus : South Korean PM vows swift act...,Here’s a summary of what’s happened so far on ...
4,https://yle.fi/uutiset/osasto/news/finnair_iss...,Finnair issues profit warning over Covid - 19 ...,Finnair issues profit warning over Covid-19 fe...


In [None]:
fake_train.head()

Unnamed: 0,url,title,text
0,https://nabd.com/s/71539812-b7228b/%D9%86%D8%B...,Online Facts New conspiracy theory: #Bel_Gates...,Roger Stone suggested on Monday that Bill Gate...
1,https://shamra.sy/news/article/8eb73454931e6d1...,Revolutionary Guards: Corona could be an Ameri...,Source\nRussia Today |\nIranian Revolutionary ...
2,https://sudanewsnow.com/19800/,Yellow skin is the host environment of the vir...,Sudan news now from all sources sudanewsnow.co...
3,https://arabic.rt.com/press/1100276-%D8%A7%D9%...,China and Russia are doing what the European U...,China and Russia are doing what the European U...
4,https://www.kachaf.com/details.php?n=5e8957fe1...,,Fatal error: Uncaught MongoDB\Driver\Exception...


2. DATA PREPROCESSING BY NLP

In [None]:
# Label convention for this section: 0 = real news, 1 = fake news.
fake_train['label'] = 1
fake_test['label'] = 1
real_train['label'] = 0
real_test['label'] = 0

# Stack real rows first, then fake rows, and renumber the index from 0.
train = pd.concat(
    [real_train, fake_train],
    ignore_index=True,
)
test = pd.concat(
    [real_test, fake_test],
    ignore_index=True,
)

In [None]:
import re
def clean_txt(text):
    """
    Normalise one news text for word-level processing.

    Apostrophes are deleted (so "don't" becomes "dont"), every run of
    non-word characters is collapsed to a single space, and the result is
    lower-cased.

    text: raw text. None or a float NaN is treated as an empty string; any
          other non-string value is converted with str() first.

    Returns the cleaned string.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub("'", "", text)
    text = re.sub(r"\W+", " ", text)
    return text.lower()

# Overwrite the raw article bodies with their cleaned form in both splits.
test['text'] = test['text'].apply(clean_txt)
train['text'] = train['text'].apply(clean_txt)

**PLOTTING DATA**

Word Count histogram

In [None]:
# Whitespace-separated token count of every cleaned article.
train['word_count'] = list(map(lambda article: len(article.split()), train['text']))
# Histogram for real news (label 0).
sns.distplot(train.loc[train['label'] == 0, 'word_count'], kde=False, rug=False)

In [None]:
# Histogram of word counts for fake news (label 1).
sns.distplot(train.loc[train['label'] == 1, 'word_count'], kde=False, rug=False)

In [None]:
# Same fake-news histogram, restricted to articles shorter than 20000 words.
sns.distplot(
    train.loc[(train['label'] == 1) & (train['word_count'] < 20000), 'word_count'],
    kde=False,
    rug=False,
)

In [None]:
from wordcloud import WordCloud

def plot_wordcloud(target,width = 800, height = 400):
    """
    Plot a word cloud built from the real or the fake training news.

    target: 'real' (label 0) or 'fake' (label 1)
    width: width in pixels passed to WordCloud
    height: height in pixels passed to WordCloud

    Raises ValueError if target is neither 'real' nor 'fake'.
    """
    label_by_target = {'real': 0, 'fake': 1}
    if target not in label_by_target:
        raise ValueError("target must be 'real' or 'fake', got %r" % (target,))

    articles = train.loc[train['label'] == label_by_target[target], 'text']
    # Join with a space so the last word of one article does not fuse with
    # the first word of the next one.
    corpus = ' '.join(articles)

    # Honour the requested canvas size instead of the hard-coded 800x400.
    wordcloud = WordCloud(max_font_size=40, min_font_size=20, width=width, height=height, random_state=0).generate(corpus)
    plt.figure(figsize=(20,10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
plot_wordcloud('real',width = 800, height = 400)

In [None]:
plot_wordcloud('fake',width = 800, height = 400)

In [None]:
# how many words in top 10, top 100, and top 1000
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def concat_text(target):
    """
    Concatenate all training news of one class and return them as a word list.

    target: 'real' (label 0) or 'fake' (label 1)

    Returns a list of whitespace-separated tokens, with no empty strings.
    Raises ValueError if target is neither 'real' nor 'fake'.
    """
    label_by_target = {'real': 0, 'fake': 1}
    if target not in label_by_target:
        raise ValueError("target must be 'real' or 'fake', got %r" % (target,))

    articles = train.loc[train['label'] == label_by_target[target], 'text']
    # Joining with a space keeps words at article boundaries apart, and
    # split() without an argument drops the empty tokens that split(' ')
    # produces for leading, trailing or repeated spaces.
    return ' '.join(articles).split()

def most_frequent_words(text):
    """
    Rank the vocabulary of `text` by frequency, most frequent first.

    text: iterable of strings; each element is passed to CountVectorizer as
          one document (callers in this notebook pass a list of single words).

    Returns a NumPy array of vocabulary terms ordered by descending count.
    """
    ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
    X = ngram_vectorizer.fit_transform(text)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases.
    if hasattr(ngram_vectorizer, 'get_feature_names_out'):
        names = ngram_vectorizer.get_feature_names_out()
    else:
        names = ngram_vectorizer.get_feature_names()
    vocab = np.array(list(names))

    # Column sums are the total count of each term over all documents.
    counts = np.asarray(X.sum(axis=0)).ravel()
    inds = counts.argsort()[::-1]
    ordered_vocab = vocab[inds]

    return ordered_vocab

In [None]:
def plot_topK_distribution(k1 = 10, k2 = 100, k3 = 1000):
    """
    Plot the comparison bar chart between real and fake news.

    For each class, the bar height is the proportion of word occurrences
    that belong to that class's k most frequent words.

    k1: most common k1 words
    k2: most common k2 words
    k3: most common k3 words
    """
    real_text = concat_text('real')
    fake_text = concat_text('fake')

    real_vocab = most_frequent_words(real_text)
    fake_vocab = most_frequent_words(fake_text)

    ks = (k1, k2, k3)

    def _coverage(tokens, ranked_vocab, k):
        # Slice from 0 so the single most frequent word is included and
        # exactly k words are used; a set makes each membership test O(1).
        top_k = set(ranked_vocab[:k].tolist())
        return np.mean([token in top_k for token in tokens])

    x = ['top' + str(k) for k in ks]
    label = ['real'] * len(ks) + ['fake'] * len(ks)
    y = ([_coverage(real_text, real_vocab, k) for k in ks]
         + [_coverage(fake_text, fake_vocab, k) for k in ks])

    df = pd.DataFrame(list(zip(x*2, label, y)), columns=["Topk", "Label", "Proportion"])
    sns.barplot(x="Topk", hue="Label", y="Proportion", data=df)
    plt.show()

plot_topK_distribution(k1 = 10, k2 = 100, k3 = 1000)

Modelling

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows as a validation set, with a fixed seed.
# Note: `train` is rebound here, so re-running this cell splits the already-reduced frame again.
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=35,
)

In [None]:
def get_split(text, chunk_size=150, stride=120):
    """
    Split a news text into overlapping subtexts of at most `chunk_size` words.

    Consecutive subtexts start `stride` words apart, so with the defaults
    neighbouring chunks share 30 words. Chunks are produced until the last
    word of the text is covered, so no trailing words are lost.

    text: the text to split (whitespace-separated words)
    chunk_size: maximum number of words per subtext
    stride: number of words between the starts of consecutive subtexts

    Returns a list of subtext strings; a text with no words yields [""].
    Raises ValueError unless 0 < stride <= chunk_size.
    """
    if not 0 < stride <= chunk_size:
        raise ValueError("need 0 < stride <= chunk_size")

    # Tokenise once instead of re-splitting the text for every chunk.
    words = text.split()
    l_total = []
    start = 0
    while True:
        l_total.append(" ".join(words[start:start + chunk_size]))
        # Stop as soon as the current window reaches the end of the text.
        if start + chunk_size >= len(words):
            break
        start += stride
    return l_total

# Attach the list of subtexts to every article in each of the three splits.
test['text_split'] = test['text'].apply(get_split)
val['text_split'] = val['text'].apply(get_split)
train['text_split'] = train['text'].apply(get_split)

In [None]:
# Preview the chunks of the second training row by position. A label lookup
# with [1] raises KeyError whenever original row 1 landed in the validation split.
train['text_split'].iloc[1]

In [None]:
def data_augumentation(df, df_name):
    """
    Flatten the per-article subtext lists into one row per subtext.

    df: dataframe with a 'text_split' column (list of subtexts per row) and a 'label' column
    df_name: name of the set, used only in the printed summary

    Returns a new dataframe with columns 'text' (one subtext), 'label' (label
    of the source row) and 'index' (index of the source row in df). A one-line
    summary of the resulting sizes is printed.
    """
    # One (subtext, label, source index) record per chunk, in row order.
    records = [
        (chunk, label, idx)
        for idx, label, chunks in zip(df.index, df['label'], df['text_split'])
        for chunk in chunks
    ]
    new_df = pd.DataFrame({
        'text': [record[0] for record in records],
        'label': [record[1] for record in records],
        'index': [record[2] for record in records],
    })
    summary = "The " + df_name +" set now has " + str(len(new_df))
    summary = summary + ' subtexts extracted from ' + str(len(df)) + ' texts.'
    print(summary)
    return new_df

# Expand each split into one row per subtext; each call prints its own summary line.
train_df = data_augumentation(df=train, df_name='training')
val_df = data_augumentation(df=val, df_name='validation')
test_df = data_augumentation(df=test, df_name='testing')

Model Application

DATA PREPROCESSING

In [None]:
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
import nltk

nltk.download("stopwords")
nltk.download("punkt")

In [None]:
from google.colab import files
data_to_load = files.upload()

In [None]:
# Read the two uploaded CSV files from the working directory: fake news first, true news second.
df_fake, df_true = (
    pd.read_csv(csv_name)
    for csv_name in ("Fake.csv", "True.csv")
)

In [None]:
# Label convention for this section: 1 = true news, 0 = fake news.
# This is the opposite of the `label` column used for the JSON datasets earlier in the notebook.
df_true["class"] = 1
df_fake["class"] = 0

In [None]:
df_fake.shape, df_true.shape

In [None]:
# Hold out the last rows of each frame for manual testing and remove them from
# the data used for training. Slicing by position avoids hard-coded row labels
# (which only match one exact dataset size) and does not raise KeyError when
# the cell is executed again.
n_manual = 10

df_fake_manual_testing = df_fake.tail(n_manual).copy()
df_fake = df_fake.iloc[:-n_manual]

df_true_manual_testing = df_true.tail(n_manual).copy()
df_true = df_true.iloc[:-n_manual]

In [None]:
# Stack the two held-out samples (fake rows first) and save them to disk.
df_manual_testing = pd.concat(
    [df_fake_manual_testing, df_true_manual_testing],
    axis=0,
)
df_manual_testing.to_csv("manual_testing.csv")

In [None]:
# Combine the remaining fake and true rows into one frame; original row labels are kept.
df_merge = pd.concat(
    [df_fake, df_true],
    axis=0,
)

In [None]:
# Drop the three metadata columns; the remaining columns are what the model cells use.
df = df_merge.drop(columns=["title", "subject", "date"])

In [None]:
# Shuffle all rows. The fixed seed (the same value used for the earlier
# train/validation split) makes the order reproducible on Restart & Run All.
df = df.sample(frac=1, random_state=35)

In [None]:
df.isnull().sum()

In [None]:
def word_drop(text):
    """
    Clean one article body before vectorisation.

    Steps, in order: lower-case; remove [bracketed] spans; remove URLs;
    remove HTML-like tags; replace every non-word character with a space;
    remove remaining punctuation (only underscores survive the previous
    step); remove every token that contains a digit.

    text: raw article text (str)

    Returns the cleaned string. Whitespace is not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and tags must be stripped while their punctuation is still present;
    # once non-word characters become spaces these patterns can never match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Only tag-shaped spans are removed, so a stray '<' or '>' in prose does
    # not delete the text around it.
    text = re.sub(r'</?[a-z][^<>]*>', '', text)
    # Newlines are non-word characters too, so no separate newline step is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Replace every article body with its cleaned form.
df["text"] = df["text"].apply(
    word_drop
)

In [None]:
# Target is the class column; features are the cleaned article text.
y = df["class"]
x = df["text"]

In [None]:
# 75/25 train/test split. The seed (the same value used for the earlier
# train/validation split) makes the reported scores reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=35)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit the TF-IDF vectoriser on the training texts only, then reuse it to
# transform the test texts.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the TF-IDF training matrix.
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [None]:
LR.score(xv_test,y_test)

In [None]:
# Predicted classes for the held-out test matrix.
pred_LR = LR.predict(xv_test)

In [None]:
# classification_report is only imported in a later cell of this notebook, so
# import it here; otherwise this cell raises NameError on Restart & Run All.
from sklearn.metrics import classification_report

print(classification_report(y_test, pred_LR))

Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so its score is reproducible between runs; the value matches
# the random_state given to the random forest in this notebook.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

In [None]:
DT.score(xv_test,y_test)

In [None]:
# Predicted classes for the held-out test matrix.
pred_DT = DT.predict(xv_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 for the decision tree.
print(
    classification_report(y_test, pred_DT)
)

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# NOTE(review): this instantiation is repeated verbatim at the start of the next cell, so this cell looks redundant.
RFC=RandomForestClassifier(random_state=0)

In [None]:
# Random forest with a fixed seed, fitted on the TF-IDF training matrix.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

In [None]:
RFC.score(xv_test,y_test)

In [None]:
# Predicted classes for the held-out test matrix.
pred_RFC = RFC.predict(xv_test)

In [None]:
# Per-class precision / recall / F1 for the random forest.
print(
    classification_report(y_test, pred_RFC)
)