In [1]:
from google.colab import drive
drive.mount("/drive", force_remount=True)

Mounted at /drive


In [1]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

### Load data

In [2]:
real_news_df = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/True.csv')
fake_news_df = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/Fake.csv')

# My local loading
# real_news_df = pd.read_csv('News_dataset/True.csv')
# fake_news_df = pd.read_csv('News_dataset/Fake.csv')

### Preprocess

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/pranav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pranav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Init lemmatizer
lemmatizer = WordNetLemmatizer()

Data Pre-processing:

- Lowercasing
  - Ensures that the same word in different cases is not considered as two different words, reducing data redundancy.

- Tokenization:
  - Allows us to analyze the text at the word level, which is crucial for most NLP tasks.

- Stop word removal:
  - Helps us focus on the important words by removing common words with little semantic value.
  - Reduces the size of the data, making it more manageable.

- Lemmatization:
  - Reduces words to their base form, decreasing data dimensionality.
  - Helps in treating different forms of a word as a single entity, which is important for understanding the semantic meaning of the text.

In [5]:
def preprocess_text(text):

    text = text.lower()

    words = word_tokenize(text) # Tokenize the text

    words = [word for word in words if word not in stopwords.words('english')] # Remove stop words

    words = [lemmatizer.lemmatize(word) for word in words] # Lemmatize the words

    text = ' '.join(words) # Join the words back into a single string

    return text

In [6]:
# Save the original text before it processed
real_news_df['original_text'] = real_news_df['text'].copy()
fake_news_df['original_text'] = fake_news_df['text'].copy()


In [7]:
%%time
# Takes 14 min to complete
real_news_df['text'] = real_news_df['text'].progress_apply(preprocess_text)
fake_news_df['text'] = fake_news_df['text'].progress_apply(preprocess_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 21417/21417 [06:39<00:00, 53.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 23481/23481 [07:46<00:00, 50.30it/s]

CPU times: user 10min 55s, sys: 3min 27s, total: 14min 22s
Wall time: 14min 26s





### Analysis

In [8]:
import re
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import FreqDist, bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

In [9]:
# Calculate word frequencies
def word_frequencies(texts, no_stopwords_flag=False):
    words = nltk.word_tokenize(' '.join(texts))
    if no_stopwords_flag:
        words = [word for word in words if word not in stopwords.words('english')]
    elif not no_stopwords_flag:
        words = [word for word in words if word in stopwords.words('english')]
    return FreqDist(words)

In [10]:
# Calculate bigram frequencies
def bigram_frequencies(texts):
    words = nltk.word_tokenize(' '.join(texts))
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    return finder.ngram_fd.items()

In [11]:
# Calculate Mutual Information scores
def mutual_info_scores(texts, min_freq=5):
    words = nltk.word_tokenize(' '.join(texts))
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(min_freq)
    return finder.score_ngrams(bigram_measures.pmi)


In [12]:
%%time
# Get the top 50 stop words by frequency in fake and real news articles
fake_news_stopwords = word_frequencies(fake_news_df['text'].tolist(), no_stopwords_flag=False).most_common(50)
real_news_stopwords = word_frequencies(real_news_df['text'].tolist(), no_stopwords_flag=False).most_common(50)

# Get the top 50 content words by frequency in fake and real news articles
fake_news_content_words = word_frequencies(fake_news_df['text'].tolist(), no_stopwords_flag=True).most_common(50)
real_news_content_words = word_frequencies(fake_news_df['text'].tolist(), no_stopwords_flag=True).most_common(50)


CPU times: user 13min 34s, sys: 4min 17s, total: 17min 51s
Wall time: 18min


In [13]:
%%time
# Get the top 50 bigrams by frequencies in fake and real news articles
fake_news_bigrams = bigram_frequencies(fake_news_df['text'].tolist())
fake_news_bigrams_top50 = FreqDist(fake_news_bigrams).most_common(50)
real_news_bigrams = bigram_frequencies(fake_news_df['text'].tolist())
real_news_bigrams_top50 = FreqDist(real_news_bigrams).most_common(50)


CPU times: user 49.8 s, sys: 457 ms, total: 50.3 s
Wall time: 50.4 s


In [14]:
%%time
# Get the top 50 bigrams by their Mutual Information scores in fake and real news articles
fake_news_bigrams_mi = mutual_info_scores(fake_news_df['text'].tolist())
fake_news_bigrams_mi_top50 = FreqDist(fake_news_bigrams_mi).most_common(50)

real_news_bigrams_mi = mutual_info_scores(fake_news_df['text'].tolist())
real_news_bigrams_mi_top50 = FreqDist(real_news_bigrams_mi).most_common(50)


CPU times: user 49.7 s, sys: 331 ms, total: 50 s
Wall time: 50.1 s


In [15]:
%%time
# Get the top 50 adjective words in fake and real news articles
fake_news_adjectives = [word for word, pos in pos_tag(nltk.word_tokenize(' '.join(fake_news_df['text'].tolist()))) if pos.startswith('JJ')]
real_news_adjectives = [word for word, pos in pos_tag(nltk.word_tokenize(' '.join(fake_news_df['text'].tolist()))) if pos.startswith('JJ')]
fake_news_adjectives = FreqDist(fake_news_adjectives).most_common(50)
real_news_adjectives = FreqDist(real_news_adjectives).most_common(50)


CPU times: user 5min 9s, sys: 2.07 s, total: 5min 11s
Wall time: 5min 12s


In [16]:
# Print the results
print('Top 50 stop words in fake news:\n', fake_news_stopwords)
print('\nTop 50 stop words in real news:\n', real_news_stopwords)

print("\n\n######################\n\n")

print('Top 50 content words in fake news:\n', fake_news_content_words)
print('Top 50 content words in real news:\n', real_news_content_words)

print("\n\n######################\n\n")

print('Top 50 bigrams in fake news:\n', fake_news_bigrams_top50)
print('\nTop 50 bigrams in real news:\n', real_news_bigrams_top50)

print("\n\n######################\n\n")

print('Top 50 bigrams by MI in fake news:\n', fake_news_bigrams_mi_top50)
print('\nTop 50 bigrams by MI in real news:\n', real_news_bigrams_mi_top50)

print("\n\n######################\n\n")

print('Top 50 adjectives in fake news:\n', fake_news_adjectives)
print('\nTop 50 adjectives in real news:\n', real_news_adjectives)


Top 50 stop words in fake news:
 [('as', 507), ('being', 187), ('down', 86), ('be', 64), ('m', 56), ('d', 33), ('re', 32), ('s', 31), ('haven', 28), ('can', 25), ('ma', 18), ('in', 14), ('so', 13), ('out', 11), ('t', 10), ('will', 8), ('do', 8), ('it', 8), ('more', 8), ('he', 8), ('there', 6), ('o', 5), ('have', 5), ('this', 4), ('that', 4), ('you', 3), ('no', 3), ('now', 2), ('an', 2), ('here', 2), ('yourself', 2), ('on', 2), ('all', 1), ('himself', 1), ('ours', 1), ('below', 1), ('is', 1), ('while', 1), ('theirs', 1), ('don', 1), ('them', 1), ('him', 1), ('i', 1)]

Top 50 stop words in real news:
 [('it', 208), ('that', 118), ('them', 83), ('do', 69), ('me', 66), ('out', 64), ('now', 63), ('him', 59), ('haven', 57), ('again', 55), ('here', 47), ('this', 42), ('up', 38), ('on', 36), ('you', 34), ('be', 33), ('ma', 33), ('no', 30), ('is', 30), ('will', 28), ('not', 27), ('there', 27), ('all', 27), ('down', 26), ('about', 25), ('so', 24), ('m', 24), ('in', 22), ('for', 20), ('to', 18), 

### Further Analysis

In [17]:
# Calculate the number of capitalized words
def count_capitalized(text):
    return len(re.findall(r'\b[A-Z]{2,}\b', text))


In [18]:
# Calculate the number of exclamation marks
def count_exclamation_marks(text):
    return text.count('!')


In [19]:
# Calculate the number of punctuation marks
def count_punctuation(text):
    return len([char for char in text if char in string.punctuation])


In [21]:
# Apply the functions to the 'processed_text'
real_news_df['word_count'] = real_news_df['text'].progress_apply(lambda x: len(nltk.word_tokenize(x)))
fake_news_df['word_count'] = fake_news_df['text'].progress_apply(lambda x: len(nltk.word_tokenize(x)))

real_news_df['content_word_count'] = real_news_df['text'].progress_apply(lambda x: len([word for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))
fake_news_df['content_word_count'] = fake_news_df['text'].progress_apply(lambda x: len([word for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))

real_news_df['capitalized_word_count'] = real_news_df['text'].progress_apply(count_capitalized)
fake_news_df['capitalized_word_count'] = fake_news_df['text'].apply(count_capitalized)

real_news_df['exclamation_mark_count'] = real_news_df['text'].progress_apply(count_exclamation_marks)
fake_news_df['exclamation_mark_count'] = fake_news_df['text'].progress_apply(count_exclamation_marks)

real_news_df['punctuation_count'] = real_news_df['text'].progress_apply(count_punctuation)
fake_news_df['punctuation_count'] = fake_news_df['text'].progress_apply(count_punctuation)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 21417/21417 [00:17<00:00, 1257.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 23481/23481 [00:18<00:00, 1286.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 21417/21417 [04:20<00:00, 82.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 23481/23481 [04:47<00:00, 81.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 21417/21417 [00:00<00:00, 49600.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 21417/21417 [00:00<00:00, 867866.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████

In [22]:
# Save the analysis
real_news_df.to_csv('./Shreya_Zope_real_news_processed.csv', index=False)
fake_news_df.to_csv('./Shreya_Zope_fake_news_processed.csv', index=False)


### Compare Fake News vs Real News stats

Interpretation of Analysis Results:

- Word Count: Both real and fake news articles have similar average word counts, suggesting similar lengths of articles.

- Content Word Count: The average content word counts are also similar, indicating comparable amounts of meaningful content in both types of articles.

- Capitalized Word Count: The average count of capitalized words is zero for both, suggesting that capitalized words are not commonly used in either type of article.

- Exclamation Mark Count: Fake news articles have a significantly higher average count of exclamation marks, indicating a more sensational or dramatic writing style.

- Punctuation Count: Fake news articles have a higher average count of punctuation marks, suggesting potentially more complex sentences or a more sensational writing style.


In [23]:
# Calculate the averages of the statistics for the real and fake news articles
real_news_avg_word_count = real_news_df['word_count'].mean()
fake_news_avg_word_count = fake_news_df['word_count'].mean()

real_news_avg_content_word_count = real_news_df['content_word_count'].mean()
fake_news_avg_content_word_count = fake_news_df['content_word_count'].mean()

real_news_avg_capitalized_word_count = real_news_df['capitalized_word_count'].mean()
fake_news_avg_capitalized_word_count = fake_news_df['capitalized_word_count'].mean()

real_news_avg_exclamation_mark_count = real_news_df['exclamation_mark_count'].mean()
fake_news_avg_exclamation_mark_count = fake_news_df['exclamation_mark_count'].mean()

real_news_avg_punctuation_count = real_news_df['punctuation_count'].mean()
fake_news_avg_punctuation_count = fake_news_df['punctuation_count'].mean()


In [24]:
# Print the averages
print('Real news average word count:', real_news_avg_word_count)
print('Fake news average word count:', fake_news_avg_word_count)

print('Real news average content word count:', real_news_avg_content_word_count)
print('Fake news average content word count:', fake_news_avg_content_word_count)

print('Real news average capitalized word count:', real_news_avg_capitalized_word_count)
print('Fake news average capitalized word count:', fake_news_avg_capitalized_word_count)

print('Real news average exclamation mark count:', real_news_avg_exclamation_mark_count)
print('Fake news average exclamation mark count:', fake_news_avg_exclamation_mark_count)

print('Real news average punctuation count:', real_news_avg_punctuation_count)
print('Fake news average punctuation count:', fake_news_avg_punctuation_count)


Real news average word count: 279.3409908016996
Fake news average word count: 281.68106128359096
Real news average content word count: 279.25666526591027
Fake news average content word count: 281.629700609003
Real news average capitalized word count: 0.0
Fake news average capitalized word count: 0.0
Real news average exclamation mark count: 0.06191343325395714
Fake news average exclamation mark count: 0.7229675056428602
Real news average punctuation count: 50.61782695989167
Fake news average punctuation count: 59.23789446786764
