In [1]:

from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import matplotlib_inline
import re
import math
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import nltk
from nltk.corpus import stopwords
import os

# Cleaning the dataset
Cleaning the content column, tokenizing, stemming.
Dropping empty rows, unnecessary columns. 

In [2]:
filename = '995,000_rows.csv'
df = pd.read_csv(filename, usecols=['content', 'type'])

In [3]:
def clean_text(text):
    """Redacts URLs, dates, email addresses and numbers in a given text input, as well as converting text to lower case and removing tabs, newlines, and spaces following other spaces"""
    text = str(text)
    date_exp =  {
                "year_mm_dd" : re.compile(r'[^\d]{1}([0-9]{2,4})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),   
                "dd_mm_year" : re.compile(r'[^\d]{1}([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),
                "mm_dd_year" : re.compile(r'[^\d]{1}([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),
                "year_mm_dd_time" : re.compile(r'[^\d]{1}([0-9]{2,4})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),
                "dd_mm_year_time" : re.compile(r'[^\d]{1}([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),
                "mm_dd_year_time" : re.compile(r'[^\d]{1}([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2}\.[\d]{6})?', re.MULTILINE),
                "year_mm_dd_hh_mm" : re.compile(r'[^\d]{1}([0-9]{2,4})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})\s?([\d]{2}:[\d]{2})', re.MULTILINE),
                "dd_mm_year_hh_mm" : re.compile(r'[^\d]{1}([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2})', re.MULTILINE),
                "mm_dd_year_hh_mm" : re.compile(r'[^\d]{1}([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2})', re.MULTILINE),
                "year_mm_dd_hh_mm_ss" : re.compile(r'[^\d]{1}([0-9]{2,4})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})\s?([\d]{2}:[\d]{2}:[\d]{2})', re.MULTILINE),
                "dd_mm_year_hh_mm_ss" : re.compile(r'[^\d]{1}([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2})', re.MULTILINE),
                "mm_dd_year_hh_mm_ss" : re.compile(r'[^\d]{1}([0-1]{1}[0-9]{1})[\s\/\.\-\\]?([0-3]{1}[0-9]{1})[\s\/\.\-\\]?([0-9]{2,4})\s?([\d]{2}:[\d]{2}:[\d]{2})', re.MULTILINE),
                }
    num_exp = re.compile('[0-9]+[,.]?[0-9]*', re.MULTILINE)
    num2_exp = re.compile(r'([0-9]+)((st)?(nd)?(rd)?(th)?(st)?){1}')
    url_exp = re.compile(r'((h{1}t{2}p{1}s?\:{1}\/{2})|(w{3}\.{1})){0,2}[^,\s]*\.[a-zA-Z]{2,}[^,\s]*', re.MULTILINE)
    email_exp = re.compile(r'[^,\s\/]*@{1}[^,\s\/]*\.[a-zA-Z]{2,3}', re.MULTILINE)
    space_exp = re.compile(r'([\s]{2,})|[\t]|[\n]+', re.MULTILINE)
    punctuation_exp = re.compile(r'[^\w\s]', re.MULTILINE)

    text = text.lower()
    for exp in date_exp.values():
        text = exp.sub('datetoken', text) #Replace dates before numbers
    text = num2_exp.sub('numtoken', text)
    text = num_exp.sub('numtoken', text)
    text = url_exp.sub('urltoken', text)
    text = email_exp.sub('emailtoken', text)
    text = space_exp.sub(' ', text)
    text = punctuation_exp.sub(' ', text)
    return text

def clean_text_series(series):
    return series.apply(clean_text)

In [None]:
def df_chunker(df, chunksize):
    list_df = np.array_split(df, math.ceil(len(df) / chunksize))
    del df
    j = 1
    for df in list_df:
        print(f"Processing chunk {j} of {len(list_df)}:")
        # Check if chunk parquet already exists
        try:
            pq.read_table(f"chunk_{j}.parquet").to_pandas()
            print(f"Chunk {j} already processed!", flush=True)
            chunk = pd.read_parquet(f"chunk_{j}.parquet")
            j += 1
            continue
        except:
            pass

        print("Cleaning content...", flush=True)
        df['content'] = clean_text_series(df['content'])
        print(" ", end="\r", flush=True)

        print("Calculating features...", end="\r", flush=True)
        df['length'] = df['content'].apply(len)
        df['distinct_words'] = df['content'].apply(lambda x: set(x.split()))
        df['length_distinct_words'] = df['distinct_words'].apply(len)
        df['group'] = df['type'].apply(lambda x: 1 if x in ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate'] else 0)

        def f_NUM(x):
            count = 0
            for word in x.split():
                if word == 'numtoken':
                    count += 1
            return count

        def f_URL(x):
            count = 0
            for word in x.split():
                if word == 'urltoken':
                    count += 1
            return count
            
        def f_EMAIL(x):
            count = 0
            for word in x.split():
                if word == 'emailtoken':
                    count += 1
            return count

        def f_DATE(x):
            count = 0
            for word in x.split():
                if word == 'datetoken':
                    count += 1
            return count

        df['numtokens'] = df['content'].apply(f_NUM)
        df['urltokens'] = df['content'].apply(f_URL)
        df['emailtokens'] = df['content'].apply(f_EMAIL)
        df['datetokens'] = df['content'].apply(f_DATE)

        print("\n", end="\r", flush=True)

        print("Saving chunk...", end="\r", flush=True)
        df.to_parquet(f"chunk_{j}.parquet")
        print(f"Chunk {j} done!", flush=True)
        
        j += 1
    df = pd.concat([pd.read_parquet(f"chunk_{j}.parquet") for j in range(1, len(list_df) + 1)])
    del list_df
    return df

In [5]:
df = df_chunker(df, 50000)

  return bound(*args, **kwds)


Processing chunk 1 of 20:
Cleaning content...
Calculating features...
Chunk 1 done!..
Processing chunk 2 of 20:
Cleaning content...
Calculating features...
Chunk 2 done!..
Processing chunk 3 of 20:
Cleaning content...
Calculating features...
Chunk 3 done!..
Processing chunk 4 of 20:
Cleaning content...
Calculating features...
Chunk 4 done!..
Processing chunk 5 of 20:
Cleaning content...
Calculating features...
Chunk 5 done!..
Processing chunk 6 of 20:
Cleaning content...
Calculating features...
Chunk 6 done!..
Processing chunk 7 of 20:
Cleaning content...
Calculating features...
Chunk 7 done!..
Processing chunk 8 of 20:
Cleaning content...
Calculating features...
Chunk 8 done!..
Processing chunk 9 of 20:
Cleaning content...
Calculating features...
Chunk 9 done!..
Processing chunk 10 of 20:
Cleaning content...
Calculating features...
Chunk 10 done!.
Processing chunk 11 of 20:
Cleaning content...
Calculating features...
Chunk 11 done!.
Processing chunk 12 of 20:
Cleaning content...
Calcu

In [14]:
df.drop(['content'], axis= 1)

Unnamed: 0,length,group,numtokens,urltokens,emailtokens,datetokens
0,2132,0,2,0,0,0
1,997,1,4,2,0,0
2,189,1,2,1,0,0
3,4903,0,6,2,0,0
4,36816,1,960,3,0,1
...,...,...,...,...,...,...
994995,171,1,0,0,0,0
994996,2132,0,2,0,0,0
994997,6445,0,27,5,0,0
994998,121,0,0,0,0,0


In [71]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test  = train_test_split(df['content'], df['group'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)



In [69]:
x_test

577163    to the editor  re  foreign stimulus   op ed  s...
925049    plus one article on google plus  thanks to ali...
903173    mr  morris  who is now numtoken  says his lush...
610830    hideous absinthe a history of the devil in a b...
219994    gary cahill could leave chelsea at the end of ...
                                ...                        
685965    it started out as a love song  alanis morisset...
981068    royal oil and vinegar bottle set with stainles...
214052    there have been many times where i have gone o...
392662    yesterday  i received a threatening letter fro...
15486     a variety of executive orders have been signed...
Name: content, Length: 199000, dtype: object

In [None]:


# Convert the text to a bag-of-words representation
vectorizer = CountVectorizer(analyzer='word', stop_words='english')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
x_val = vectorizer.transform(x_val) 



In [73]:
# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(x_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
# Make predictions on the test set
y_pred = classifier.predict(x_val)



In [77]:

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8514472361809046


In [None]:
df = df[df['content'].apply(len) > 0]

In [None]:
word_counts = {}
for content in df['content']:
    for word in content:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

top_10k = sorted(word_counts, key=word_counts.get, reverse=True)[:10000]

#df['content_filtered'] = df['content'].apply(lambda x: [word for word in x if word in top_10k])

# Visualizations
## Articles of each type

In [None]:
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
ax1.bar(df["type"].value_counts().index, df["type"].value_counts().values)
ax1.set_title(f'Article types')
ax1.set_xlabel('Types')
plt.xticks(rotation=45)
ax1.set_ylabel('# of articles')
fig1.savefig(f'data\\articles_of_each_type.png')
fig1.show()

## Text summaries

In [None]:
for type in df['type'].unique():
    name = f'{type}_df'
    globals()[name] = df[df['type'] == type]

    print(f'Articles of type {type}: {len(globals()[name])}')

    print(f'{type} mean length: {globals()[name]["length"].mean()}')
    print(f'{type} median length: {globals()[name]["length"].median()}')

    print(f'{type} mean distinct words: {globals()[name]["length_distinct_words"].mean()}')
    print(f'{type} median distinct words: {globals()[name]["length_distinct_words"].median()}')

    print('\n')


## Article lengths vs. number of distinct words

In [None]:
# Calculate global min and max for length and length_distinct_words
global_min_length = df['length'].min()
global_max_length = df['length'].max()
global_min_length_distinct_words = df['length_distinct_words'].min()
global_max_length_distinct_words = df['length_distinct_words'].max()

# Main plot with all data points
fig1, ax1 = plt.subplots(figsize=(10, 10))

# Use a color map to automatically assign colors
color_map = plt.get_cmap('tab10')

# Plot each type with a different color and set the size of the dots
dot_size = 1  # Adjust this value to change the size of the dots
for i, article_type in enumerate(df['type'].unique()):
    type_df = df[df['type'] == article_type]
    ax1.scatter(type_df['length_distinct_words'], type_df['length'], color=color_map(i), label=article_type, s=dot_size)

ax1.set_title('Article length vs number of distinct words')
ax1.set_xlabel('Number of distinct words')
ax1.set_ylabel('Article length')
ax1.legend()
ax1.set_xlim(global_min_length_distinct_words, global_max_length_distinct_words)
ax1.set_ylim(global_min_length, global_max_length)

# Create a new figure for the grid of subplots
num_types = len(df['type'].unique())
ncols = 2
nrows = (num_types + 1) // ncols  # Calculate the number of rows needed

fig2, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 5 * nrows))

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Plot each type in a separate subplot
for i, article_type in enumerate(df['type'].unique()):
    type_df = df[df['type'] == article_type]
    ax = axes[i]
    ax.scatter(type_df['length_distinct_words'], type_df['length'], color=color_map(i), label=article_type, s=dot_size)
    ax.set_title(f'{article_type}')
    ax.set_xlabel('Number of distinct words')
    ax.set_ylabel('Article length')
    ax.legend()
    ax.set_xlim(global_min_length_distinct_words, global_max_length_distinct_words)
    ax.set_ylim(global_min_length, global_max_length)

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig2.delaxes(axes[j])

# Adjust layout to prevent overlap
plt.tight_layout()

# Save the figures
fig1.savefig('data/article_length_vs_distinct_words.png')
fig2.savefig('data/article_length_vs_distinct_words_by_type.png')
plt.show()

## Distributions of article lengths pr. type

In [None]:
# Create the directory if it does not exist
output_dir = 'data/article_length_distributions'
os.makedirs(output_dir, exist_ok=True)

# Calculate global min and max for length and log(length)
global_min_length = df['length'].min()
global_max_length = df['length'].max()
global_min_log_length = np.log(df['length']).min()
global_max_log_length = np.log(df['length']).max()

for article_type in df['type'].unique():
    fig1 = plt.figure(figsize=(10, 5))
    # Adding two subplots side by side
    ax1 = fig1.add_subplot(1, 2, 1)
    ax2 = fig1.add_subplot(1, 2, 2)
    
    type_df = df[df['type'] == article_type]
    
    # The first subplot shows the distribution of the length of the articles
    ax1.hist(type_df['length'], bins=100)
    ax1.set_title(f'{article_type} article length distribution')
    ax1.set_xlabel('Article length')
    ax1.set_ylabel('# of articles')
    ax1.set_xlim(global_min_length, global_max_length)  # Set uniform x-axis limits

    # The second subplot shows the distribution of the log of the length of the articles to better visualize the distribution
    ax2.hist(np.log(type_df['length']), bins=100)
    ax2.set_title(f'{article_type} article length distribution (log scale)')
    ax2.set_xlabel('log(Article length)')
    ax2.set_ylabel('# of articles')
    ax2.set_xlim(global_min_log_length, global_max_log_length)  # Set uniform x-axis limits

    fig1.savefig(f'{output_dir}/{article_type}_article_length_distribution.png')
    plt.show()

## Vocabulary

In [None]:
word_counts = {}
for content in df['content']:
    for word in content:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

In [None]:
for df in [bias_df, clickbait_df, conspiracy_df, fake_df, hate_df, junksci_df, political_df, reliable_df, rumor_df, satire_df, unknown_df, unreliable_df]:
    type = df['type'].unique()[0]
    name = f'{type}_wordlist'
    word_counts = {}
    for content in df['content']:
        for word in content:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1
    globals()[name] = word_counts

## Most common words by type

In [None]:
fig1 = plt.figure(figsize=(20, 5))
ax1 = fig1.add_subplot(111)
ax1.bar(*zip(*sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10]))
ax1.set_title('10 most common words')
ax1.set_xlabel('Words')
plt.xticks(rotation=45)
ax1.set_ylabel('# of occurrences')
fig1.savefig('data/10_most_common_words.png')
fig1.show()

In [None]:
for df in [bias_df, clickbait_df, conspiracy_df, fake_df, hate_df, junksci_df, political_df, reliable_df, rumor_df, satire_df, unknown_df, unreliable_df]:
    type = df['type'].unique()[0]
    fig1 = plt.figure(figsize=(20, 5))
    ax1 = fig1.add_subplot(111)
    ax1.bar(*zip(*sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10]))
    ax1.set_title(f'{type} articles 10 most common words')
    ax1.set_xlabel('Words')
    plt.xticks(rotation=45)
    ax1.set_ylabel('# of occurrences')
    fig1.savefig(f'data/{type}10_most_common_words.png')
    fig1.show()

In [None]:
def f1(x):
    if x in ['political', 'clickbait', 'reliable']: return 1
    else: return 0 

df['Group'] = df['type'].apply(f1)
df

In [None]:
vocab = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10000]

In [None]:
def f3(x):
    list = []
    for word in x:
        list.append(word)
    string = ' '.join(list)
    return string
df['content'] = df['content'].apply(f3)

In [None]:
contentlist = df['content'].tolist()
grouplist = df['Group'].tolist()

In [None]:
df = df.drop(['Unnamed: 0', 'id', 'domain', 'url', 'scraped_at', 'inserted_at', 'updated_at', 'meta_keywords','source', 'title', 'distinct_words', 'type'], axis=1, inplace=True)

In [None]:
df.describe()

In [None]:
grouplist

In [None]:
y_pred

In [None]:
y_test

In [None]:
train_word_counts = sorted(train_word_counts.items(), key=lambda x: x[1], reverse=True)[:10000]
len(train_word_counts)

In [None]:
train.