Sources:
https://matplotlib.org/stable/tutorials/pyplot.html
https://pypi.org/project/langdetect/
https://www.geeksforgeeks.org/adding-value-labels-on-a-matplotlib-bar-chart/
https://neptune.ai/blog/vectorization-techniques-in-nlp-guide
https://scikit-learn.org/0.21/documentation.html
https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:
# Standard library imports
import os
import re

# Misc imports
import demoji
import pipreqsnb

# ML imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# from imports
from deep_translator import GoogleTranslator
from langdetect import DetectorFactory, detect_langs
from tqdm.notebook import tqdm
from wordcloud import WordCloud
from textblob import TextBlob

# sklearn imports
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Generate requirements.txt
!pipreqsnb --force --use-local Source_code.ipynb

# Remove version numbers and duplicates
name = 'requirements.txt'

with open(name, 'r') as file:
    lines = file.readlines()

# Keep only the distribution name of every requirement. `partition` (unlike
# `split` followed by tuple unpacking) tolerates lines that carry no '==' pin,
# and blank lines are skipped instead of being written back as empty names.
packages = set()

for line in lines:
    packageName = line.partition('==')[0].strip()
    if packageName:
        packages.add(packageName)

# Write the names sorted so the file content is identical between runs
# (iteration order of a set of strings is not stable across interpreter runs).
with open(name, 'w') as file:
    for packageName in sorted(packages):
        file.write(f'{packageName}\n')

In [None]:
# Import datasets
# Both MediaEval files are read with the same parser options, so the options
# are declared once and shared by the two calls.
datasetReadOptions = dict(sep="\t", lineterminator='\n', skiprows=(0), header=(0))

trainingData = pd.read_table("mediaeval-2015-trainingset.txt", **datasetReadOptions)
testingData = pd.read_table("mediaeval-2015-testset.txt", **datasetReadOptions)

# Data Analysis


In [None]:
# Preview the first rows of the training set
trainingData.head()

In [None]:
# Preview the first rows of the testing set
testingData.head()

In [None]:
# Print the column/dtype/non-null summary of each dataset under its own banner
datasetBanners = (
    ("<----------------- TRAINING ----------------->\n", trainingData),
    ("\n\n<----------------- TESTING ----------------->\n", testingData),
)

for banner, frame in datasetBanners:
    print(banner)
    frame.info()

In [None]:
# Summary statistics of the training tweet texts
trainingData['tweetText'].describe()

In [None]:
# Summary statistics of the testing tweet texts
testingData['tweetText'].describe()

In [None]:
# Count the training labels; 'humor' tweets are counted together with 'fake'
trainingLabels = trainingData['label']

realCount = (trainingLabels == 'real').sum()
fakeCount = trainingLabels.isin(['fake', 'humor']).sum()

print("Real count:", realCount)
print("Fake count:", fakeCount)

# Conclusions

Columns: tweetId, tweetText, userId, imageId(s), username, timestamp, label<br>

### Data quality:
1. Tweets end in URLs
2. Emojis and special characters
3. Diverse languages (mixed languages)
4. Text with typos
5. All features non-null and strings

### Data bias:
1. 1901 duplicates in training
2. 49 duplicates in testing
3. 65.5% fake

### Training Data:
Size: 14277 entries<br>
Unique tweetText: 12376 entries<br>
Real count: 4921<br>
Fake count: 9356

### Testing Data:
Size: 3755 entries<br>
Unique tweetText: 3706 entries


# Data Visualisation


## Fake vs Real Count

In [None]:
# Map 'humor' label to 'fake'
trainingData['label'] = trainingData['label'].replace('humor', 'fake')

# savefig does not create missing directories, so make sure the output folder
# exists before the first figure of the notebook is written.
os.makedirs('figures', exist_ok=True)

# Plotting the tweet counts by label for the training dataset
fig, ax = plt.subplots(figsize=(8, 5))

# rot=0 keeps the label names horizontal
trainingData['label'].value_counts().plot(kind='bar', color=['blue', 'red'], rot=0, ax=ax)
ax.set_title('Tweet Counts by Label (Training Data)')
ax.set_xlabel('Label')
ax.set_ylabel('Tweet Count')

fig.savefig('figures/CountsByLabelTraining.png', bbox_inches='tight')
plt.show()

In [None]:
# Bar chart of how many testing tweets carry each label
testingLabelCounts = testingData['label'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
testingLabelCounts.plot(kind='bar', color=['blue', 'red', 'green'], rot=0, ax=ax)
ax.set(
    title='Tweet Counts by Label (Testing Data)',
    xlabel='Label',
    ylabel='Tweet Count',
)

fig.savefig('figures/CountsByLabelTesting.png', bbox_inches='tight')
plt.show()

## Tweet Length

In [None]:
# Store every tweet's character count in a new 'tweetLength' column
for frame in (trainingData, testingData):
    frame['tweetLength'] = frame['tweetText'].apply(len)

# Histogram of the training tweet lengths
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(trainingData['tweetLength'], bins=5000, color='black')
ax.set(
    title='Distribution of Tweet Lengths (Training Data)',
    xlabel='Tweet Length',
    ylabel='Frequency',
)
ax.grid(True)

fig.savefig('figures/TweetLength.png', bbox_inches='tight')
plt.show()

In [None]:
# Keep only the tweets of at most 150 characters
isRegularLength = trainingData['tweetLength'] <= 150
trainingDataFiltered = trainingData[isRegularLength]

# Histogram of the remaining tweet lengths
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(trainingDataFiltered['tweetLength'], bins=50, color='skyblue', edgecolor='black')
ax.set(
    title='Distribution of Tweet Lengths (Training Data)',
    xlabel='Tweet Length',
    ylabel='Frequency',
)
ax.grid(True)

fig.savefig('figures/TweetLength150.png', bbox_inches='tight')
plt.show()

In [None]:
# Take the first training tweet that is longer than 500 characters
longTweet = trainingData[trainingData['tweetLength'] > 500].iloc[:1]

# Show its id and its complete text
for tweetId, tweetText in zip(longTweet['tweetId'], longTweet['tweetText']):
    print(f"Tweet ID: {tweetId}")
    print(f"Tweet Text:\n{tweetText}\n")

## Language Analysis

In [None]:
# Clean text
def cleanText(text):
    """Strip escaped-unicode sequences, non-ASCII characters, links, hashtags and mentions.

    Parameters:
        text (str): Raw tweet text.

    Returns:
        str: The text with the listed elements removed. Surrounding whitespace
        is left untouched, so removed tokens may leave consecutive spaces.
    """
    # Remove emojis that appear as literal "\uXXXX" escapes. An escape is exactly
    # four hex digits; an open-ended `+` would also swallow ordinary text that
    # happens to start with hex characters (e.g. "\u2019decade" -> "").
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)

    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Remove hashtags
    text = re.sub(r'#\S+', '', text)

    # Remove tags
    text = re.sub(r'@\S+', '', text)

    return text

# Add the cleaned text as a 'tweetTextClean' column on both datasets
for frame in (trainingData, testingData):
    frame['tweetTextClean'] = frame['tweetText'].apply(cleanText)

In [None]:
# Language detection function using langdetect from https://pypi.org/project/langdetect/
def languageDetection(text):
    """Return the first language candidate reported by langdetect and its probability.

    Parameters:
        text: Text to analyse.

    Returns:
        tuple: (language code, probability), or ('err', 0.0) when the text is
        shorter than 3 characters, when no candidate is reported, or when any
        exception is raised while detecting.
    """
    failure = ('err', 0.0)

    try:
        # If text is too short mark it as error
        if len(text) < 3:
            return failure

        # Only the first reported candidate is used
        firstCandidate = next(iter(detect_langs(text)), None)
        if firstCandidate is not None:
            return firstCandidate.lang, firstCandidate.prob
    except Exception:
        return failure

    # Error if no language detected
    return failure
    
# langdetect is non-deterministic unless its seed is fixed (see the project page
# linked above); without this, languages/confidences - and therefore the later
# confidence-based filtering - can change between runs.
DetectorFactory.seed = 0

def detectLanguages(frame, description):
    """Add 'language' and 'confidence' columns to `frame`, detected from 'tweetTextClean'.

    Parameters:
        frame (pd.DataFrame): Dataset holding a 'tweetTextClean' column; it is
            modified in place.
        description (str): Label shown on the progress bar.
    """
    tqdm.pandas(desc=description)
    detected = frame['tweetTextClean'].progress_apply(languageDetection)

    # Expand the (language, confidence) tuples once instead of building one
    # pd.Series per row, and store the confidence as a real float column
    # rather than as generic Python objects.
    detectedFrame = pd.DataFrame(detected.tolist(), index=frame.index, columns=['language', 'confidence'])
    frame['language'] = detectedFrame['language']
    frame['confidence'] = detectedFrame['confidence'].astype(float)

detectLanguages(trainingData, "Language Detection Training")
detectLanguages(testingData, "Language Detection Testing")

In [None]:
# Bar chart of the number of training tweets per detected language
languageCounts = trainingData['language'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
languageCounts.plot(kind='bar', color='skyblue', rot=0, ax=ax)
ax.set(
    title='Language Distribution in Training Data',
    xlabel='Language',
    ylabel='Tweet Count',
)

fig.savefig('figures/LanguageDistribution.png', bbox_inches='tight')
plt.show()

In [None]:
# Bin edges 0.0, 0.1, ..., 1.0
bins = [edge / 10 for edge in range(11)]

# Assign every tweet to the confidence interval it falls in
for frame in (trainingData, testingData):
    frame['confidenceBin'] = pd.cut(frame['confidence'], bins=bins, include_lowest=True)

# Number of training tweets per interval, ordered by interval
confidenceCounts = trainingData['confidenceBin'].value_counts().sort_index()

print("Count of tweets for each confidence bin:")
print(confidenceCounts)

In [None]:
# Create a dictionary to store samples for each confidence bin
samplesByBin = {}

# Get up to 5 random texts from bin
def getSamples(group, sampleSize=5, randomState=42):
    """Return a reproducible random sample of `group`.

    Parameters:
        group (pd.DataFrame): Rows to sample from.
        sampleSize (int): Number of rows to draw (default 5).
        randomState (int): Seed passed to `DataFrame.sample`, so the same rows
            are shown every time the notebook is re-run.

    Returns:
        pd.DataFrame: `sampleSize` sampled rows, or `group` itself when it
        holds fewer than `sampleSize` rows.
    """
    if len(group) >= sampleSize:
        return group.sample(sampleSize, random_state=randomState)
    return group

# Collect the sampled rows of every confidence bin
samplesByBin.update({
    binValue: getSamples(binRows)
    for binValue, binRows in trainingData.groupby('confidenceBin')
})

# Display samples
for binValue, samples in samplesByBin.items():
    print(f"\nConfidence Bin: {binValue}")
    for language, cleanedText in zip(samples['language'], samples['tweetTextClean']):
        print(f"Language: {language}, Text: {cleanedText}")

## Timestamp Analysis

In [None]:
# Format of the raw timestamp strings once every space has been removed
timestampFormat = "%a%b%d%H:%M:%S+0000%Y"

def parseTimestamp(rawTimestamp):
    """Convert one raw timestamp string to a pandas Timestamp (spaces are stripped first)."""
    return pd.to_datetime(rawTimestamp.replace(' ', ''), format=timestampFormat)

# Convert the timestamp to datetime, then extract components into columns
tqdm.pandas(desc="Converting timestamps")

for frame in (trainingData, testingData):
    frame['timestamp'] = frame['timestamp'].progress_apply(parseTimestamp)

    for component in ('year', 'month', 'day', 'hour'):
        frame[component] = getattr(frame['timestamp'].dt, component)

In [None]:
# Create one panel per timestamp component, comparing fake and real tweets.
# The counts of both labels are put into a single table indexed by the
# component value before plotting: pandas places bars by row position, so
# drawing two independently indexed bar plots on the same axes misaligns the
# bars whenever the two labels do not cover exactly the same values.
timestampComponents = ['year', 'month', 'day', 'hour']
labelColors = {'fake': 'lightcoral', 'real': 'skyblue'}

fig, axes = plt.subplots(2, 2, figsize=(16, 8))

for ax, component in zip(axes.flat, timestampComponents):
    componentCounts = (
        trainingData
        .groupby([component, 'label'])
        .size()
        .unstack(fill_value=0)
        # Fixed column order (fake, real); a label that never occurs gets zeros
        .reindex(columns=list(labelColors), fill_value=0)
    )

    componentCounts.plot(kind='bar', color=list(labelColors.values()), alpha=0.8, ax=ax)
    ax.set_title(f'{component.capitalize()} Distribution')
    ax.legend(['Fake', 'Real'])

fig.tight_layout()

fig.savefig('figures/TimestampDistributionFakeVsReal.png', bbox_inches='tight')
plt.show()

## Text content variation

In [None]:
def joinCleanedTweets(label):
    """Clean and concatenate (space separated) all training tweets carrying `label`."""
    labelTweets = trainingData[trainingData['label'] == label]['tweetTextClean']
    return ' '.join(labelTweets.apply(cleanText))

# Clean text
fakeCleaned = joinCleanedTweets('fake')
realCleaned = joinCleanedTweets('real')

# Generate word clouds
wordcloudFake = WordCloud(width=800, height=400, background_color='white').generate(fakeCleaned)
wordcloudReal = WordCloud(width=800, height=400, background_color='white').generate(realCleaned)

# Show both clouds side by side
fig, (fakeAx, realAx) = plt.subplots(1, 2, figsize=(12, 6))

for ax, cloud, title in (
    (fakeAx, wordcloudFake, 'Word Cloud for Fake Tweets'),
    (realAx, wordcloudReal, 'Word Cloud for Real Tweets'),
):
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title)
    ax.axis('off')

fig.savefig('figures/WordClouds.png', bbox_inches='tight')
plt.show()

## URL Analysis

In [None]:
# Pattern of a URL with an explicit http(s) scheme
urlPattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+])+')

def flagProperUrl(text):
    """Return 1 when `text` contains a URL once its backslashes are removed, else 0."""
    unescaped = text.replace('\\', '')
    return 0 if urlPattern.search(unescaped) is None else 1

# Add column showing presence of URL
for frame in (trainingData, testingData):
    frame['containsProperUrl'] = frame['tweetText'].apply(flagProperUrl)

urlCounts = trainingData['containsProperUrl'].value_counts()
print("Tweet Counts with/without URLs:")
print("Contains URL: ", urlCounts.get(1, 0))
print("Does not contain URL: ", urlCounts.get(0, 0))

In [None]:
# First five training tweets that do not contain a proper URL
tweetsWithoutUrl = trainingData[trainingData['containsProperUrl'] == 0].iloc[:5]

separator = "=" * 50

for tweetId, username, tweetTimestamp, tweetText in zip(
    tweetsWithoutUrl['tweetId'],
    tweetsWithoutUrl['username'],
    tweetsWithoutUrl['timestamp'],
    tweetsWithoutUrl['tweetText'],
):
    print(f"Tweet ID: {tweetId}")
    print(f"Username: {username}")
    print(f"Timestamp: {tweetTimestamp}")
    print(f"Tweet Text: {tweetText}")
    print(separator)

## UserID Analysis

In [None]:
# Number of training tweets written by each user
tweetCountsByUser = trainingData['userId'].value_counts()

# Categorize users based on tweet counts. The intervals are (0, 1], (1, 2],
# (2, 3] and (3, inf), so the last category holds users with four or more
# tweets and is labelled accordingly.
userBins = pd.cut(
    tweetCountsByUser,
    bins=[0, 1, 2, 3, float('inf')],
    labels=['1 tweet', '2 tweets', '3 tweets', '4+ tweets'],
)

# Count users in each category
userCountsInBins = userBins.value_counts()

plt.figure(figsize=(8, 6))
userCountsInBins.sort_index().plot(kind='bar', color='skyblue')
plt.title('User Distribution by Tweet Counts (Training Data)')
plt.xlabel('Number of Tweets')
plt.ylabel('Number of Users')

plt.savefig('figures/UserDistribution.png', bbox_inches='tight')
plt.show()

In [None]:
# Users that wrote at least four training tweets
isActiveUser = tweetCountsByUser >= 4
filteredUsers = tweetCountsByUser[isActiveUser]

# Share of those users among all users, in percent
percentageFilteredUsers = (len(filteredUsers) / len(tweetCountsByUser)) * 100

# Number of users per tweet count, annotated with the share computed above
fig, ax = plt.subplots(figsize=(8, 6))
filteredUsers.value_counts().sort_index().plot(kind='bar', color='skyblue', ax=ax)
ax.set(
    title='Distribution of Users with 4+ Tweets (Training Data)',
    xlabel='Number of Tweets',
    ylabel='Number of Users',
)
ax.text(0.4, 0.9, f'{percentageFilteredUsers:.1f}% of Total Users', transform=ax.transAxes, fontsize=10, color='red')

fig.savefig('figures/UserDistribution4Plus.png', bbox_inches='tight')
plt.show()

## ImageID Analysis

In [None]:
def countTweetsPerImage(frame):
    """Count tweets per (short image id, label) and append a per-image 'total' column."""
    counts = frame.groupby(['imageId_short', 'label']).size().unstack(fill_value=0)
    counts['total'] = counts.sum(axis=1)
    return counts

def plotImageCounts(sortedCounts, title, outputPath):
    """Draw the per-label counts as stacked bars, save the figure and show it."""
    sortedCounts.drop('total', axis=1).plot(kind='bar', stacked=True, figsize=(12, 6))
    plt.title(title)
    plt.xlabel('ImageId')
    plt.ylabel('Tweet Count')
    plt.legend(title='Label', loc='upper right')

    plt.savefig(outputPath, bbox_inches='tight')
    plt.show()

for frame in (trainingData, testingData):
    # The first five characters of the image id are used as the grouping key
    frame['imageId_short'] = frame['imageId(s)'].str[:5]

    # Map humor label to fake
    frame['label'] = frame['label'].replace('humor', 'fake')

# Count real and fake, with the total tweet count for each imageId
imageCounts = countTweetsPerImage(trainingData)
imageCountsTesting = countTweetsPerImage(testingData)

# Sort
imageCountsSorted = imageCounts.sort_values(by='total', ascending=False)
imageCountsSortedTesting = imageCountsTesting.sort_values(by='total', ascending=False)

# Plot the counts testing
plotImageCounts(
    imageCountsSortedTesting,
    'Tweet Counts by ImageId and Label (Testing Data)',
    'figures/CountsImageIdTesting.png',
)

# Plot the counts training
plotImageCounts(
    imageCountsSorted,
    'Tweet Counts by ImageId and Label (Training Data)',
    'figures/CountsImageId.png',
)

# Exclude the first row
imageCountsSortedNoFirst = imageCountsSorted.iloc[1:]

# Plot the counts training minus first
plotImageCounts(
    imageCountsSortedNoFirst,
    'Tweet Counts by ImageId and Label (Excluding Hurricane Sandy)',
    'figures/CountsImageIdNoFirst.png',
)

In [None]:
# Number of distinct (non-null) values in the full image id column
distinctImageIds = trainingData['imageId(s)'].dropna().unique()
uniqueImageIds = len(distinctImageIds)

print("Number of unique image IDs (Training Data):", uniqueImageIds)

# Pre-processing

In [None]:
# Show one training row with all the columns added during the analysis
trainingData.head(1)

In [None]:
# Show one testing row with all the columns added during the analysis
testingData.head(1)

## Remove rows with incorrectly parsed data and low quality data

In [None]:
# Rows kept for training must satisfy all of the following:
keepMask = (
    # Remove malformed tweets with length greater than 150
    (trainingData['tweetLength'] <= 150)
    # Remove tweets with improper URL
    & (trainingData['containsProperUrl'] == 1)
    # Remove tweets whose language detection confidence is not above 0.7
    & (trainingData['confidence'] > 0.7)
    # Remove tweets with failed language detection
    & (trainingData['language'] != 'err')
)

# Explicit copies: the translation step adds columns to these frames, which
# would otherwise raise SettingWithCopyWarning on the filtered slice and
# silently modify testingData through the alias.
filteredTrainingData = trainingData.loc[keepMask].copy()

# No filtering for testingData
filteredTestingData = testingData.copy()

## Translate text (clean)

In [None]:
# Save a CSV file
trainingFileName = "dataset/translated_training_data.csv"
testingFileName = "dataset/translated_testing_data.csv"

# Progress bar shared by runTranslation (which creates and closes it) and
# englishTranslate (which advances it once per row).
progress = None

def englishTranslate(row):
    """Translate the cleaned text of one row to English.

    Parameters:
        row: Row exposing 'tweetTextClean' and 'language'.

    Returns:
        The text unchanged when the language is 'en', otherwise its
        translation, or None when both translation attempts raise.
    """
    text = row['tweetTextClean']
    language = row['language']

    if language == 'en':
        translated = text  # Keep text if in English
    else:
        try:
            translated = GoogleTranslator(source=language, target='en').translate(text)
        except Exception:
            try:
                # Try translating with google language detection
                translated = GoogleTranslator(source='auto', target='en').translate(text)
            except Exception:
                # Best effort: failed rows are dropped by runTranslation
                translated = None

    # Update progress bar (absent when the function is called on its own)
    if progress is not None:
        progress.update(1)

    return translated

def runTranslation(dataset, name, progressType):
    """Translate every row of `dataset` and save the successfully translated rows as CSV.

    The 'translatedText' column is written onto `dataset` itself, so the
    caller's frame gains that column; rows whose translation failed are only
    excluded from the saved file.

    Parameters:
        dataset (pd.DataFrame): Frame holding 'tweetTextClean' and 'language'.
        name (str): Path of the CSV file to write.
        progressType (str): Prefix of the progress bar description.
    """
    global progress

    # Initialize progress bar
    progress = tqdm(total=len(dataset), desc=progressType + " Progress", position=0, leave=True)

    try:
        # Apply the translation function
        dataset['translatedText'] = dataset.apply(englishTranslate, axis=1)
    finally:
        # Close the bar even when the translation is interrupted
        progress.close()
        progress = None

    # Remove entries with translation error
    translatedRows = dataset.dropna(subset=['translatedText'])

    # to_csv does not create missing directories
    outputDirectory = os.path.dirname(name)
    if outputDirectory:
        os.makedirs(outputDirectory, exist_ok=True)

    # Save the result file
    translatedRows.to_csv(name, index=False)

# Translation is slow, so it only runs when the cached file is missing
if not os.path.exists(trainingFileName):
    runTranslation(filteredTrainingData, trainingFileName, "Training")

if not os.path.exists(testingFileName):
    runTranslation(filteredTestingData, testingFileName, "Testing")

translatedTrainingData = pd.read_csv(trainingFileName)
translatedTestingData = pd.read_csv(testingFileName)

## Translate text (original)

In [None]:
# Save the DataFrame to a CSV file
trainingFileName = "dataset/original_translated_training_data.csv"
testingFileName = "dataset/original_translated_testing_data.csv"

# Progress bar shared by runTranslation (which creates and closes it) and
# englishTranslate (which advances it once per row).
progress = None

# NOTE(review): the two functions below redefine (and shadow) the same-named
# helpers of the clean-text translation cell; this version works on the
# original 'tweetText' column.
# NOTE(review): the CSV written here is the one the later column selection
# reads, and that selection expects a 'translatedText' column as well. The
# column is only present when the clean-text translation ran on the same
# frames earlier in the session - confirm before deleting cached files.
def englishTranslate(row):
    """Translate the original text of one row to English.

    Parameters:
        row: Row exposing 'tweetText' and 'language'.

    Returns:
        The text unchanged when the language is 'en', otherwise its
        translation, or None when both translation attempts raise.
    """
    text = row['tweetText']
    language = row['language']

    if language == 'en':
        translated = text  # Keep text if in English
    else:
        try:
            translated = GoogleTranslator(source=language, target='en').translate(text)
        except Exception:
            try:
                # Try translating with google language detection
                translated = GoogleTranslator(source='auto', target='en').translate(text)
            except Exception:
                # Best effort: failed rows are dropped by runTranslation
                translated = None

    # Update progress bar (absent when the function is called on its own)
    if progress is not None:
        progress.update(1)

    return translated

def runTranslation(dataset, name, progressType):
    """Translate every row of `dataset` and save the successfully translated rows as CSV.

    The 'originalTranslatedText' column is written onto `dataset` itself; rows
    whose translation failed are only excluded from the saved file.

    Parameters:
        dataset (pd.DataFrame): Frame holding 'tweetText' and 'language'.
        name (str): Path of the CSV file to write.
        progressType (str): Prefix of the progress bar description.
    """
    global progress

    # Initialize progress bar
    progress = tqdm(total=len(dataset), desc=progressType + " Progress", position=0, leave=True)

    try:
        # Apply the translation function
        dataset['originalTranslatedText'] = dataset.apply(englishTranslate, axis=1)
    finally:
        # Close the bar even when the translation is interrupted
        progress.close()
        progress = None

    # Remove entries with translation error
    translatedRows = dataset.dropna(subset=['originalTranslatedText'])

    # to_csv does not create missing directories
    outputDirectory = os.path.dirname(name)
    if outputDirectory:
        os.makedirs(outputDirectory, exist_ok=True)

    # Save file after translation
    translatedRows.to_csv(name, index=False)

# Translation is slow, so it only runs when the cached file is missing
if not os.path.exists(trainingFileName):
    runTranslation(filteredTrainingData, trainingFileName, "Training")

if not os.path.exists(testingFileName):
    runTranslation(filteredTestingData, testingFileName, "Testing")

translatedTrainingData = pd.read_csv(trainingFileName)
translatedTestingData = pd.read_csv(testingFileName)

## Feature engineering - symbol counts

In [None]:
# Function to count features for each row
def countInText(text):
    """Count the symbol-based features of one tweet.

    Parameters:
        text (str): Tweet text.

    Returns:
        pd.Series: 'hashtagsCount', 'emojisCount', 'mentionsCount',
        'exclamationMarksCount', 'questionMarksCount' and 'retweet'
        (1 when the text contains the standalone token "RT", else 0).
    """
    # "RT" must be a whole word: a plain substring search would also flag
    # words such as "SMART" or "CONCERT" as retweets.
    isRetweet = re.search(r'\bRT\b', text) is not None

    return pd.Series({
        'hashtagsCount': len(re.findall(r'#\w+', text)),
        # Every emoji occurrence found by demoji counts once
        'emojisCount': len(demoji.findall_list(text)),
        'mentionsCount': len(re.findall(r'@\w+', text)),
        'exclamationMarksCount': text.count('!'),
        'questionMarksCount': text.count('?'),
        'retweet': 1 if isRetweet else 0
    })

def countFeatures(frame, description):
    """Run countInText on every 'tweetText' of `frame`, showing a progress bar."""
    tqdm.pandas(desc=description, position=0, leave=True)
    return frame['tweetText'].progress_apply(countInText)

featureCountsTraining = countFeatures(translatedTrainingData, "Processing Training")
featureCountsTesting = countFeatures(translatedTestingData, "Processing Testing")

# Append the count columns to the right of the existing columns
translatedTrainingData = pd.concat([translatedTrainingData, featureCountsTraining], axis=1)
translatedTestingData = pd.concat([translatedTestingData, featureCountsTesting], axis=1)

In [None]:
# Display a sample of up to 5 tweets with emoji counts
tweetsEmojis = translatedTrainingData[translatedTrainingData['emojisCount'] > 0]

# Never request more rows than available (sample() raises otherwise) and fix
# the seed so the same tweets are shown on every run.
sampleTweets = tweetsEmojis.sample(min(5, len(tweetsEmojis)), random_state=42)

for index, row in sampleTweets.iterrows():
    print(f"Tweet: {row['tweetText']}")
    print(f"Emoji Count: {row['emojisCount']}")
    print("-" * 40)

## Feature engineering - sentiment analysis

In [None]:
# Function to perform sentiment analysis
def sentimentAnalysis(text):
    """Compute the TextBlob sentiment of one text.

    Parameters:
        text: Text to analyse. Non-string values (e.g. the NaN that read_csv
            produces for an empty cell) are analysed as an empty string
            instead of raising.

    Returns:
        pd.Series: 'sentimentPolarity' and 'sentimentSubjectivity' exactly as
        reported by TextBlob (the polarity is not shifted or rescaled).
    """
    if not isinstance(text, str):
        text = ''

    # Evaluate the sentiment once and reuse it for both values
    sentiment = TextBlob(text).sentiment

    return pd.Series({
        'sentimentPolarity': sentiment.polarity,
        'sentimentSubjectivity': sentiment.subjectivity
    })

def analyseSentiment(frame, description):
    """Run sentimentAnalysis on every 'tweetTextClean' of `frame`, showing a progress bar."""
    tqdm.pandas(desc=description)
    return frame['tweetTextClean'].progress_apply(sentimentAnalysis)

sentimentAnalysisTraining = analyseSentiment(translatedTrainingData, "Sentiment Analysis Training")
sentimentAnalysisTesting = analyseSentiment(translatedTestingData, "Sentiment Analysis Testing")

# Append the sentiment columns to the right of the existing columns
translatedTrainingData = pd.concat([translatedTrainingData, sentimentAnalysisTraining], axis=1)
translatedTestingData = pd.concat([translatedTestingData, sentimentAnalysisTesting], axis=1)

## Select relevant columns

In [None]:
# Columns kept for modelling: label, the three text variants and the engineered features
selectColumns = [
    'label',
    'translatedText', 'originalTranslatedText', 'tweetText',
    'retweet', 'tweetLength',
    'year', 'month', 'day', 'hour',
    'hashtagsCount', 'emojisCount', 'mentionsCount',
    'exclamationMarksCount', 'questionMarksCount',
    'sentimentPolarity', 'sentimentSubjectivity',
]
processedTrainingData = translatedTrainingData[selectColumns]
processedTestingData = translatedTestingData[selectColumns]

processedTrainingFile = "dataset/processed_training_data.csv"
processedTestingFile = "dataset/processed_testing_data.csv"

# An already existing file is left untouched
for processedFrame, processedFile in (
    (processedTrainingData, processedTrainingFile),
    (processedTestingData, processedTestingFile),
):
    if os.path.exists(processedFile):
        continue
    processedFrame.to_csv(processedFile, index=False)

# Model Training

In [None]:
# Report the label balance of the processed training and testing sets
reportedSets = (("Training", processedTrainingData), ("Testing", processedTestingData))

for position, (heading, frame) in enumerate(reportedSets):
    # Blank line between the two reports
    if position > 0:
        print()

    labels = frame['label']
    print(heading)
    print("Real count:", (labels == 'real').sum())
    print("Fake count:", labels.isin(['fake', 'humor']).sum())
    print("Total count:", len(frame))

In [None]:
# Preview the processed training features
processedTrainingData.head()

In [None]:
# Preview the processed testing features
processedTestingData.head()

## Comparison pipeline for classifiers with different vectorizers

In [None]:
# Vectorizers https://neptune.ai/blog/vectorization-techniques-in-nlp-guide

# Get text feature
XText = processedTrainingData['tweetText']
XTextTest = processedTestingData['tweetText']

# Labels, converted to numerical values (fake -> 0, real -> 1)
labelEncoding = {'fake': 0, 'real': 1}
YTrain = processedTrainingData['label'].map(labelEncoding)
YTest = processedTestingData['label'].map(labelEncoding)

# Calculate class weights (classes is documented as an ndarray, so pass one)
classWeights = compute_class_weight('balanced', classes=np.array([0, 1]), y=YTrain)

# MultinomialNB's class_prior is a vector of prior probabilities, so the
# balanced weights are normalised to sum to 1 before being used as priors.
classPriors = classWeights / classWeights.sum()

# Set up classifiers. The stochastic estimators get a fixed random_state so
# the scores are reproducible between runs.
classifiers = {
    'LinearSVC': LinearSVC(class_weight='balanced', C=1.0, dual=True, random_state=42),
    'MultinomialNB': MultinomialNB(alpha=0.01, class_prior=classPriors),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'LogisticRegression': LogisticRegression(class_weight='balanced', C=1.0, solver='liblinear', random_state=42),
}

# Set up vectorizers
vectorizers = {
    'CountVectorizer': CountVectorizer(max_features=1000),
    'TfidfVectorizer': TfidfVectorizer(max_features=1000),
    'HashingVectorizer': HashingVectorizer(n_features=1000, alternate_sign=False)
}

# Scores of every classifier/vectorizer pair, keyed '<classifier>_<vectorizer>'
results = {}

for classifierName, classifier in classifiers.items():
    for methodName, vectorizer in vectorizers.items():
        # Build pipeline (the shared instances are refitted for every pair)
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])

        # Train classifier; the bar only marks start and end of the fit
        with tqdm(total=100, desc=f'Training {classifierName}_{methodName}', position=0, leave=True) as pbar:
            pipeline.fit(XText, YTrain)
            pbar.update(100)

        # Make predictions on the test set
        YPred = pipeline.predict(XTextTest)

        # Evaluate the performance (F1 of the positive class, i.e. 'real')
        results[f'{classifierName}_{methodName}'] = {
            'f1_score': f1_score(YTest, YPred),
            'confusion_matrix': confusion_matrix(YTest, YPred),
            'classification_report': classification_report(YTest, YPred)
        }

# Extract F1 score values from results: one row per classifier, one column per vectorizer
f1Values = np.zeros((len(classifiers), len(vectorizers)))

for i, classifierName in enumerate(classifiers.keys()):
    for j, methodName in enumerate(vectorizers.keys()):
        key = f'{classifierName}_{methodName}'
        f1Values[i, j] = results[key]['f1_score']

fig, ax = plt.subplots(figsize=(12, 8))

width = 0.2
positions = np.arange(len(vectorizers))

# One group of bars per vectorizer, one bar per classifier inside the group
for i, classifierName in enumerate(classifiers.keys()):
    bars = ax.bar(positions + i * width, f1Values[i, :], width, label=classifierName)
    for bar, value in zip(bars, f1Values[i, :]):
        height = bar.get_height()
        # Value label centred above the bar
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{value:.2f}', ha='center', va='bottom')

# Set axis labels and legend; ticks sit in the middle of each group
ax.set_xticks(positions + (len(classifiers) - 1) * width / 2)
ax.set_xticklabels(vectorizers.keys())
ax.set_xlabel('Vectorization Method')
ax.set_ylabel('F1 Score')

plt.title('F1 Score of Classifiers with Different Vectorization Methods')
plt.legend(title='Classifier')

plt.savefig('figures/ComparisonPipeline.png', bbox_inches='tight')
plt.show()

## LinearSVC with optimized Vectorizers

In [None]:
# Split training and validation (80% / 20%, fixed seed)
xTrain, xVal, yTrain, yVal = train_test_split(
    processedTrainingData['tweetText'],
    processedTrainingData['label'],
    test_size=0.2,
    random_state=42
)

# Testing
xTextTest = processedTestingData['tweetText']
yTest = processedTestingData['label']

# Convert labels to numerical values
yTrain = yTrain.map({'fake': 0, 'real': 1})
yVal = yVal.map({'fake': 0, 'real': 1})
yTest = yTest.map({'fake': 0, 'real': 1})

# Calculate class weights (classes is documented as an ndarray, so pass one)
classWeights = compute_class_weight('balanced', classes=np.array([0, 1]), y=yTrain)

# Set up vectorizers
vectorizers = {
    'CountVectorizer': CountVectorizer(max_features=4000, stop_words='english'),
    'TfidfVectorizer': TfidfVectorizer(stop_words='english'),
}

# Set up LinearSVC classifier; random_state makes the fit reproducible
linearSvcClassifier = LinearSVC(class_weight='balanced', C=1.0, dual=True, random_state=42)

# Results dictionary, keyed 'LinearSVC_<vectorizer>'
results = {}

for methodName, vectorizer in vectorizers.items():
    # Build pipeline (the shared classifier is refitted for every vectorizer)
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', linearSvcClassifier),
    ])

    # Train classifier
    pipeline.fit(xTrain, yTrain)

    # Make predictions on the validation set
    yValPred = pipeline.predict(xVal)

    # Evaluate the performance on the validation set
    f1ScoreVal = f1_score(yVal, yValPred, average='weighted')
    confusionMatVal = confusion_matrix(yVal, yValPred)
    classificationRepVal = classification_report(yVal, yValPred)

    # Make predictions on the test set
    yTestPred = pipeline.predict(xTextTest)

    # Evaluate the performance on the test set
    f1ScoreTest = f1_score(yTest, yTestPred, average='weighted')
    confusionMatTest = confusion_matrix(yTest, yTestPred)
    classificationRepTest = classification_report(yTest, yTestPred)

    results[f'LinearSVC_{methodName}'] = {
        'f1ScoreVal': f1ScoreVal,
        'confusionMatrixVal': confusionMatVal,
        'classificationReportVal': classificationRepVal,
        'f1ScoreTest': f1ScoreTest,
        'confusionMatrixTest': confusionMatTest,
        'classificationReportTest': classificationRepTest,
    }

# Print the validation and test reports of every vectorizer
for key, value in results.items():
    print(f"{key}:\nValidation F1 Score: {value['f1ScoreVal']}\n{value['classificationReportVal']}\n")
    print(f"Test F1 Score: {value['f1ScoreTest']}\n{value['classificationReportTest']}\n")

## LinearSVC with additional features

In [None]:
# Define numeric features
numericFeatures = ['retweet', 'hashtagsCount',
                     'emojisCount', 'mentionsCount', 'exclamationMarksCount', 'questionMarksCount',
                     'sentimentPolarity', 'sentimentSubjectivity']

# Split training and validation (80% / 20%, fixed seed); the inputs hold the
# tweet text together with the numeric features
xTrain, xVal, yTrain, yVal = train_test_split(
    processedTrainingData[['tweetText'] + numericFeatures],
    processedTrainingData['label'],
    test_size=0.2,
    random_state=42
)

# Testing
xTextTest = processedTestingData[['tweetText'] + numericFeatures]
yTest = processedTestingData['label']

# Convert labels to numerical values
yTrain = yTrain.map({'fake': 0, 'real': 1})
yVal = yVal.map({'fake': 0, 'real': 1})
yTest = yTest.map({'fake': 0, 'real': 1})

# Calculate class weights (classes is documented as an ndarray, so pass one)
classWeights = compute_class_weight('balanced', classes=np.array([0, 1]), y=yTrain)

# Set up vectorizers
vectorizers = {
    'countVectorizer': CountVectorizer(max_features=4000, stop_words='english'),
    'tfidfVectorizer': TfidfVectorizer(max_features=10000, stop_words='english'),
}

# Set up LinearSVC classifier; random_state makes the fit reproducible
linearSvcClassifier = LinearSVC(class_weight='balanced', C=1.0, dual=True, max_iter=100000, random_state=42)

# Transformer for selecting numeric features
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns out of its input.

    Nothing is learned during fitting; `transform` simply indexes the input
    with the `columns` value given at construction time.
    """

    def __init__(self, columns):
        # Kept under the same name as the constructor argument
        self.columns = columns

    def fit(self, x, y=None):
        """No-op fit: both arguments are ignored and the transformer itself is returned."""
        return self

    def transform(self, x):
        """Return `x` indexed by the configured columns."""
        selected = x[self.columns]
        return selected

def scoreSplit(model, features, labels):
    """Predict `features` with `model` and score the predictions against `labels`.

    Returns a tuple of (predictions, F1 score, confusion matrix,
    classification report), computed in that order.
    """
    predictions = model.predict(features)
    f1 = f1_score(labels, predictions)
    confusionMat = confusion_matrix(labels, predictions)
    classificationRep = classification_report(labels, predictions)
    return predictions, f1, confusionMat, classificationRep


# Preprocessor: bag-of-words on the tweet text, standard scaling on the
# numeric columns; every other column is dropped
textStep = ('text', vectorizers['countVectorizer'], 'tweetText')
numericStep = ('numeric', StandardScaler(), numericFeatures)
preprocessor = ColumnTransformer(transformers=[textStep, numericStep], remainder='drop')

# Pipeline setup: preprocessing followed by the LinearSVC classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', linearSvcClassifier),
])

# 5-fold cross-validation on the training split, scored with F1
cvScores = cross_val_score(estimator=pipeline, X=xTrain, y=yTrain, scoring='f1', cv=5)

# Fit on the whole training split before scoring the held-out data
pipeline.fit(xTrain, yTrain)

# Validation split: predictions and metrics
yValPred, f1Val, confusionMatVal, classificationRepVal = scoreSplit(pipeline, xVal, yVal)

# Test split: predictions and metrics
yTestPred, f1Test, confusionMatTest, classificationRepTest = scoreSplit(pipeline, xTextTest, yTest)

print(f"Cross-Validation Scores: {cvScores}")
print(f"\nValidation F1 Score: {f1Val}\n{classificationRepVal}\n")
print(f"Test F1 Score: {f1Test}\n{classificationRepTest}\n")

## Naive Bayes with optimized TF-IDF

In [None]:
# Split training and validation (fixed random_state keeps the split reproducible)
xTrain, xVal, yTrain, yVal = train_test_split(
    processedTrainingData['tweetText'],
    processedTrainingData['label'],
    test_size=0.2,
    random_state=42
)

# Testing
xTextTest = processedTestingData['tweetText']
yTest = processedTestingData['label']

# Convert labels to numerical values
labelMapping = {'fake': 0, 'real': 1}
yTrain = yTrain.map(labelMapping)
yVal = yVal.map(labelMapping)
yTest = yTest.map(labelMapping)

# Series.map turns every label that is missing from labelMapping into NaN;
# fail here with a clear message instead of deep inside scikit-learn.
for splitName, splitLabels in (('training', yTrain), ('validation', yVal), ('testing', yTest)):
    if splitLabels.isna().any():
        raise ValueError(
            f"Unmapped label(s) in the {splitName} split; expected only {list(labelMapping)}"
        )

# Calculate class weights.
# `classes` is documented as an ndarray and recent scikit-learn releases
# reject a plain list, so pass an array explicitly.
classWeights = compute_class_weight('balanced', classes=np.array([0, 1]), y=yTrain)

# MultinomialNB documents class_prior as prior probabilities, so rescale the
# balanced weights to sum to 1. Only the ratio between the priors influences
# which class scores highest, so the class ranking is unaffected.
classPriors = classWeights / classWeights.sum()

# Set up vectorizers
vectorizers = {
    'tfidfVectorizer': TfidfVectorizer(max_features=1000, stop_words='english')
}

# Set up Naive Bayes
naiveBayesClassifier = MultinomialNB(alpha=0.1, class_prior=classPriors)

# Metrics applied to each split, in the order their values are stored
metricFns = (f1_score, confusion_matrix, classification_report)

# Collected metrics, keyed by classifier and vectorizer name
results = {}

for methodName, vectorizer in vectorizers.items():
    # Vectorizer followed by the Naive Bayes classifier
    pipeline = Pipeline(steps=[
        ('vectorizer', vectorizer),
        ('classifier', naiveBayesClassifier),
    ])

    # Fit on the training split
    pipeline.fit(xTrain, yTrain)

    # Validation split: predict, then compute F1, confusion matrix and report
    yValPred = pipeline.predict(xVal)
    f1Val, confusionMatVal, classificationRepVal = (
        metric(yVal, yValPred) for metric in metricFns
    )

    # Test split: same sequence as for validation
    yTestPred = pipeline.predict(xTextTest)
    f1Test, confusionMatTest, classificationRepTest = (
        metric(yTest, yTestPred) for metric in metricFns
    )

    resultKey = f'MultinomialNB_{methodName}'
    results[resultKey] = {
        'f1Val': f1Val,
        'confusionMatrixVal': confusionMatVal,
        'classificationReportVal': classificationRepVal,
        'f1Test': f1Test,
        'confusionMatrixTest': confusionMatTest,
        'classificationReportTest': classificationRepTest,
    }

# Print results
for modelName in results:
    modelMetrics = results[modelName]
    validationSummary = (
        f"{modelName}:\nValidation F1 Score: {modelMetrics['f1Val']}\n{modelMetrics['classificationReportVal']}\n"
    )
    testSummary = (
        f"Test F1 Score: {modelMetrics['f1Test']}\n{modelMetrics['classificationReportTest']}\n"
    )
    print(validationSummary)
    print(testSummary)

## Final Scores

In [None]:
# NOTE(review): these scores are hard-coded, presumably copied from the output
# of the cells above — confirm and update them after any re-run.
f1scores = {'LinearSVC': 0.8620505992010652, 'MultinomialNB': 0.7346807828039781}

fig, ax = plt.subplots(figsize=(8, 6))
# One colour per bar (two classifiers are plotted)
bars = ax.bar(list(f1scores.keys()), list(f1scores.values()), color=['blue', 'green'])

# Write each score, rounded to four decimals, just above its bar
for bar, value in zip(bars, f1scores.values()):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height(),
        f'{value:.4f}',
        ha='center',
        va='bottom',
    )

ax.set_xlabel('Classifiers with Vectorization Methods')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score of Classifiers')

# savefig does not create missing directories, so make sure the target exists
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/FinalResults.png', bbox_inches='tight')
plt.show()