In [1]:
import numpy as np
import pandas as pd
import warnings
import torch
import gdown

## Stacked Model
This notebook includes a combination of three factors -- credibility, political affiliation, and context veracity. Each step can be refered to its experimental notebook located in the notebook file from the root. The output contains a GridSearchCV and the best accuracy model and its parameters.

In [2]:
# Input is a Series/np.array
from transformers import BertTokenizer, BertModel, pipeline
def text_embedding(data):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    def get_bert_embeddings(data):
        tokens = tokenizer(data.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
        return embeddings

    batch_size = 128
    num_samples = len(data)
    num_batches = (num_samples + batch_size - 1) // batch_size

    embeddings_list = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        batch_data = data.iloc[start_idx:end_idx]
        batch_embeddings = get_bert_embeddings(batch_data)
        embeddings_list.append(batch_embeddings)

    embeddings = torch.cat(embeddings_list, dim=0).cpu().numpy()
    return embeddings

2023-12-11 20:43:49.071445: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Train credibility and reliability, for details, please refer to notebook/experimental_notebooks/credibility_reliability.ipynb
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from urllib.parse import urlparse
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
def get_username(instagram_url):
    parsed_url = urlparse(instagram_url)
    path_segments = parsed_url.path.strip('/').split('/')
    username = path_segments[-1]
    return username
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation and not char.isdigit()])
    tokens = nltk.word_tokenize(text)    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
# reliability.csv contains social media user information
insta_df = pd.read_csv('../data/reliability.csv').drop(columns = 'Unnamed: 0').drop_duplicates(subset='username', keep='last')
social_media = pd.read_csv('../data/credibility_with_insta_username.csv').drop(columns = 'Unnamed: 0')
cleaned_social_media = social_media[social_media['instagram_uri'].str.contains('instagram', na=False)].reset_index().drop(columns = 'index')
cleaned_social_media['username'] = cleaned_social_media['instagram_uri'].apply(get_username)
unique_df = cleaned_social_media[['username', 'source']].drop_duplicates(subset='source', keep='last')
merged_df = pd.merge(insta_df, unique_df, on='username', how='inner').reset_index().drop(columns = 'index')
clean_df = pd.read_csv('../data/credibility.csv', index_col=None).drop(columns = 'Unnamed: 0')
cred_reliability = pd.merge(merged_df, clean_df, on='source', how='inner').reset_index().drop(columns='index')
cred_reliability['followees_to_followers_ratio'] = cred_reliability['followees'] / cred_reliability['followers'] + 1e-10
cred_reliability = cred_reliability[['source', 'source_history', 'mediacount', 'followees_to_followers_ratio', 'is_verified', 'credibility_score']]
cred_reliability = cred_reliability.dropna(subset=['source_history'])
cred_reliability = cred_reliability.replace([np.inf, -np.inf], np.nan)
cred_reliability = cred_reliability.dropna(subset=['followees_to_followers_ratio'])
cred_reliability['is_verified'] = cred_reliability['is_verified'].apply(lambda x: 0 if x else 1)
cred_reliability['preprocessed_source'] = cred_reliability['source_history'].apply(preprocess_text)
X = cred_reliability['preprocessed_source']
X = text_embedding(X)
X = np.concatenate((X[:, :50], cred_reliability[['mediacount', 'followees_to_followers_ratio', 'is_verified']].values), axis=1)
y = cred_reliability['credibility_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Previously validated Random Forrest has the best performance
cred_reliability_model = RandomForestRegressor()
cred_reliability_model.fit(X_train, y_train)
y_pred = cred_reliability_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error for {cred_reliability_model.__class__.__name__}: {mse}')

Mean Squared Error for RandomForestRegressor: 8.200894296021652


In [4]:
# Political Affiliation, for details, please refer to notebook/experimental_notebooks/political_affiliation_binary_classification.ipynb
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

tweet_df = pd.read_csv('../data/tweets_dataset.csv')
tweet_df_clean = tweet_df.drop(columns = 'Handle')
tweet_df_clean['Party'] = tweet_df_clean['Party'].apply(lambda x: 0 if x == 'Democrat' else 1)
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].str.lower() 
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].str.replace(r'http\S+', '') 
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].str.replace(r'@\w+', '')
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].str.replace(r'#\w+', '')
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].str.replace(r'[^a-zA-Z\s]', '') 
stop_words = set(stopwords.words('english'))
tweet_df_clean['Tweet'] = tweet_df_clean['Tweet'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
X_train, X_test, y_train, y_test = train_test_split(tweet_df_clean['Tweet'], tweet_df_clean['Party'], test_size=0.2, random_state=42)
# Generate 1-gram to 3-gram to create more features
poli_vectorizer = CountVectorizer(ngram_range=(1, 3)) 
X_train_vectorized = poli_vectorizer.fit_transform(X_train)
X_test_vectorized = poli_vectorizer.transform(X_test)

# Previously validated MultinomialNB has the best performance
poli_affili_model = MultinomialNB()
poli_affili_model.fit(X_train_vectorized, y_train)
y_pred = poli_affili_model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8177770067083044


In [6]:
# Context Veracity
# For details about this preprocessed dataset, please refer to notebook/experimental_notebooks/context_veracity.ipynb
gdown.download(f"https://drive.google.com/uc?id=16ohLi5nbC0yYk6oNou8YArLQY2cxnQWm", "../data/context_shift_score.csv", quiet=True)
shift_df = pd.read_csv('../data/context_shift_score.csv').drop(columns = 'Unnamed: 0')
def calculate_contextual_drift(row):
    a = 0.4  # coefficient for topic_drift
    b = 0.4  # coefficient for sentiment_drift
    c = 0.1  # coefficient for ner_shift_count
    d = 0.1  # constant term
    return a * row['topic_drift'] + b * row['sentiment_drift'] + c * row['ner_shift_count'] + d
shift_df['contextual_drift'] = shift_df.apply(calculate_contextual_drift, axis=1)

In [7]:
# Combining the models and dataframe, predict on politifact
cred_reliability_no_label = cred_reliability.drop(columns = 'credibility_score')
merged_df = pd.merge(shift_df, cred_reliability_no_label, left_on='media', right_on='source', how='inner').reset_index().drop(columns='index')
label_mapping = {
    'false': 0,
    'barely-true': 1,
    'full-flop': 2,
    'half-flip': 3,
    'half-true': 4,
    'mostly-true': 5,
    'no-flip': 6,
    'pants-fire': 7,
    'true': 8
}
def convert_label(label):
    return label_mapping[label]
merged_df['label'] = merged_df['label'].apply(convert_label)

In [8]:
# Predict political affiliation from article
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation and not char.isdigit()])
    tokens = nltk.word_tokenize(text)    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
merged_df['article'] = merged_df['article'].apply(preprocess_text)
transformed_article = poli_vectorizer.transform(merged_df['article'])
merged_df['poli_affiliation'] = poli_affili_model.predict(transformed_article)

In [9]:
# Predict Credibility-Reliability 
transformed_source = text_embedding(merged_df['preprocessed_source'])[:, :50]
combined_features = np.concatenate((transformed_source, merged_df[['mediacount', 'followees_to_followers_ratio', 'is_verified']].values), axis=1)
merged_df['cred_reliability'] = cred_reliability_model.predict(combined_features)
result_df = merged_df[['contextual_drift', 'poli_affiliation', 'cred_reliability', 'label']]

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = result_df[['contextual_drift', 'poli_affiliation', 'cred_reliability']]
y = result_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=10000)
}

param_grids = {
    'Decision Tree': {'max_depth': [None, 5, 10, 15]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]},
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10]}
}

best_accuracy = 0.0
best_model = None
best_params = None

for clf_name, clf in classifiers.items():
    print(f"Classifier: {clf_name}")
    
    # If classifier has hyperparameters, perform nested loop for hyperparameter tuning
    if clf_name in param_grids:
        for param_name, param_values in param_grids[clf_name].items():
            for param_value in param_values:
                # Set the hyperparameter value and fit the model
                setattr(clf, param_name, param_value)
                clf.fit(X_train, y_train)
                
                # Make predictions and evaluate the model
                y_pred = clf.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                
                # Check if the current set of hyperparameters achieved a better accuracy
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model = clf_name
                    best_params = {param_name: param_value}
                
                print(f"{param_name}: {param_value}")
                print(f"Accuracy: {accuracy}")
                print("="*30)
    
    # If no hyperparameters, fit the model directly
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Check if the current model achieved a better accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = clf_name
            best_params = None
            
        print(f"Accuracy: {accuracy}")
        print("="*30)

# Print the best results
print(f"Best Model: {best_model}")
print(f"Best Accuracy: {best_accuracy}")
print(f"Best Hyperparameters: {best_params}")
warnings.filterwarnings("ignore")

Classifier: Decision Tree
max_depth: None
Accuracy: 0.3327917682101273
max_depth: 5
Accuracy: 0.3254806390468454
max_depth: 10
Accuracy: 0.3254806390468454
max_depth: 15
Accuracy: 0.3279176821012727
Classifier: Random Forest
n_estimators: 50
Accuracy: 0.34524776604386676
n_estimators: 100
Accuracy: 0.3422691578662334
n_estimators: 200
Accuracy: 0.346872461413485
max_depth: None
Accuracy: 0.346872461413485
max_depth: 5
Accuracy: 0.3249390739236393
max_depth: 10
Accuracy: 0.33549959382615757
Classifier: Logistic Regression
C: 0.001
Accuracy: 0.30354725155699974
C: 0.01
Accuracy: 0.3051719469266179
C: 0.1
Accuracy: 0.3108583807202816
C: 1
Accuracy: 0.31112916328188467
C: 10
Accuracy: 0.3108583807202816
Best Model: Random Forest
Best Accuracy: 0.346872461413485
Best Hyperparameters: {'n_estimators': 200}
