# Imports and Installs

In [1]:
# Installing additional libraries for text preprocessing
!pip install -q preprocessor
!pip install -q contractions
!pip install -q optuna
!pip install holidays
!pip install gensim

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for preprocessor (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m


In [2]:
# Connect to Google Drive
from google.colab import files, drive
drive.mount('/content/drive')

# Basic Python
import pandas as pd
import numpy as np
from datetime import datetime
import holidays

# Text Preprocessing
import re
import preprocessor
import contractions
from tqdm import tqdm
tqdm.pandas()

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Feature Vectorization
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Optimization
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import f1_score, classification_report, make_scorer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='optuna.distributions')

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Get The Data

In [3]:
# Prompt the user to upload the file
uploaded = files.upload()

# Assuming the file is named 'trump_train.tsv'
file_name = list(uploaded.keys())[0]

# Define the column names
column_names = ['tweet id', 'user handle', 'tweet text', 'time stamp', 'device']

# Load the TSV file into a DataFrame with specified column names
df = pd.read_csv(file_name, sep='\t', names=column_names, header=None)

# Display the first few rows of the DataFrame
print(df.head())

# Display information about the DataFrame
print(df.info())

# Display summary statistics of the DataFrame
print(df.describe())

Saving trump_train.tsv to trump_train.tsv
             tweet id      user handle  \
0  845974102619906048  realDonaldTrump   
1  846166053663191040  realDonaldTrump   
2  835814988686233601  realDonaldTrump   
3  835817351178301440  realDonaldTrump   
4  835916511944523777  realDonaldTrump   

                                          tweet text           time stamp  \
0  Democrats are smiling in D.C. that the Freedom...  2017-03-26 15:21:58   
1  General Kelly is doing a great job at the bord...  2017-03-27 04:04:42   
2  The race for DNC Chairman was, of course, tota...  2017-02-26 13:33:16   
3  For first time the failing @nytimes will take ...  2017-02-26 13:42:39   
4  Russia talk is FAKE NEWS put out by the Dems, ...  2017-02-26 20:16:41   

    device  
0   iphone  
1   iphone  
2  android  
3  android  
4  android  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156 entries, 0 to 3155
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------      

In [4]:
df.head()

Unnamed: 0,tweet id,user handle,tweet text,time stamp,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android


In [5]:
# Count the occurrences of each class in the 'UserName' column
class_counts = df['user handle'].value_counts()

# Display the counts for each class
print("Counts of each class in 'UserName':")
print(class_counts)

Counts of each class in 'UserName':
user handle
realDonaldTrump    3144
PressSec             11
POTUS                 1
Name: count, dtype: int64


# Text Preprocess

**Our data cleaning process handles a few modifications:**
1. Define regular expressions for different unwanted patterns and normalize them.

2. Constructions are expanded to the full shape of the word and other unwanted chars are removed.

3. Cleans the text to retain only alphanumeric characters and common punctuation.

4. Converts text to lowercase and removes stop words (user's choice).

5. Lemmatization

6. Removal of rows with missing values.

In [6]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define regex patterns
re_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
re_html = re.compile(r'<[^<]+?>')
re_date = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b')
re_decimal = re.compile(r'\b\d+\.\d+%?\b')
re_number_percent = re.compile(r'\b\d+%?\b')
re_char = re.compile(r'[^0-9a-zA-Z\s?!.,:\'\"//]+')

# Function for cleaning text
def PreProcessText(text, lower=False, remove_stopwords=True):
    # Replacement mappings for common misencoded characters
    replacements = {
        '“': '"', '”': '"', '‘': "'", '’': "'",
        'â\x80\x9c': '"', 'â\x80\x9d': '"', 'â\x80\x99': "'"
    }

    for bad_char, good_char in replacements.items():
        text = text.replace(bad_char, good_char)

    text = contractions.fix(text)  # Expand contractions
    text = re_html.sub('', text)  # Remove HTML tags
    text = re_url.sub('[URL]', text)  # Replace URLs with placeholder
    text = re_date.sub('[DATE]', text)  # Replace dates with placeholder
    text = re_decimal.sub('[DECIMAL]', text)  # Replace decimal numbers with placeholder
    text = re_number_percent.sub('[NUM]', text)  # Replace whole numbers with placeholder
    text = re_char.sub("", text)  # Remove non-alphanumeric characters

    words = text.split()
    placeholders = ['[URL]', '[DATE]', '[DECIMAL]', '[NUM]']

    if lower:
        words = [word.lower() for word in words if word not in placeholders]  # Convert to lowercase

    if remove_stopwords:
        words = [word for word in words if word not in stop_words]  # Remove stopwords

    words = [lemmatizer.lemmatize(word) if word not in placeholders else word for word in words]  # Lemmatize words

    text = ' '.join(words)  # Join words back into a single string

    return text

In [7]:
df["cleaned text"] = df["tweet text"].progress_apply(PreProcessText)
df.dropna(inplace=True)

100%|██████████| 3156/3156 [00:02<00:00, 1419.64it/s]


In [8]:
df.head()

Unnamed: 0,tweet id,user handle,tweet text,time stamp,device,cleaned text
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone,"Democrats smiling D.C. Freedom Caucus, help Cl..."
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone,General Kelly great job border. Numbers way do...
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android,"The race DNC Chairman was, course, totally ""ri..."
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android,For first time failing nytimes take ad bad one...
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android,"Russia talk FAKE NEWS put Dems, played media, ..."


It is known that Trump is using an Android phone, while his staff mostly uses iPhone. We'll treat tweets as Trump by D:evice and Username

In [9]:
def classify_tweet(device, username, timestamp):
    """
    Classifies a tweet as being from Trump or his staff based on the device, username, and timestamp.

    Parameters:
    device (str): The device used to send the tweet.
    username (str): The username who sent the tweet.
    timestamp (str): The timestamp of the tweet.

    Returns:
    int: 1 if the tweet is classified as being from Trump, otherwise 0.
    None: if there is an error in classification.
    """
    try:
        trump_username = 'realDonaldTrump'
        trump_cutoff_date = datetime.strptime('2017-04-01', '%Y-%m-%d')

        tweet_date = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')  # Adjust the format as per your data

        if username.lower() == trump_username.lower() and 'android' in device.lower() and tweet_date < trump_cutoff_date:
            return 0
        elif tweet_date >= trump_cutoff_date: # Can't tell anymore if trump or no
            return None
        else:
            return 1
    except Exception as e:
        # Log the error if needed
        return None

# Apply the classification function to the DataFrame
df['notTrump?'] = df.apply(lambda row: classify_tweet(row['device'], row['user handle'], row['time stamp']), axis=1)
df = df.dropna(subset=['notTrump?'])

# Count the occurrences of each class (Trump or Staff)
class_counts = df['notTrump?'].value_counts()

# Display the counts for each class
print("Counts of each class (Trump or Staff):")
print(class_counts)

Counts of each class (Trump or Staff):
notTrump?
0.0    1991
1.0    1142
Name: count, dtype: int64


In [10]:
# Split the data into training, validation sets
# Model's will be optimized (optuna) on validation set, then assessd using cross validation on the train.
X = df.drop(columns=['notTrump?']).copy()
y = df['notTrump?'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Lets Develop Some Models

## Create a complex features vector

In [11]:
class FeatureVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_types=['tfidf', 'word2vec', 'additional'], country='US', word2vec_size=100, tfidf_max_features=5000):
        self.vector_types = vector_types
        self.country = country
        self.word2vec_size = word2vec_size
        self.tfidf_max_features = tfidf_max_features
        self.additional_features = AdditionalFeatures(country)
        self.word2vec_vectorizer = Word2VecVectorizer(size=word2vec_size)
        self.tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)

    def fit(self, X, y=None):
        if 'tfidf' in self.vector_types:
            self.tfidf_vectorizer.fit(X['cleaned text'])
        if 'word2vec' in self.vector_types:
            self.word2vec_vectorizer.fit(X)
        if 'additional' in self.vector_types:
            self.additional_features.fit(X)
        return self

    def transform(self, X):
        features = []
        if 'tfidf' in self.vector_types:
            tfidf_features = self.tfidf_vectorizer.transform(X['cleaned text']).toarray()
            features.append(tfidf_features)
        if 'word2vec' in self.vector_types:
            word2vec_features = self.word2vec_vectorizer.transform(X)
            features.append(word2vec_features)
        if 'additional' in self.vector_types:
            additional_features = self.additional_features.transform(X)
            features.append(additional_features)

        # Concatenate all feature vectors
        combined_features = np.hstack(features)
        return combined_features

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

class AdditionalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, country='US'):
        self.holiday_calendar = holidays.CountryHoliday(country)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.extract_features(text, timestamp) for text, timestamp in zip(X['tweet text'], X['time stamp'])])

    def extract_features(self, text, timestamp):
        num_capitalized = sum(1 for c in text if c.isupper())
        num_words = len(text.split())
        num_exclamations = text.count('!')
        num_questions = text.count('?')

        dt = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        hour_of_day = dt.hour
        day_of_week = dt.weekday()
        month = dt.month
        year = dt.year
        is_holiday = 1 if dt in self.holiday_calendar else 0

        return [num_capitalized, num_words, num_exclamations, num_questions, hour_of_day, day_of_week, month, year, is_holiday]

class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, size=100, min_count=1):
        self.size = size
        self.min_count = min_count
        self.model = None

    def fit(self, X, y=None):
        sentences = [text.split() for text in X['cleaned text']]
        self.model = Word2Vec(sentences, vector_size=self.size, min_count=self.min_count)
        return self

    def transform(self, X):
        return np.array([self.vectorize(text) for text in X['cleaned text']])

    def vectorize(self, text):
        words = text.split()
        word_vecs = [self.model.wv[word] for word in words if word in self.model.wv]
        if len(word_vecs) == 0:
            return np.zeros(self.size)
        return np.mean(word_vecs, axis=0)

## Sci-Kit Learn Model Optimization Flow (Optuna)

In [12]:
# Define the hyperparameters for each model.
hidden_layer_combinations = [
    [10],
    [50],
    [100],
    [200],
    [10, 10],
    [30, 30],
    [50, 50],
    [100, 100],
    [10, 50],
    [50, 100],
    [30, 50],
    [100, 200],
    [10, 10, 10],
    [50, 50, 50],
    [10, 50, 100],
    [50, 100, 200]
]

model_hyperparameters = {
    'LogisticRegression': {
        'C': (1e-4, 1e2, 'loguniform')
    },
    'SVC': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf'], 'categorical')
    },
    'XGBClassifier': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (0.01, 0.2, 'loguniform'),
        'max_depth': (3, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform')
    },
    'FFNN': {
        'hidden_dims': (hidden_layer_combinations, 'categorical')
    }
}

def suggest_hyperparameters(trial, hyperparams):
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
        elif len(value) == 3:
            if value[2] == 'loguniform':
                params[key] = trial.suggest_float(key, value[0], value[1], log=True)
            elif value[2] == 'uniform':
                params[key] = trial.suggest_float(key, value[0], value[1])
            elif value[2] == 'int':
                params[key] = trial.suggest_int(key, value[0], value[1])
            elif value[2] == 'categorical':
                params[key] = trial.suggest_categorical(key, value[0])
            elif value[1] == 'custom':
                hidden_dims = params['hidden_dims']
                layer_count = len(hidden_dims)
                params[key] = trial.suggest_categorical(key, value[0][layer_count])
            else:
                raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params

# Define the objective function for Optuna with cross-validation
def objective(trial, model, model_name, X, y, vector_types=['tfidf']):
    if model_name not in model_hyperparameters:
        raise ValueError(f"No hyperparameters defined for model type: {model_name}")

    hyperparams = model_hyperparameters[model_name]
    params = suggest_hyperparameters(trial, hyperparams)

    model.set_params(**params)

    # Create a pipeline with the custom feature vectorizer and the classifier
    pipeline = Pipeline([
        ('features', FeatureVectorizer(country='US', vector_types=vector_types)),
        ('classifier', model)
    ])

    # Define the cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform cross-validation and return the mean F1 score
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring=make_scorer(f1_score, average='weighted'))
    return scores.mean()

# Run the optimization with Optuna
def optimize_model_with_optuna(model, model_name, X, y, vector_types=['tfidf'], n_trials=15, timeout=1200):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model, model_name, X, y, vector_types), n_trials=n_trials, timeout=timeout)

    best_params = study.best_params
    best_value = study.best_value

    print(f"Best hyperparameters for {model_name}: {best_params}, vector {', '.join(vector_types)}")
    print(f"Best F1 score for {model_name}: {best_value}")

    return study.best_params, study.best_value

## Logistic Regression

In [None]:
# Optimize Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
best_params_lr_1, best_score_lr_1 = optimize_model_with_optuna(model_lr, 'LogisticRegression', X_train, y_train, ['tfidf', 'additional'])
best_params_lr_2, best_score_lr_2 = optimize_model_with_optuna(model_lr, 'LogisticRegression', X_train, y_train, ['tfidf'])
if best_score_lr_1 > best_score_lr_2:
    best_params_lr = best_params_lr_1
    best_score_lr = best_score_lr_1
    vector_types = ['tfidf', 'additional']
else:
    best_params_lr = best_params_lr_2
    best_score_lr = best_score_lr_2
    vector_types = ['tfidf']

[I 2024-07-11 11:35:23,622] A new study created in memory with name: no-name-ab0cd678-dc61-4c4b-acce-7675011faa10
[I 2024-07-11 11:35:26,608] Trial 0 finished with value: 0.7141271168525749 and parameters: {'C': 0.014918054384188898}. Best is trial 0 with value: 0.7141271168525749.
[I 2024-07-11 11:35:28,538] Trial 1 finished with value: 0.7003675597052409 and parameters: {'C': 0.0014919711692070578}. Best is trial 0 with value: 0.7141271168525749.
[I 2024-07-11 11:35:29,966] Trial 2 finished with value: 0.6992857203128822 and parameters: {'C': 0.0007312120863647985}. Best is trial 0 with value: 0.7141271168525749.
[I 2024-07-11 11:35:32,245] Trial 3 finished with value: 0.7402865876843854 and parameters: {'C': 0.04603097529864978}. Best is trial 3 with value: 0.7402865876843854.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the

Best hyperparameters for LogisticRegression: {'C': 8.065937821916735}, vector tfidf, additional
Best F1 score for LogisticRegression: 0.8317244411934617


[I 2024-07-11 11:46:03,374] Trial 0 finished with value: 0.4990915867889056 and parameters: {'C': 0.0029613757366319083}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-11 11:46:03,895] Trial 1 finished with value: 0.4990915867889056 and parameters: {'C': 0.003642314624327602}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-11 11:46:04,443] Trial 2 finished with value: 0.4990915867889056 and parameters: {'C': 0.0004997287677496259}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-11 11:46:05,010] Trial 3 finished with value: 0.5117644618628463 and parameters: {'C': 0.013302630915419196}. Best is trial 3 with value: 0.5117644618628463.
[I 2024-07-11 11:46:05,647] Trial 4 finished with value: 0.7449830674161775 and parameters: {'C': 0.1341773841841409}. Best is trial 4 with value: 0.7449830674161775.
[I 2024-07-11 11:46:06,559] Trial 5 finished with value: 0.8309830058846828 and parameters: {'C': 8.933392611537776}. Best is trial 5 with value: 0.8309830

Best hyperparameters for LogisticRegression: {'C': 4.008932259744248}, vector tfidf
Best F1 score for LogisticRegression: 0.8343179447961713


In [None]:
# Train the model with the best parameters
best_model = LogisticRegression(max_iter=1000, **best_params_lr)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('features', FeatureVectorizer(vector_types=vector_types, country='US')),
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.82      0.94      0.87       388
         1.0       0.87      0.66      0.75       239

    accuracy                           0.83       627
   macro avg       0.84      0.80      0.81       627
weighted avg       0.84      0.83      0.83       627



## SVM

In [None]:
# Optimize SVM
model_svm = SVC()
best_params_svm_1, best_score_svm_1 = optimize_model_with_optuna(model_svm, 'SVC', X_train, y_train, ['tfidf', 'additional'])
best_params_svm_2, best_score_svm_2 = optimize_model_with_optuna(model_svm, 'SVC', X_train, y_train, ['tfidf'])
if best_score_svm_1 > best_score_svm_2:
    best_params_svm = best_params_svm_1
    best_score_svm = best_score_svm_1
    vector_types = ['tfidf', 'additional']
else:
    best_params_svm = best_params_svm_2
    best_score_svm = best_score_svm_2
    vector_types = ['tfidf']

In [None]:
# Train the model with the best parameters
best_model = SVC(**best_params_svm)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('features', FeatureVectorizer(vector_types=vector_types, country='US')),
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.88       388
         1.0       0.84      0.70      0.77       239

    accuracy                           0.84       627
   macro avg       0.84      0.81      0.82       627
weighted avg       0.84      0.84      0.83       627



## Classifier of Choice: ...

Attempted - XGB

In [None]:
# Optimize XGBoost
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
best_params_xgb_1, best_score_xgb_1 = optimize_model_with_optuna(model_xgb, 'XGBClassifier', X_train, y_train, ['tfidf', 'additional'])
best_params_xgb_2, best_score_xgb_2 = optimize_model_with_optuna(model_xgb, 'XGBClassifier', X_train, y_train, ['tfidf'])
if best_score_xgb_1 > best_score_xgb_2:
    best_params_xgb = best_params_xgb_1
    best_score_xgb = best_score_xgb_1
    vector_types = ['tfidf', 'additional']
else:
    best_params_xgb = best_params_xgb_2
    best_score_xgb = best_score_xgb_2
    vector_types = ['tfidf']

[I 2024-07-11 12:18:30,864] A new study created in memory with name: no-name-c69c992a-ae96-4029-b0f3-39ab2bad11e4
[I 2024-07-11 12:18:42,522] Trial 0 finished with value: 0.8674066147758126 and parameters: {'n_estimators': 86, 'learning_rate': 0.11508223595483232, 'max_depth': 5, 'colsample_bytree': 0.5981914101789696, 'subsample': 0.7541825261540303, 'reg_alpha': 2.060047017838223e-07, 'reg_lambda': 0.16251582819032495}. Best is trial 0 with value: 0.8674066147758126.
[I 2024-07-11 12:19:05,429] Trial 1 finished with value: 0.8644824807121211 and parameters: {'n_estimators': 161, 'learning_rate': 0.017361487320285814, 'max_depth': 6, 'colsample_bytree': 0.8384283940972818, 'subsample': 0.567817466681031, 'reg_alpha': 1.5583141506813328e-07, 'reg_lambda': 0.49781642415914523}. Best is trial 0 with value: 0.8674066147758126.
[I 2024-07-11 12:19:24,495] Trial 2 finished with value: 0.8580543330557975 and parameters: {'n_estimators': 131, 'learning_rate': 0.01344651495309174, 'max_depth':

Best hyperparameters for XGBClassifier: {'n_estimators': 82, 'learning_rate': 0.1877546968777687, 'max_depth': 7, 'colsample_bytree': 0.895288061016391, 'subsample': 0.7014186462764306, 'reg_alpha': 0.51961722082151, 'reg_lambda': 1.1930319472458693e-05}, vector tfidf, additional
Best F1 score for XGBClassifier: 0.8781954423005496


[I 2024-07-11 12:39:08,492] Trial 0 finished with value: 0.8157077567753465 and parameters: {'n_estimators': 184, 'learning_rate': 0.060036867491151655, 'max_depth': 10, 'colsample_bytree': 0.9727555597703041, 'subsample': 0.8563980715248405, 'reg_alpha': 1.1827617546598332e-08, 'reg_lambda': 0.19884823833675283}. Best is trial 0 with value: 0.8157077567753465.
[I 2024-07-11 12:39:23,941] Trial 1 finished with value: 0.8188840655032747 and parameters: {'n_estimators': 178, 'learning_rate': 0.015030455142509216, 'max_depth': 5, 'colsample_bytree': 0.930935669719007, 'subsample': 0.5208033249353339, 'reg_alpha': 2.3982167050534246e-06, 'reg_lambda': 1.012279762514382e-06}. Best is trial 1 with value: 0.8188840655032747.
[I 2024-07-11 12:39:45,370] Trial 2 finished with value: 0.8157114614594801 and parameters: {'n_estimators': 115, 'learning_rate': 0.01799987295598675, 'max_depth': 8, 'colsample_bytree': 0.9717906361078641, 'subsample': 0.6496876336941297, 'reg_alpha': 1.876473885737373e

Best hyperparameters for XGBClassifier: {'n_estimators': 200, 'learning_rate': 0.03608538805877116, 'max_depth': 6, 'colsample_bytree': 0.5913703050959733, 'subsample': 0.717254440879724, 'reg_alpha': 0.27052151861816803, 'reg_lambda': 2.904801246259228e-06}, vector tfidf
Best F1 score for XGBClassifier: 0.8266075934616932


In [None]:
# Train the model with the best parameters
best_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **best_params_xgb)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('features', FeatureVectorizer(vector_types=vector_types, country='US')),
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92       388
         1.0       0.89      0.82      0.85       239

    accuracy                           0.89       627
   macro avg       0.89      0.88      0.88       627
weighted avg       0.89      0.89      0.89       627



## FFNN

In [15]:
class MultiLayerFFNN(nn.Module):
    '''
    Multilayer FFNN
    '''
    def __init__(self, input_dim, hidden_dims, output_dim=2, dropout=0.5):
        super(MultiLayerFFNN, self).__init__()
        layers = []
        current_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(current_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            current_dim = hidden_dim

        layers.append(nn.Linear(current_dim, output_dim))
        layers.append(nn.LogSoftmax(dim=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

    def predict(self, input_data):
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Disable gradient calculation
            outputs = self(input_data)
            _, predicted = torch.max(outputs, 1)
        return predicted


class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_dims=None, output_dim=2, dropout=0.3, learning_rate=0.001, print=False):
        self.hidden_dims = hidden_dims if hidden_dims is not None else [50, 50]
        self.output_dim = output_dim
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.num_epochs = len(self.hidden_dims) * 50
        self.model = None
        self.criterion = nn.NLLLoss()
        self.optimizer = None
        self.print = print

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        if 'hidden_dims' in params:
            self.num_epochs = len(self.hidden_dims) * 50
        return self

    def fit(self, X, y):
        input_dim = X.shape[1]
        self.model = MultiLayerFFNN(input_dim, self.hidden_dims, self.output_dim, self.dropout)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y.to_numpy(), dtype=torch.long)

        dataset = TensorDataset(X_tensor, y_tensor)
        train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            self.print and print(f"Epoch [{epoch+1}/{self.num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

        return self

    def predict(self, X):
        X_tensor = torch.tensor(X, dtype=torch.float32)
        predictions = self.model.predict(X_tensor)
        return predictions.numpy()

In [16]:
# Example usage
model_ffnn = PyTorchClassifier()
best_params, best_value = optimize_model_with_optuna(model_ffnn, 'FFNN', X_train, y_train, vector_types=['tfidf'], n_trials=15, timeout=120000)
# best_params, best_value = optimize_model_with_optuna(model_ffnn, 'FFNN', X_train, y_train, vector_types=['word2vec'], n_trials=15, timeout=120000)
# best_params, best_value = optimize_model_with_optuna(model_ffnn, 'FFNN', X_train, y_train, vector_types=['tfidf', 'word2vec'], n_trials=15, timeout=120000)
best_params, best_value = optimize_model_with_optuna(model_ffnn, 'FFNN', X_train, y_train, vector_types=['tfidf', 'additional'], n_trials=15, timeout=120000)

[I 2024-07-12 20:01:57,151] A new study created in memory with name: no-name-2f7dc841-47b7-4c1b-a02c-be243f0ba6c6
[I 2024-07-12 20:04:48,733] Trial 0 finished with value: 0.7984285081684002 and parameters: {'hidden_dims': [10, 10]}. Best is trial 0 with value: 0.7984285081684002.
[I 2024-07-12 20:08:24,308] Trial 1 finished with value: 0.8060037490264943 and parameters: {'hidden_dims': [10, 10, 10]}. Best is trial 1 with value: 0.8060037490264943.
[I 2024-07-12 20:12:50,975] Trial 2 finished with value: 0.8020951019984481 and parameters: {'hidden_dims': [50, 100]}. Best is trial 1 with value: 0.8060037490264943.
[I 2024-07-12 20:16:03,294] Trial 3 finished with value: 0.8015068157193855 and parameters: {'hidden_dims': [30, 50]}. Best is trial 1 with value: 0.8060037490264943.
[I 2024-07-12 20:22:34,174] Trial 4 finished with value: 0.8023118234998783 and parameters: {'hidden_dims': [100, 100]}. Best is trial 1 with value: 0.8060037490264943.
[I 2024-07-12 20:25:32,125] Trial 5 finished

Best hyperparameters for FFNN: {'hidden_dims': [10, 50]}, vector tfidf
Best F1 score for FFNN: 0.8125782005038843


[I 2024-07-12 21:02:05,847] Trial 0 finished with value: 0.4990915867889056 and parameters: {'hidden_dims': [10, 10]}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-12 21:04:17,944] Trial 1 finished with value: 0.4990915867889056 and parameters: {'hidden_dims': [10, 10]}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-12 21:07:57,075] Trial 2 finished with value: 0.4990915867889056 and parameters: {'hidden_dims': [30, 30]}. Best is trial 0 with value: 0.4990915867889056.
[I 2024-07-12 21:15:25,472] Trial 3 finished with value: 0.5600610135610913 and parameters: {'hidden_dims': [50, 50, 50]}. Best is trial 3 with value: 0.5600610135610913.
[I 2024-07-12 21:16:22,318] Trial 4 finished with value: 0.6137110540811268 and parameters: {'hidden_dims': [10]}. Best is trial 4 with value: 0.6137110540811268.
[I 2024-07-12 21:17:22,527] Trial 5 finished with value: 0.6677771817788696 and parameters: {'hidden_dims': [10]}. Best is trial 5 with value: 0.6677771817788696.
[I

Best hyperparameters for FFNN: {'hidden_dims': [50]}, vector tfidf, additional
Best F1 score for FFNN: 0.7520352108897838


In [None]:
# Define hyperparameters
hidden_dims = [50]  # Sizes of the hidden layers
dropout = 0.5
learning_rate = 0.0001  # Reduce learning rate
num_epochs = 50  # Increase number of epochs

# Initialize custom classifier
pytorch_classifier = PyTorchClassifier(hidden_dims=hidden_dims, dropout=dropout, learning_rate=learning_rate, num_epochs=num_epochs)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('features', FeatureVectorizer(vector_types=['tfidf'], country='US')),
    ('classifier', pytorch_classifier)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.82      0.92      0.86       388
         1.0       0.83      0.67      0.74       239

    accuracy                           0.82       627
   macro avg       0.82      0.79      0.80       627
weighted avg       0.82      0.82      0.82       627



## Classifier of Choice: ...

BERT model.

Note: The model returns "Trump" and "Not Trump" (NOT 0's and 1's)

In [None]:
# Define the path to the model and tokenizer
model_path = "\\BERT\\"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Function to predict whether a tweet is by Trump or not
def predict_tweet(tweet):
    inputs = tokenizer(tweet, return_tensors="pt", padding="max_length", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return "Not Trump" if predictions.item() == 1 else "Trump"

# Test the prediction function
tweet = "This is a sample tweet to test the model."
prediction = predict_tweet(tweet)
print(f"The tweet is predicted to be: {prediction}")
