# Imports and Installs

In [12]:
# Installing additional libraries for text preprocessing
!pip install -q preprocessor
!pip install -q contractions
!pip install -q optuna
!pip install holidays
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [13]:
# Connect to Google Drive
from google.colab import files, drive
# drive.mount('/content/drive')

# Basic Python
import pandas as pd
import numpy as np
from datetime import datetime
import holidays

# Text Preprocessing
import re
import preprocessor
import contractions
from tqdm import tqdm
tqdm.pandas()

import textstat
import spacy
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Feature Vectorization
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Optimization
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import f1_score, classification_report, make_scorer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='optuna.distributions')

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Get The Data

In [14]:
# Prompt the user to upload the file
uploaded = files.upload()

# Assuming the file is named 'trump_train.tsv'
file_name = list(uploaded.keys())[0]

# Define the column names
column_names = ['tweet id', 'user handle', 'tweet text', 'time stamp', 'device']

# Load the TSV file into a DataFrame with specified column names
df = pd.read_csv(file_name, sep='\t', names=column_names, header=None)

# Display the first few rows of the DataFrame
print(df.head())

# Display information about the DataFrame
print(df.info())

# Display summary statistics of the DataFrame
print(df.describe())

Saving trump_train (3).tsv to trump_train (3).tsv
             tweet id      user handle  \
0  845974102619906048  realDonaldTrump   
1  846166053663191040  realDonaldTrump   
2  835814988686233601  realDonaldTrump   
3  835817351178301440  realDonaldTrump   
4  835916511944523777  realDonaldTrump   

                                          tweet text           time stamp  \
0  Democrats are smiling in D.C. that the Freedom...  2017-03-26 15:21:58   
1  General Kelly is doing a great job at the bord...  2017-03-27 04:04:42   
2  The race for DNC Chairman was, of course, tota...  2017-02-26 13:33:16   
3  For first time the failing @nytimes will take ...  2017-02-26 13:42:39   
4  Russia talk is FAKE NEWS put out by the Dems, ...  2017-02-26 20:16:41   

    device  
0   iphone  
1   iphone  
2  android  
3  android  
4  android  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156 entries, 0 to 3155
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ----

In [15]:
df.head()

Unnamed: 0,tweet id,user handle,tweet text,time stamp,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android


In [16]:
# Count the occurrences of each class in the 'UserName' column
class_counts = df['user handle'].value_counts()

# Display the counts for each class
print("Counts of each class in 'UserName':")
print(class_counts)

Counts of each class in 'UserName':
user handle
realDonaldTrump    3144
PressSec             11
POTUS                 1
Name: count, dtype: int64


# Text Preprocess

**Our data cleaning process handles a few modifications:**
1. Define regular expressions for different unwanted patterns and normalize them.

2. Constructions are expanded to the full shape of the word and other unwanted chars are removed.

3. Cleans the text to retain only alphanumeric characters and common punctuation.

4. Converts text to lowercase and removes stop words (user's choice).

5. Lemmatization

6. Removal of rows with missing values.

In [17]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define regex patterns
re_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
re_html = re.compile(r'<[^<]+?>')
re_date = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b')
re_decimal = re.compile(r'\b\d+\.\d+%?\b')
re_number_percent = re.compile(r'\b\d+%?\b')
re_char = re.compile(r'[^0-9a-zA-Z\s?!.,:\'\"//]+')

# Function for cleaning text
def PreProcessText(text, lower=False, remove_stopwords=True):
    # Replacement mappings for common misencoded characters
    replacements = {
        '“': '"', '”': '"', '‘': "'", '’': "'",
        'â\x80\x9c': '"', 'â\x80\x9d': '"', 'â\x80\x99': "'"
    }

    for bad_char, good_char in replacements.items():
        text = text.replace(bad_char, good_char)

    text = contractions.fix(text)  # Expand contractions
    text = re_html.sub('', text)  # Remove HTML tags
    text = re_url.sub('[URL]', text)  # Replace URLs with placeholder
    text = re_date.sub('[DATE]', text)  # Replace dates with placeholder
    text = re_decimal.sub('[DECIMAL]', text)  # Replace decimal numbers with placeholder
    text = re_number_percent.sub('[NUM]', text)  # Replace whole numbers with placeholder
    text = re_char.sub("", text)  # Remove non-alphanumeric characters

    words = text.split()
    placeholders = ['[URL]', '[DATE]', '[DECIMAL]', '[NUM]']

    if lower:
        words = [word.lower() for word in words if word not in placeholders]  # Convert to lowercase

    if remove_stopwords:
        words = [word for word in words if word not in stop_words]  # Remove stopwords

    words = [lemmatizer.lemmatize(word) if word not in placeholders else word for word in words]  # Lemmatize words

    text = ' '.join(words)  # Join words back into a single string

    return text

In [18]:
df["cleaned text"] = df["tweet text"].progress_apply(PreProcessText)
df.dropna(inplace=True)

100%|██████████| 3156/3156 [00:00<00:00, 5164.67it/s]


In [19]:
df.head()

Unnamed: 0,tweet id,user handle,tweet text,time stamp,device,cleaned text
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone,"Democrats smiling D.C. Freedom Caucus, help Cl..."
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone,General Kelly great job border. Numbers way do...
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android,"The race DNC Chairman was, course, totally ""ri..."
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android,For first time failing nytimes take ad bad one...
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android,"Russia talk FAKE NEWS put Dems, played media, ..."


It is known that Trump is using an Android phone, while his staff mostly uses iPhone. We'll treat tweets as Trump by D:evice and Username

In [20]:
def classify_tweet(device, username, timestamp):
    """
    Classifies a tweet as being from Trump or his staff based on the device, username, and timestamp.

    Parameters:
    device (str): The device used to send the tweet.
    username (str): The username who sent the tweet.
    timestamp (str): The timestamp of the tweet.

    Returns:
    int: 1 if the tweet is classified as being from Trump, otherwise 0.
    None: if there is an error in classification.
    """
    try:
        trump_username = 'realDonaldTrump'
        trump_cutoff_date = datetime.strptime('2017-04-01', '%Y-%m-%d')

        tweet_date = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')  # Adjust the format as per your data

        if username.lower() == trump_username.lower() and 'android' in device.lower() and tweet_date < trump_cutoff_date:
            return 0
        elif tweet_date >= trump_cutoff_date: # Can't tell anymore if trump or no
            return None
        else:
            return 1
    except Exception as e:
        # Log the error if needed
        return None

# Apply the classification function to the DataFrame
df['notTrump?'] = df.apply(lambda row: classify_tweet(row['device'], row['user handle'], row['time stamp']), axis=1)
df = df.dropna(subset=['notTrump?'])

# Count the occurrences of each class (Trump or Staff)
class_counts = df['notTrump?'].value_counts()

# Display the counts for each class
print("Counts of each class (Trump or Staff):")
print(class_counts)

Counts of each class (Trump or Staff):
notTrump?
0.0    1991
1.0    1142
Name: count, dtype: int64


In [30]:
# Split the data into training, validation sets
# Model's will be optimized (optuna) on validation set, then assessd using cross validation on the train.
X = df.drop(columns=['notTrump?']).copy()
y = df['notTrump?'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Lets Develop Some Models

## Create a complex features vector

Use TF-IDF vector combined with additional calculated features for the task.

Finally, reduce feature vector by applying feature selection.

In [24]:
class FeatureSelectionWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, estimator=RandomForestClassifier(), threshold=None):
        self.selector = SelectFromModel(estimator=estimator, threshold=threshold)

    def fit(self, X, y=None):
        self.selector.fit(X, y)
        return self

    def transform(self, X):
        return self.selector.transform(X)

    def fit_transform(self, X, y=None):
        return self.selector.fit_transform(X, y)


class FeatureVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_types=['tfidf', 'additional'], country='US', tfidf_max_features=5000):
        self.vector_types = vector_types
        self.country = country
        self.tfidf_max_features = tfidf_max_features
        self.additional_features = AdditionalFeatures(country)
        self.tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)

    def fit(self, X, y=None):
        if 'tfidf' in self.vector_types:
            self.tfidf_vectorizer.fit(X['cleaned text'])
        if 'additional' in self.vector_types:
            self.additional_features.fit(X)
        return self

    def transform(self, X):
        features = []
        if 'tfidf' in self.vector_types:
            tfidf_features = self.tfidf_vectorizer.transform(X['cleaned text']).toarray()
            features.append(tfidf_features)
        if 'additional' in self.vector_types:
            additional_features = self.additional_features.transform(X)
            features.append(additional_features)

        # Concatenate all feature vectors
        combined_features = np.hstack(features)
        return combined_features

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)


class AdditionalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, country='US'):
        self.holiday_calendar = holidays.CountryHoliday(country)
        self.nlp = spacy.load('en_core_web_sm')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.extract_features(text, timestamp) for text, timestamp in zip(X['tweet text'], X['time stamp'])])

    def extract_features(self, text, timestamp):
        words = [word for word in re.findall(r'\b\w+\b', text)]
        num_words = len(words)
        if num_words == 0:
            num_words = 1  # To avoid division by zero

        num_capitalized = sum(1 for c in text if c.isupper()) / num_words
        num_exclamations = text.count('!') / num_words
        num_questions = text.count('?') / num_words
        avg_word_length = sum(len(word) for word in words) / num_words

        dt = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        hour_of_day = dt.hour
        day_of_week = dt.weekday()
        month = dt.month
        year = dt.year
        is_holiday = 1 if dt in self.holiday_calendar else 0

        # NER and POS features
        doc = self.nlp(text)
        num_entities = len(doc.ents) / num_words
        num_nouns = sum(1 for token in doc if token.pos_ == 'NOUN') / num_words
        num_verbs = sum(1 for token in doc if token.pos_ == 'VERB') / num_words
        num_adjectives = sum(1 for token in doc if token.pos_ == 'ADJ') / num_words
        num_adverbs = sum(1 for token in doc if token.pos_ == 'ADV') / num_words

        # Sentiment and subjectivity
        blob = TextBlob(text)
        sentiment = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity

        # Readability score (Flesch-Kincaid)
        readability = textstat.flesch_kincaid_grade(text)

        return [num_capitalized, num_words, num_exclamations, num_questions, avg_word_length, hour_of_day, day_of_week, month, year, is_holiday, num_entities, num_nouns, num_verbs, num_adjectives, num_adverbs, sentiment, subjectivity, readability]

## Feature Selection

Features are selected externally, once, and selected features will be used for all models. We also scale the features before passing them to any model.

In [31]:
# Feature selection and scaling performed externally
def preselect_and_scale_features(X_train, X_test, y_train, vector_types=['tfidf', 'additional']):
    # Step 1: Feature Extraction & Selection
    tqdm.write("Step 1/3: Feature Extraction & Selection")
    selection_pipeline = Pipeline([
        ('features', FeatureVectorizer(country='US', vector_types=vector_types)),
        ('feature_selection', FeatureSelectionWrapper(estimator=RandomForestClassifier(n_estimators=100)))
    ])

    X_train_selected = selection_pipeline.fit_transform(X_train, y_train)
    X_test_selected = selection_pipeline.transform(X_test)

    tqdm.write("Feature Extraction & Selection complete.")

    # Step 2: Scaling Train Features
    tqdm.write("Step 2/3: Scaling Train Features")
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    tqdm.write("Scaling Train Features complete.")

    # Step 3: Scaling Test Features
    tqdm.write("Step 3/3: Scaling Test Features")
    X_test_scaled = scaler.transform(X_test_selected)
    tqdm.write("Scaling Test Features complete.")

    return X_train_scaled, X_test_scaled

In [32]:
X_train, X_test = preselect_and_scale_features(X_train, X_test, y_train, vector_types=['tfidf', 'additional'])

Step 1/3: Feature Extraction & Selection
Feature Extraction & Selection complete.
Step 2/3: Scaling Train Features
Scaling Train Features complete.
Step 3/3: Scaling Test Features
Scaling Test Features complete.


## Sci-Kit Learn Model Optimization Flow (Optuna)

In [61]:
# Define the hyperparameters for each model.
hidden_layer_combinations = [
    [10],
    [50],
    [100],
    [200],
    [10, 10],
    [30, 30],
    [50, 50],
    [100, 100],
    [10, 50],
    [50, 100],
    [30, 50],
    [100, 200],
    [10, 10, 10],
    [50, 50, 50],
    [10, 50, 100],
    [50, 100, 200]
]

model_hyperparameters = {
    'LogisticRegression': {
        'C': (1e-4, 1e2, 'loguniform'),
        'penalty': (['l2'], 'categorical'),
        'fit_intercept': ([True, False], 'categorical'),
    },
    'SVC': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'poly', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 5, 'int'),  # Only relevant for 'poly' kernel
        'gamma': (['scale', 'auto'], 'categorical'),  # Restrict gamma to valid string options only
        'coef0': (-1, 1, 'uniform'),  # Used for 'poly' and 'sigmoid'
    },
    'XGBClassifier': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (0.01, 0.2, 'loguniform'),
        'max_depth': (3, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform')
    },
    'FFNN': {
        'hidden_dims': (hidden_layer_combinations, 'categorical')
    }
}


def suggest_hyperparameters(trial, hyperparams):
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
        elif len(value) == 3:
            if value[2] == 'loguniform':
                params[key] = trial.suggest_float(key, value[0], value[1], log=True)
            elif value[2] == 'uniform':
                params[key] = trial.suggest_float(key, value[0], value[1])
            elif value[2] == 'int':
                params[key] = trial.suggest_int(key, value[0], value[1])
            elif value[2] == 'categorical':
                params[key] = trial.suggest_categorical(key, value[0])
            elif value[1] == 'custom':
                hidden_dims = params['hidden_dims']
                layer_count = len(hidden_dims)
                params[key] = trial.suggest_categorical(key, value[0][layer_count])
            else:
                raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params


# Define the objective function for Optuna with cross-validation
def objective(trial, model, model_name, X_scaled, y):
    if model_name not in model_hyperparameters:
        raise ValueError(f"No hyperparameters defined for model type: {model_name}")

    hyperparams = model_hyperparameters[model_name]
    params = suggest_hyperparameters(trial, hyperparams)

    model.set_params(**params)

    # Create a pipeline with just the classifier since feature prep is external
    pipeline = Pipeline([
        ('classifier', model)
    ])

    # Define the cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Perform cross-validation and return the mean F1 score
    scores = cross_val_score(pipeline, X_scaled, y, cv=cv, scoring=make_scorer(f1_score, average='weighted'))
    return scores.mean()


# Run the optimization with Optuna
def optimize_model_with_optuna(model, model_name, X, y, n_trials=50, timeout=1200):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model, model_name, X, y), n_trials=n_trials, timeout=timeout)

    best_params = study.best_params
    best_value = study.best_value

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best F1 score for {model_name}: {best_value}")

    return study.best_params, study.best_value

## Logistic Regression

In [51]:
# Optimize Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
best_params_lr, best_score_lr = best_params_lr, best_score_lr = optimize_model_with_optuna(model_lr, 'LogisticRegression', X_train, y_train)
vector_types = ['tfidf', 'additional']

[I 2024-08-23 15:36:14,280] A new study created in memory with name: no-name-0238f2b0-bfce-4c60-80b7-3e1c5cbe7c28
[I 2024-08-23 15:36:15,995] Trial 0 finished with value: 0.8322991899631658 and parameters: {'C': 2.0469181802413963, 'penalty': 'l2', 'fit_intercept': True}. Best is trial 0 with value: 0.8322991899631658.
[I 2024-08-23 15:36:19,351] Trial 1 finished with value: 0.811885648081965 and parameters: {'C': 14.504643711301425, 'penalty': 'l2', 'fit_intercept': False}. Best is trial 0 with value: 0.8322991899631658.
[I 2024-08-23 15:36:19,792] Trial 2 finished with value: 0.4990915867889056 and parameters: {'C': 0.00010567354132777222, 'penalty': 'l2', 'fit_intercept': False}. Best is trial 0 with value: 0.8322991899631658.
[I 2024-08-23 15:36:23,592] Trial 3 finished with value: 0.8045183325845902 and parameters: {'C': 43.73815135934837, 'penalty': 'l2', 'fit_intercept': True}. Best is trial 0 with value: 0.8322991899631658.
[I 2024-08-23 15:36:23,924] Trial 4 finished with valu

Best hyperparameters for LogisticRegression: {'C': 2.0469181802413963, 'penalty': 'l2', 'fit_intercept': True}
Best F1 score for LogisticRegression: 0.8322991899631658


In [52]:
# Train the model with the best parameters
best_model = LogisticRegression(max_iter=1000, **best_params_lr)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.81      0.92      0.86       388
         1.0       0.84      0.65      0.73       239

    accuracy                           0.82       627
   macro avg       0.83      0.79      0.80       627
weighted avg       0.82      0.82      0.81       627



## SVM

In [62]:
# Optimize SVM
model_svm = SVC()
best_params_svm, best_score_svm = optimize_model_with_optuna(model_svm, 'SVC', X_train, y_train)
vector_types = ['tfidf', 'additional']

[I 2024-08-23 15:44:48,408] A new study created in memory with name: no-name-1ccac1f0-81af-4644-b636-5f1969141016
[I 2024-08-23 15:44:56,340] Trial 0 finished with value: 0.8329888060647355 and parameters: {'C': 0.9893533407434953, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale', 'coef0': 0.09224682293858533}. Best is trial 0 with value: 0.8329888060647355.
[I 2024-08-23 15:45:01,855] Trial 1 finished with value: 0.7893638365029096 and parameters: {'C': 26.023189414174396, 'kernel': 'linear', 'degree': 5, 'gamma': 'scale', 'coef0': -0.2473431351978117}. Best is trial 0 with value: 0.8329888060647355.
[I 2024-08-23 15:45:05,922] Trial 2 finished with value: 0.7090327482658811 and parameters: {'C': 8.993491342271039, 'kernel': 'sigmoid', 'degree': 2, 'gamma': 'scale', 'coef0': 0.8686654648820518}. Best is trial 0 with value: 0.8329888060647355.
[I 2024-08-23 15:45:10,948] Trial 3 finished with value: 0.8158324744075796 and parameters: {'C': 6.53072771148751, 'kernel': 'linear', 'degree': 

Best hyperparameters for SVC: {'C': 0.12496306212174019, 'kernel': 'poly', 'degree': 4, 'gamma': 'scale', 'coef0': 0.9548990580346346}
Best F1 score for SVC: 0.8406908564616211


In [63]:
# Train the model with the best parameters
best_model = SVC(**best_params_svm)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88       388
         1.0       0.90      0.64      0.75       239

    accuracy                           0.84       627
   macro avg       0.86      0.80      0.81       627
weighted avg       0.85      0.84      0.83       627



## Classifier of Choice: ...

Attempted - XGB

In [65]:
# Optimize XGBoost
model_xgb = XGBClassifier(eval_metric='mlogloss')
best_params_xgb, best_score_xgb = optimize_model_with_optuna(model_xgb, 'XGBClassifier', X_train, y_train)
vector_types = ['tfidf', 'additional']

[I 2024-08-23 15:53:51,150] A new study created in memory with name: no-name-a78faced-39a8-47f0-878a-14a8af3e50d4
[I 2024-08-23 15:54:10,622] Trial 0 finished with value: 0.8767618275102205 and parameters: {'n_estimators': 162, 'learning_rate': 0.05700049424292851, 'max_depth': 5, 'colsample_bytree': 0.8768005359225837, 'subsample': 0.838865296692839, 'reg_alpha': 1.4736066697527352, 'reg_lambda': 3.940140704054706e-08}. Best is trial 0 with value: 0.8767618275102205.
[I 2024-08-23 15:54:14,896] Trial 1 finished with value: 0.8651521404069074 and parameters: {'n_estimators': 54, 'learning_rate': 0.05025637543006053, 'max_depth': 5, 'colsample_bytree': 0.8513869609433966, 'subsample': 0.8760904582774528, 'reg_alpha': 7.31622372240061e-08, 'reg_lambda': 1.5850410025795e-08}. Best is trial 0 with value: 0.8767618275102205.
[I 2024-08-23 15:54:24,732] Trial 2 finished with value: 0.8338250940491735 and parameters: {'n_estimators': 81, 'learning_rate': 0.012037345820552752, 'max_depth': 9, 

Best hyperparameters for XGBClassifier: {'n_estimators': 162, 'learning_rate': 0.05700049424292851, 'max_depth': 5, 'colsample_bytree': 0.8768005359225837, 'subsample': 0.838865296692839, 'reg_alpha': 1.4736066697527352, 'reg_lambda': 3.940140704054706e-08}
Best F1 score for XGBClassifier: 0.8767618275102205


In [66]:
# Train the model with the best parameters
best_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **best_params_xgb)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('classifier', best_model)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Parameters: { "use_label_encoder" } are not used.



Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91       388
         1.0       0.90      0.77      0.83       239

    accuracy                           0.88       627
   macro avg       0.89      0.86      0.87       627
weighted avg       0.88      0.88      0.88       627



## FFNN

In [69]:
class MultiLayerFFNN(nn.Module):
    '''
    Multilayer FFNN
    '''
    def __init__(self, input_dim, hidden_dims, output_dim=2, dropout=0.5):
        super(MultiLayerFFNN, self).__init__()
        layers = []
        current_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(current_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            current_dim = hidden_dim

        layers.append(nn.Linear(current_dim, output_dim))
        layers.append(nn.LogSoftmax(dim=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

    def predict(self, input_data):
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Disable gradient calculation
            outputs = self(input_data)
            _, predicted = torch.max(outputs, 1)
        return predicted


class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_dims=None, output_dim=2, dropout=0.3, learning_rate=0.001, print=False):
        self.hidden_dims = hidden_dims if hidden_dims is not None else [50, 50]
        self.output_dim = output_dim
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.num_epochs = len(self.hidden_dims) * 50
        self.model = None
        self.criterion = nn.NLLLoss()
        self.optimizer = None
        self.print = print
        self.classes_ = None  # To store unique class labels

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        if 'hidden_dims' in params:
            self.num_epochs = len(self.hidden_dims) * 50
        return self

    def fit(self, X, y):
        input_dim = X.shape[1]
        self.model = MultiLayerFFNN(input_dim, self.hidden_dims, self.output_dim, self.dropout)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y.to_numpy(), dtype=torch.long)

        self.classes_ = np.unique(y)  # Set classes_ attribute

        dataset = TensorDataset(X_tensor, y_tensor)
        train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            self.print and print(f"Epoch [{epoch+1}/{self.num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

        return self

    def predict(self, X):
        X_tensor = torch.tensor(X, dtype=torch.float32)
        predictions = self.model.predict(X_tensor)
        return predictions.numpy()

    def predict_proba(self, X):
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor)
            probabilities = torch.exp(outputs)  # Convert log-softmax back to probabilities
        return probabilities.numpy()

In [70]:
# Example usage
model_ffnn = PyTorchClassifier()
best_params, best_value = optimize_model_with_optuna(model_ffnn, 'FFNN', X_train, y_train, n_trials=15, timeout=120000)

[I 2024-08-23 16:10:40,379] A new study created in memory with name: no-name-c46bf5f9-4826-4a66-bd38-541accd0742d
[I 2024-08-23 16:12:16,635] Trial 0 finished with value: 0.8082688599919416 and parameters: {'hidden_dims': [100, 100]}. Best is trial 0 with value: 0.8082688599919416.
[I 2024-08-23 16:13:52,098] Trial 1 finished with value: 0.8026909643546922 and parameters: {'hidden_dims': [100, 200]}. Best is trial 0 with value: 0.8082688599919416.
[I 2024-08-23 16:14:22,079] Trial 2 finished with value: 0.7992354477003201 and parameters: {'hidden_dims': [50]}. Best is trial 0 with value: 0.8082688599919416.
[I 2024-08-23 16:15:25,522] Trial 3 finished with value: 0.8033062105069539 and parameters: {'hidden_dims': [10, 10]}. Best is trial 0 with value: 0.8082688599919416.
[I 2024-08-23 16:17:05,148] Trial 4 finished with value: 0.80715930529548 and parameters: {'hidden_dims': [100, 200]}. Best is trial 0 with value: 0.8082688599919416.
[I 2024-08-23 16:17:32,371] Trial 5 finished with v

Best hyperparameters for FFNN: {'hidden_dims': [10, 50, 100]}
Best F1 score for FFNN: 0.812291448077214


In [72]:
# Define hyperparameters
hidden_dims = [10, 50, 100]  # Sizes of the hidden layers
dropout = 0.3
learning_rate = 0.001  # Reduce learning rate

# Initialize custom classifier
pytorch_classifier = PyTorchClassifier(hidden_dims=hidden_dims, dropout=dropout, learning_rate=learning_rate)

# Create a pipeline with the custom feature vectorizer and the classifier
pipeline = Pipeline([
    ('classifier', pytorch_classifier)
])

# Fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_test_pred = pipeline.predict(X_test)

# Create and print the classification report on test data
print("Test Data Classification Report for Model with Best Parameters:")
print(classification_report(y_test, y_test_pred))

Test Data Classification Report for Model with Best Parameters:
              precision    recall  f1-score   support

         0.0       0.81      0.89      0.85       388
         1.0       0.79      0.66      0.72       239

    accuracy                           0.80       627
   macro avg       0.80      0.78      0.78       627
weighted avg       0.80      0.80      0.80       627



## Classifier of Choice: ...

BERT model.

Note: The model returns "Trump" and "Not Trump" (NOT 0's and 1's)

In [None]:
# Define the path to the model and tokenizer
model_path = "\\BERT\\"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
# Function to predict whether a tweet is by Trump or not
def predict_tweet(tweet):
    inputs = tokenizer(tweet, return_tensors="pt", padding="max_length", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return "Not Trump" if predictions.item() == 1 else "Trump"

# Test the prediction function
tweet = "This is a sample tweet to test the model."
prediction = predict_tweet(tweet)
print(f"The tweet is predicted to be: {prediction}")
