# Imports

In [1]:
# PyTorch and Transformers installation
# These packages are usually preinstalled, so pip will report "Requirement already satisfied"; the install step is still run so the imports below are guaranteed to work.
# (not needed for the basic classification models)

! pip install transformers datasets
! pip3 install torch

# Installing additional libraries for text preprocessing
!pip install -q preprocessor
!pip install -q contractions
!pip install -q optuna
!pip install holidays
!pip install gensim

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [8]:
# Basic Python
import pandas as pd
import numpy as np
from datetime import datetime
import holidays
import urllib.request
import csv
import requests
import zipfile
import os

# Google Drive
import gdown

# Text Preprocessing
import re
import preprocessor
import contractions
from tqdm import tqdm
tqdm.pandas()

import nltk, scipy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Feature Vectorization
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Optimization
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import f1_score, classification_report, make_scorer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='optuna.distributions')

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# FFNN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Transformer
from scipy.special import softmax
# from transformers import AutoModelForSequenceClassification
# from transformers import TFAutoModelForSequenceClassification
# from transformers import AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Algorithmic approaches:**



1. [sklearn.linear_model.LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)
2. [sklearn.svm.SVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) (try both linear and nonlinear kernels!)
3. FFNN classifier - You should use  the PyTorch library to build a FFNN classifier (with at least one hidden layer) to achieve the classification. Feel free to experiment with the number of layers ([a simple tutorial for FFNN with PyTorch](https://medium.com/biaslyai/pytorch-introduction-to-neural-network-feedforward-neural-network-model-e7231cff47cb)).
4. A fourth classifier of choice (neural or not). You are encouraged to experiment with classifiers that allow combining different types of features (e.g. number of capitalized words, time of tweeting, etc.)
5. A fifth classifier of your choice  (this should be neural -  RNN, or transformer-based) - feel free to experiment.



# Support Code

## Text Preprocess

**Our data cleaning process handles a few modifications:**
1. Define regular expressions for different unwanted patterns and normalize them.

2. Contractions are expanded to their full form (e.g. "don't" becomes "do not") and other unwanted characters are removed.

3. Cleans the text to retain only alphanumeric characters and common punctuation.

4. Converts text to lowercase and removes stop words (user's choice).

5. Lemmatization

6. Removal of rows with missing values.

In [3]:
# Shared, module-level resources used by PreProcessText below.
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests

# Define regex patterns
# URL: "http://" or "https://" followed by one or more allowed characters or %XX escapes.
# Note that "$-_" inside the character class is a range, not three literal characters.
re_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
# HTML tag: "<", then the shortest run of non-"<" characters, then ">".
re_html = re.compile(r'<[^<]+?>')
# Date: three digit groups separated by "/" or "-", with the 2-4 digit group either last or first.
re_date = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b')
# Decimal number, optionally followed by "%".
re_decimal = re.compile(r'\b\d+\.\d+%?\b')
# Whole number, optionally followed by "%".
re_number_percent = re.compile(r'\b\d+%?\b')
# Runs of characters that are NOT ASCII letters, digits, whitespace or one of ? ! . , : ' " /
# "[" and "]" are not in the kept set, so applying this pattern to text that contains
# bracketed tokens strips their brackets.
re_char = re.compile(r'[^0-9a-zA-Z\s?!.,:\'\"//]+')

# Function for cleaning text
def PreProcessText(text, lower=False, remove_stopwords=True):
    """Clean one raw tweet and return it as a single normalised string.

    Steps, in order: repair mis-encoded quote characters, expand contractions,
    strip HTML tags, replace URLs / dates / decimals / whole numbers with the
    placeholders ``[URL]``, ``[DATE]``, ``[DECIMAL]`` and ``[NUM]``, remove
    every character matched by ``re_char``, optionally lowercase, optionally
    drop stop words, lemmatize, and join the tokens with single spaces.

    Args:
        text (str): Raw tweet text. Anything that is not a ``str`` (for
            example the NaN pandas produces for an empty field) yields ``''``.
        lower (bool): Lowercase every token except the placeholders.
        remove_stopwords (bool): Drop tokens that appear in ``stop_words``.

    Returns:
        str: The cleaned, space-joined text.
    """
    # Missing values reach this function as float NaN / None; they have no text to clean.
    if not isinstance(text, str):
        return ''

    # Replacement mappings for common misencoded characters
    replacements = {
        '“': '"', '”': '"', '‘': "'", '’': "'",
        'â\x80\x9c': '"', 'â\x80\x9d': '"', 'â\x80\x99': "'"
    }

    for bad_char, good_char in replacements.items():
        text = text.replace(bad_char, good_char)

    text = contractions.fix(text)  # Expand contractions
    text = re_html.sub('', text)  # Remove HTML tags
    text = re_url.sub('[URL]', text)  # Replace URLs with placeholder
    text = re_date.sub('[DATE]', text)  # Replace dates with placeholder
    text = re_decimal.sub('[DECIMAL]', text)  # Replace decimal numbers with placeholder
    text = re_number_percent.sub('[NUM]', text)  # Replace whole numbers with placeholder

    placeholders = ['[URL]', '[DATE]', '[DECIMAL]', '[NUM]']

    # re_char also removes '[' and ']', which would turn '[URL]' into 'URL' and
    # make every placeholder check below a no-op. Split the text around the
    # placeholders and clean only the pieces in between.
    pieces = re.split(r'(\[(?:URL|DATE|DECIMAL|NUM)\])', text)
    text = ''.join(
        piece if piece in placeholders else re_char.sub("", piece)
        for piece in pieces
    )

    words = text.split()

    if lower:
        # Keep placeholders verbatim; lowercase everything else.
        words = [word if word in placeholders else word.lower() for word in words]

    if remove_stopwords:
        # NOTE(review): the comparison is case-sensitive. Presumably stop_words is all
        # lowercase, so capitalised stop words survive unless lower=True - confirm intended.
        words = [word for word in words if word not in stop_words]

    # Lemmatize everything except the placeholders
    words = [word if word in placeholders else lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

def average_timestamp(timestamps, format='%Y-%m-%d %H:%M:%S'):
    """Return the mean of all parseable timestamps, formatted with ``format``.

    Each entry is parsed once with ``datetime.strptime``. Entries that do not
    match ``format`` (ValueError) or are not strings at all, such as NaN or
    None (TypeError), are skipped.

    Args:
        timestamps (iterable): Candidate timestamp values.
        format (str): ``strptime`` / ``strftime`` format used for both parsing
            and the returned string.

    Returns:
        str: The average timestamp. If no entry can be parsed, the current
        local time is returned instead.
    """
    epoch_seconds = []
    for ts in timestamps:
        try:
            epoch_seconds.append(datetime.strptime(ts, format).timestamp())
        except (ValueError, TypeError):
            # Malformed string or non-string value: not a usable timestamp.
            continue

    if not epoch_seconds:
        return datetime.now().strftime(format)

    mean_seconds = sum(epoch_seconds) / len(epoch_seconds)
    return datetime.fromtimestamp(mean_seconds).strftime(format)


def is_valid_timestamp(timestamp, format='%Y-%m-%d %H:%M:%S'):
    """Report whether ``timestamp`` can be parsed with ``format``.

    Args:
        timestamp: Value to check. Non-string values (NaN, None, numbers) are
            treated as invalid rather than raising.
        format (str): ``datetime.strptime`` format string.

    Returns:
        bool: True if ``datetime.strptime(timestamp, format)`` succeeds.
    """
    try:
        datetime.strptime(timestamp, format)
    except (ValueError, TypeError):
        # ValueError: string does not match the format.
        # TypeError: value is not a string (e.g. NaN from an empty field).
        return False
    return True

## Add Classification to Train

In [4]:
def classify_tweet(device, username, timestamp):
    """
    Derive the training label for a tweet from its device, username and timestamp.

    The caller stores the result in the 'notTrump?' column, so the label is
    0 for a tweet attributed to Trump and 1 for a tweet attributed to someone else.

    Parameters:
    device (str): The device used to send the tweet.
    username (str): The username who sent the tweet.
    timestamp (str): The timestamp of the tweet, formatted '%Y-%m-%d %H:%M:%S'.

    Returns:
    int: 0 if the tweet was sent before 2017-04-01 by 'realDonaldTrump'
         (case-insensitive) from a device whose name contains 'android';
         1 for any other tweet sent before 2017-04-01.
    None: if the tweet was sent on or after 2017-04-01 (it cannot be labelled),
          or if any input cannot be processed (e.g. a missing value or a
          malformed timestamp).
    """
    trump_username = 'realDonaldTrump'

    try:
        trump_cutoff_date = datetime.strptime('2017-04-01', '%Y-%m-%d')
        tweet_date = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')  # Adjust the format as per your data

        # From the cutoff onwards the label cannot be determined.
        if tweet_date >= trump_cutoff_date:
            return None

        is_trump_account = username.lower() == trump_username.lower()
        if is_trump_account and 'android' in device.lower():
            return 0
        return 1
    except Exception:
        # Deliberate best-effort: rows that cannot be labelled are returned as None
        # so the caller can drop them.
        return None

## Create a complex features vector

This vectorizer is meant to dynamically feed a different feature vector into each model during the optimization process, based on the feature combination that tested best for that model.

This will allow combination of independent features, TF-IDF and Word2Vec as model input.

In [5]:
class FeatureVectorizer(BaseEstimator, TransformerMixin):
    """Build one dense feature matrix from a selectable set of feature groups.

    Supported groups (selected through ``vector_types``):
      * 'tfidf'      - TF-IDF over the 'cleaned text' column.
      * 'word2vec'   - Word2VecVectorizer output for the frame.
      * 'additional' - AdditionalFeatures output for the frame.

    Args:
        vector_types (sequence of str): Feature groups to include. Names other
            than the three above are ignored.
        country (str): Country code passed to AdditionalFeatures.
        word2vec_size (int): Vector size passed to Word2VecVectorizer.
        tfidf_max_features (int): ``max_features`` passed to TfidfVectorizer.
    """

    # Immutable default: a list default would be shared by every instance.
    def __init__(self, vector_types=('tfidf', 'word2vec', 'additional'), country='US', word2vec_size=100, tfidf_max_features=5000):
        self.vector_types = vector_types
        self.country = country
        self.word2vec_size = word2vec_size
        self.tfidf_max_features = tfidf_max_features
        self._build_components()

    def _build_components(self):
        """(Re)create the sub-transformers from the current hyperparameters."""
        self.additional_features = AdditionalFeatures(self.country)
        self.word2vec_vectorizer = Word2VecVectorizer(size=self.word2vec_size)
        self.tfidf_vectorizer = TfidfVectorizer(max_features=self.tfidf_max_features)

    def fit(self, X, y=None):
        """Fit every selected sub-transformer on ``X`` and return ``self``."""
        # Rebuild first so values changed through set_params() after construction
        # are honoured instead of the ones captured in __init__.
        self._build_components()

        if 'tfidf' in self.vector_types:
            self.tfidf_vectorizer.fit(X['cleaned text'])
        if 'word2vec' in self.vector_types:
            self.word2vec_vectorizer.fit(X)
        if 'additional' in self.vector_types:
            self.additional_features.fit(X)
        return self

    def transform(self, X):
        """Return the selected feature groups stacked column-wise.

        Column order is always tfidf, word2vec, additional (for the groups
        that are selected).

        Raises:
            ValueError: If ``vector_types`` selects none of the known groups.
        """
        features = []
        if 'tfidf' in self.vector_types:
            # Densified so it can be stacked with the other (dense) groups.
            tfidf_features = self.tfidf_vectorizer.transform(X['cleaned text']).toarray()
            features.append(tfidf_features)
        if 'word2vec' in self.vector_types:
            word2vec_features = self.word2vec_vectorizer.transform(X)
            features.append(word2vec_features)
        if 'additional' in self.vector_types:
            additional_features = self.additional_features.transform(X)
            features.append(additional_features)

        if not features:
            raise ValueError(
                f"vector_types={self.vector_types!r} selects no feature group; "
                "expected at least one of 'tfidf', 'word2vec', 'additional'"
            )

        # Concatenate all feature vectors
        combined_features = np.hstack(features)
        return combined_features

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its transformed feature matrix."""
        return self.fit(X, y).transform(X)

class AdditionalFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted per-tweet features taken from the raw text and the timestamp.

    Args:
        country (str): Country code used to build the holiday calendar.
    """

    def __init__(self, country='US'):
        # Stored so BaseEstimator.get_params() (used by repr and clone) can read it back.
        self.country = country
        self.holiday_calendar = holidays.CountryHoliday(country)

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return one feature row per tweet, read from 'tweet text' and 'time stamp'."""
        return np.array([self.extract_features(text, timestamp) for text, timestamp in zip(X['tweet text'], X['time stamp'])])

    def extract_features(self, text, timestamp):
        """Compute the feature row for a single tweet.

        Args:
            text: Raw tweet text.
            timestamp: Timestamp formatted '%Y-%m-%d %H:%M:%S'.

        Returns:
            list: [num_capitalized, num_words, num_exclamations, num_questions,
            hour_of_day, day_of_week, month, year, is_holiday]. ``num_capitalized``
            counts uppercase characters. A group that cannot be computed keeps
            the value 0 and a message is printed.
        """
        # Initialize all feature values to 0
        num_capitalized = 0
        num_words = 0
        num_exclamations = 0
        num_questions = 0
        hour_of_day = 0
        day_of_week = 0
        month = 0
        year = 0
        is_holiday = 0

        try:
            num_capitalized = sum(1 for c in text if c.isupper())
            num_words = len(text.split())
            num_exclamations = text.count('!')
            num_questions = text.count('?')
        except Exception as e:
            print(f"Error processing text: {text}. Error: {e}")

        try:
            dt = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
            hour_of_day = dt.hour
            day_of_week = dt.weekday()
            month = dt.month
            year = dt.year
            is_holiday = 1 if dt in self.holiday_calendar else 0
        except (ValueError, TypeError) as ve:
            # ValueError: malformed string. TypeError: non-string value such as NaN.
            print(f"Error processing timestamp: {timestamp}. Error: {ve}")

        return [num_capitalized, num_words, num_exclamations, num_questions, hour_of_day, day_of_week, month, year, is_holiday]

class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its tokens' Word2Vec vectors.

    Args:
        size (int): Dimensionality of the word vectors.
        min_count (int): ``min_count`` passed to Word2Vec.
    """

    def __init__(self, size=100, min_count=1):
        self.size = size
        self.min_count = min_count
        self.model = None  # set by fit()

    def fit(self, X, y=None):
        """Train a Word2Vec model on the whitespace-tokenised 'cleaned text' column."""
        corpus = [document.split() for document in X['cleaned text']]
        self.model = Word2Vec(
            corpus,
            vector_size=self.size,
            min_count=self.min_count,
        )
        return self

    def transform(self, X):
        """Return an array with one averaged vector per row of 'cleaned text'."""
        rows = [self.vectorize(document) for document in X['cleaned text']]
        return np.array(rows)

    def vectorize(self, text):
        """Average the vectors of the tokens known to the model.

        Returns a zero vector of length ``size`` when no token is known.
        """
        known_vectors = [
            self.model.wv[token]
            for token in text.split()
            if token in self.model.wv
        ]
        if known_vectors:
            return np.mean(known_vectors, axis=0)
        return np.zeros(self.size)

## Self Developed Models

In [6]:
class MultiLayerFFNN(nn.Module):
    """Feed-forward classifier.

    One (Linear -> ReLU -> Dropout) group per entry of ``hidden_dims``,
    followed by a Linear output layer and LogSoftmax over dimension 1.
    """

    def __init__(self, input_dim, hidden_dims, output_dim=2, dropout=0.5):
        super().__init__()

        # Consecutive pairs of widths give each hidden layer's (in, out) sizes.
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]

        stack += [nn.Linear(widths[-1], output_dim), nn.LogSoftmax(dim=1)]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the log-probabilities produced by the layer stack."""
        return self.model(x)

    def predict(self, input_data):
        """Return the index of the highest-scoring class for each input row.

        Switches the module to evaluation mode and runs without gradients.
        """
        self.eval()
        with torch.no_grad():
            scores = self(input_data)
            best = torch.max(scores, 1).indices
        return best


class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper that trains a MultiLayerFFNN with Adam and NLLLoss.

    Args:
        hidden_dims (list of int or None): Hidden layer widths; ``None`` means [50, 50].
        output_dim (int): Number of output classes.
        dropout (float): Dropout probability used after every hidden layer.
        learning_rate (float): Adam learning rate.
        print (bool): If True, print the mean loss after every epoch.

    The number of epochs is derived from the architecture: 50 per hidden layer.
    """

    def __init__(self, hidden_dims=None, output_dim=2, dropout=0.3, learning_rate=0.001, print=False):
        self.hidden_dims = hidden_dims if hidden_dims is not None else [50, 50]
        self.output_dim = output_dim
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.num_epochs = len(self.hidden_dims) * 50
        self.model = None
        self.criterion = nn.NLLLoss()
        self.optimizer = None
        self.print = print

    def set_params(self, **params):
        """Set attributes from ``params`` and keep ``num_epochs`` in sync with ``hidden_dims``."""
        for param, value in params.items():
            setattr(self, param, value)
        if 'hidden_dims' in params:
            self.num_epochs = len(self.hidden_dims) * 50
        return self

    def fit(self, X, y):
        """Train a fresh network on ``X`` / ``y`` and return ``self``.

        Args:
            X: 2-D numeric array; ``X.shape[1]`` sets the network's input width.
            y: Class labels as any array-like (pandas Series, numpy array, list).
               They are converted to ``torch.long``.
        """
        input_dim = X.shape[1]
        self.model = MultiLayerFFNN(input_dim, self.hidden_dims, self.output_dim, self.dropout)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        X_tensor = torch.tensor(X, dtype=torch.float32)
        # np.asarray accepts Series, ndarray and list alike; y.to_numpy() only worked for pandas objects.
        y_tensor = torch.tensor(np.asarray(y), dtype=torch.long)

        dataset = TensorDataset(X_tensor, y_tensor)
        train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

        for epoch in range(self.num_epochs):
            self.model.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            if self.print:
                print(f"Epoch [{epoch+1}/{self.num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

        return self

    def predict(self, X):
        """Return the predicted class index for every row of ``X`` as a numpy array."""
        X_tensor = torch.tensor(X, dtype=torch.float32)
        predictions = self.model.predict(X_tensor)
        return predictions.numpy()

## Load and Predict with Pre-Trained BERT

In [7]:
def load_model_and_tokenizer(model_url, model_path):
    """
    Download the model archive, extract it, and load DistilBERT from it.

    Args:
      model_url (str): The link for the model itself in google drive
      model_path (str): Path to the directory that receives the extracted model files.

    Returns:
      model: The loaded DistilBERT model.
      tokenizer: The loaded DistilBERT tokenizer.
    """

    def download_model_files(url, output_path):
        """
        Download and extract model files from a URL.

        Args:
        url (str): URL to the zip file containing model files.
        output_path (str): Path to extract the downloaded zip file.
        """
        # Ensure the output directory exists (no error if it already does)
        os.makedirs(output_path, exist_ok=True)

        zip_path = os.path.join(output_path, 'model.zip')

        try:
            # Download the zip file
            gdown.download(url, zip_path, quiet=False)

            # Extract the zip file
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(output_path)
        finally:
            # Remove the archive even when the download or extraction fails,
            # so a broken zip is not left behind in the model directory.
            if os.path.exists(zip_path):
                os.remove(zip_path)

    # Download and extract model files
    download_model_files(model_url, model_path)

    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer

In [27]:
def bert_predict(model, tokenizer, df, column_name = 'tweet text', batch_size = 128):
    """
    Predict the labels for the tweet texts in the specified column of the DataFrame in batches.
    Uses the trained model and tokenizer to predict the labels for the given df, based on column name.

    Args:
    model: The DistilBERT model (a torch module). It is switched to evaluation mode.
    tokenizer: The DistilBERT tokenizer.
    df (pd.DataFrame): The DataFrame containing the tweet texts.
    column_name (str): The name of the column containing tweet texts.
    batch_size (int): The number of samples per batch.

    Returns:
    pd.Series: The predicted labels (int64), one per row of ``df``, in row order.
    """
    # Inference must not apply dropout, whatever mode the caller left the model in.
    model.eval()

    # Run every batch on the device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    texts = df[column_name].tolist()
    predictions = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

        # Make Predictions
        with torch.no_grad():
            outputs = model(**tokens)
        batch_predictions = torch.argmax(outputs.logits, dim=1)
        # .cpu() is a no-op on CPU and required before .numpy() on any other device.
        predictions.extend(batch_predictions.cpu().numpy().tolist())

    # Explicit dtype keeps an empty result integer-typed as well.
    return pd.Series(predictions, dtype='int64')

## Pre-Calculated Best Parameters

In [9]:
# Pre-computed settings, keyed by the integer `alg` id that training_pipeline()
# uses to index both this dict and `models` below.
#   'model_name'       - label for the algorithm
#   'vector'           - feature groups handed to FeatureVectorizer(vector_types=...)
#   'best_hyperparams' - keyword arguments for the estimator
# Note the numbering used here: 3 is XGBClassifier and 4 is the FFNN.
hyperparameters = {
    1: {
        'model_name': 'LogisticRegression',
        'vector': ['tfidf'],
        'best_hyperparams': {'C': 4.008932259744248}
    },
    2: {
        'model_name': 'SVC',
        'vector': ['tfidf', 'additional'],
        'best_hyperparams': {
            'C': 0.8412136628232837,
            'kernel': 'linear'
        }
    },
    3: {
        'model_name': 'XGBClassifier',
        'vector': ['tfidf', 'additional'],
        'best_hyperparams': {
            'n_estimators': 82,
            'learning_rate': 0.1877546968777687,
            'max_depth': 7,
            'colsample_bytree': 0.895288061016391,
            'subsample': 0.7014186462764306,
            'reg_alpha': 0.51961722082151,
            'reg_lambda': 1.1930319472458693e-05
        }
    },
    4: {
        'model_name': 'FFNN',
        'vector': ['tfidf'],
        'best_hyperparams': {
            'hidden': [50]  # hidden layer widths, passed as PyTorchClassifier(hidden_dims=...)
        }
    },
    # Entry 5 carries no feature groups or hyperparameters: training_pipeline()
    # does not fit it, it only uses the loaded model/tokenizer pair.
    5: {
        'model_name': 'DistilBERT',
        'vector': None,
        'best_hyperparams': None
    }
}

# DistilBERT: where the model archive is downloaded from and extracted to
model_url = 'https://drive.google.com/uc?export=download&id=1UXjztbH_z-dpLmlNjZuSQyUnNIyUGnqt'
model_path = '/content/distilbert_trump_classifier'

# Initialize Models
# One module-level instance per algorithm id; training_pipeline() looks the estimator up here.
# Building entry 5 calls load_model_and_tokenizer(), which downloads and extracts the
# model archive as soon as this cell runs.
models = {
    1: LogisticRegression(max_iter=1000, **hyperparameters[1]['best_hyperparams']),
    2: SVC(**hyperparameters[2]['best_hyperparams']),
    3: XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **hyperparameters[3]['best_hyperparams']),
    4: PyTorchClassifier(hidden_dims=hyperparameters[4]['best_hyperparams']['hidden']),
    5: load_model_and_tokenizer(model_url, model_path)  # model, tokenizer
}

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1UXjztbH_z-dpLmlNjZuSQyUnNIyUGnqt
From (redirected): https://drive.google.com/uc?export=download&id=1UXjztbH_z-dpLmlNjZuSQyUnNIyUGnqt&confirm=t&uuid=a1e0549c-fc3f-44f6-a945-c8f3fd282587
To: /content/distilbert_trump_classifier/model.zip
100%|██████████| 244M/244M [00:04<00:00, 57.3MB/s]


# API

In [19]:
def training_pipeline(alg, train_fn = 'trump_train.tsv', test = True):
  """Returns a trained model given the specific algorithm.

    The training file is loaded, the tweet text is cleaned, invalid timestamps
    are replaced by the average valid timestamp, labels are derived with
    classify_tweet(), and rows with missing values (including rows that could
    not be labelled) are dropped before fitting.

    Args:

        alg (int): an integer between 1-5 used as the key into the module-level
                    `models` / `hyperparameters` dicts (1: LogisticRegression,
                    2: SVC, 3: XGBClassifier, 4: FFNN, 5: pre-trained DistilBERT).
        train_fn (str): full path to the file containing the training data.
        test (bool): Train the model on subset of the data and show classification report. If False - train on all data.

    Returns:
      For alg 1-4: the fitted sklearn Pipeline (feature vectorizer + classifier).
      For alg 5: the (model, tokenizer) tuple stored in `models[5]`; nothing is fitted.
  """
  # Local import: only this function needs it.
  from sklearn.base import clone

  # Define the column names
  column_names = ['tweet id', 'user handle', 'tweet text', 'time stamp', 'device']

  # Load the TSV file into a DataFrame with specified column names
  df = pd.read_csv(train_fn, sep='\t', names=column_names, header=None)

  # Process the Text
  print('Pre-Process Data')
  df["cleaned text"] = df["tweet text"].progress_apply(PreProcessText)

  avg_timestamp = average_timestamp(df['time stamp'])
  df["time stamp"] = df["time stamp"].apply(lambda ts: ts if is_valid_timestamp(ts) else avg_timestamp)

  # Apply the classification function to the DataFrame
  df['notTrump?'] = df.apply(lambda row: classify_tweet(row['device'], row['user handle'], row['time stamp']), axis=1)
  # Drops every row with a missing value, which includes rows classify_tweet() returned None for.
  df = df.dropna()

  # Separate features and labels
  X = df.drop(columns=['notTrump?']).copy()
  # The None labels made the column float; cast back so classifiers learn and emit 0/1 integers.
  y = df['notTrump?'].astype(int)

  if test:
    # Hold out 20% of the rows for the classification report below
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  print('Fit model to Best Hyperparameters')
  if alg < 5:
    # models[alg] is a shared module-level instance. Fitting it directly would overwrite
    # the state of any pipeline returned by an earlier call, so fit an unfitted copy.
    model, vector = clone(models[alg]), hyperparameters[alg]['vector']

    # Create a pipeline with the custom feature vectorizer and the classifier
    pipeline = Pipeline([
        ('features', FeatureVectorizer(vector_types=vector, country='US')),
        ('classifier', model)
    ])

    # Fit the pipeline on the training rows (all rows when test is False)
    pipeline.fit(X, y)
  else:
    # Use the already loaded DistilBERT model and tokenizer
    pipeline = models[alg]
    model, tokenizer = pipeline

  if test:
    if alg < 5:
      y_pred = pipeline.predict(X_test)
    else:
      y_pred = bert_predict(model, tokenizer, X_test, column_name='tweet text')

    print(classification_report(y_test, y_pred))

  return pipeline

In [12]:
def retrain_best_model(train_fn=None):
  """ Retrains and returns the best performing model for the specified task. The
      function uses the hard coded settings you have found to work best for each
      of the tasks.

      Args:
        train_fn (str, optional): full path to the training file. When omitted,
          the module-level variable `train_fn` is used if it exists; otherwise
          'trump_train.tsv' (the default of training_pipeline) is used.

      Returns:
        The pipeline returned by training_pipeline() for algorithm 3, trained
        on all of the data (test=False).
  """
  if train_fn is None:
    # Previously the global was read unconditionally, which raised NameError when this
    # function was called before the cell that defines `train_fn` had run.
    train_fn = globals().get('train_fn', 'trump_train.tsv')

  # Algorithm 3 is the configuration selected as best
  m = training_pipeline(3, train_fn, test = False)

  return m

In [21]:
def predict(pipeline, fn):
  """ Returns a list of 0s and 1s, corresponding to the lines in the specified file.

    Args:
      pipeline: the trained model and process to be used: either a fitted
        sklearn Pipeline or a (model, tokenizer) tuple for DistilBERT.
      fn: the full path to a tab-separated file with the columns
        user handle, tweet text, time stamp (no header row).

    Returns:
      list of int: one prediction per row of the file, in file order.
  """
  print('Pre-Process Data')
  # Define the column names
  column_names = ['user handle', 'tweet text', 'time stamp']

  # Load the TSV file into a DataFrame with specified column names
  data = pd.read_csv(fn, sep='\t', names=column_names, header=None)

  # Every input line needs a prediction, so rows cannot be dropped:
  # a missing tweet becomes an empty string instead of NaN.
  data["tweet text"] = data["tweet text"].fillna('').astype(str)

  # Process the Text
  data["cleaned text"] = data["tweet text"].progress_apply(PreProcessText)
  avg_timestamp = average_timestamp(data['time stamp'])
  data["time stamp"] = data["time stamp"].apply(lambda ts: ts if is_valid_timestamp(ts) else avg_timestamp)

  print('Predict')
  if isinstance(pipeline, Pipeline): # either a sci-kit pipline object or a trained BERT model
    predictions = pipeline.predict(data)
  else:
    predictions = bert_predict(pipeline[0], pipeline[1], data, column_name='tweet text')

  # Plain Python ints, even if the classifier was trained on float labels
  return [int(label) for label in predictions]

In [29]:
def who_am_i():  # this is not a class method
    """Return the authors' identifying details.

    Returns:
        dict: keys 'name', 'id' and 'email', each mapping to a list with one
        entry per author (same author order in every list).
    """
    authors = (
        ('Shahar Oded', '208388918', 'odedshah@post.bgu.ac.il'),
        ('Omri Haller', '208524413', 'haller@post.bgu.ac.il'),
    )
    names, ids, emails = (list(column) for column in zip(*authors))
    return {'name': names, 'id': ids, 'email': emails}

# Playground

In [31]:
# Main
# Input files. `train_fn` is also the module-level name retrain_best_model() reads.
train_fn = '/content/trump_train.tsv'
test_fn = '/content/trump_tweets_test_a.tsv'

# Evaluation run: algorithm 3 with test=True fits on 80% of the rows and prints a
# classification report for the held-out 20%.
model = training_pipeline(3, train_fn, test = True)

Pre-Process Data


100%|██████████| 3156/3156 [00:00<00:00, 9351.80it/s]


Fit model to Best Hyperparameters
              precision    recall  f1-score   support

         0.0       0.87      0.94      0.91       376
         1.0       0.90      0.80      0.85       255

    accuracy                           0.88       631
   macro avg       0.89      0.87      0.88       631
weighted avg       0.88      0.88      0.88       631



In [33]:
# Refit algorithm 3 on the whole training file (test=False); this rebinds `model`.
model = retrain_best_model()

Pre-Process Data


100%|██████████| 3156/3156 [00:00<00:00, 9247.27it/s]


Fit model to Best Hyperparameters


In [36]:
# Predict a label for every line of the test file and show them as one space-separated string.
res = predict(model, test_fn)
' '.join([str(pred) for pred in res])

Pre-Process Data


100%|██████████| 158/158 [00:00<00:00, 4514.64it/s]

Predict





'1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0'