In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/500k-chatgpt-tweets-jan-mar-2023/Twitter Jan Mar.csv
/kaggle/input/edgetier-takehome/labled_1.csv
/kaggle/input/edgetier-takehome/active_learning_labled - reinforced_learning.csv
/kaggle/input/edgetier-takehome/labled_2.csv
/kaggle/input/edgetier-takehome/labled_3.csv
/kaggle/input/edgetier-takehome/df_higher_quality.csv
/kaggle/input/edgetier-takehome/bert_zeroshot (1).csv
/kaggle/input/edgetier-takehome/tweets_clean.npy
/kaggle/input/edgetier-takehome/bert_zero_shot_non-technical.csv
/kaggle/input/edgetier-takehome/multinomial_nb_model.pkl


## AIM: Build a Binary Classifier Model to Detect Technical Content in Tweets.

### Action Plan
Identifying technical content within a vast dataset of 500k tweets posed challenges. Traditional methods such as keyword and hashtag analysis yielded only 1% relevant tweets. We refined the dataset to 200k tweets and employed BERTopic to identify potential technical tweets, resulting in 440 candidates. These were manually reviewed to ensure accuracy, leading to a balanced dataset of 800 tweets with a 50/50 split between technical and non-technical content. We then built a Naive Bayes classifier, continually improving it by incorporating human-labeled tweets with low model confidence for continuous learning.

* **Step 1: Identifying Potentially Technical Tweets with BERT Zero-Shot Classification**  
    - Filtered out tweets with excessive links or mentions, resulting in a refined dataset of 200,000 tweets.
    - Employed BERTopic (zero-shot classification) with an 85% probability threshold, identifying 440 candidate technical tweets for manual verification.

* **Step 2: Creating a Training/Test Set for Naive Bayes Classifier from Labeled BERT Data**  
    - Manually reviewed candidate tweets to correct any misclassifications.
    - Constructed a balanced dataset of 800 tweets with equal representation of technical and non-technical content.

* **Step 3: Training Initial Model on Labeled Data from BERT & Manual Review**  
    - Trained a Naïve Bayes classifier for efficient and interpretable results, reserving 15% of labeled tweets for testing.

* **Step 4: Developing Functions for Active Learning**  
    - Built three functions to facilitate the active learning process, ensuring we did not sample from data previously used to train the Naive Bayes Model.

* **Step 5: Active Learning Iterations**  
    - Reviewed and labeled tweets with low confidence scores (55%-75%), incorporating newly categorized data back into the model for continuous improvement.

* **Step 6: Saving the Model to Hugging Face & Testing on Tweets**  
    - Saved the model to Hugging Face and tested it on sample tweets.

In [3]:
!pip install bertopic
!pip install joblib

import nltk
from collections import Counter
import random
import re
from textblob import TextBlob
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

import joblib

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Using cached Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

2024-03-27 05:52:30.795521: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 05:52:30.795751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 05:52:30.977142: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Read the CSV file containing tweet data
df = pd.read_csv('/kaggle/input/500k-chatgpt-tweets-jan-mar-2023/Twitter Jan Mar.csv')

# Define a set of keywords that might indicate technical content
simple_keyword_set = ['login', 'timeout', 'authentication', 'server']

# Function to clean tweet text: lowercase, remove URLs, hashtags, and special characters
def clean_tweet(text):
  text = text.lower()
  text = re.sub(r'http\S+', '', text)  # Remove URLs
  text = re.sub(r'[@#]', '', text)     # Remove mentions and hashtags
  text = re.sub(r"(?P<url>http[s]?:\/\/[^\s]+)", "", text)  # Remove remaining URLs with more complex patterns
  return text

# Function to check if a tweet contains a link
def contains_link(text):
  url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # Regular expression for URL detection
  if re.search(url_pattern, text):
    return 1  # Tweet contains a link
  else:
    return 0  # Tweet does not contain a link

# Function to search for simple keywords in the tweet text
def simple_keyword_search(text):
  for word in simple_keyword_set:
    if word in text:
      return 1  # Keyword found in the text
      break  # Exit the loop if a keyword is found
  else:
    return 0  # No keywords found

# Function to perform stemming on text (reduce words to their base form)
def stem_text(text):
  stemmer = PorterStemmer()  # Import PorterStemmer class from nltk (assuming it's installed)
  stemmed_text = stemmer.stem(text)
  return stemmed_text

# Preprocess the tweet content column
df['content'] = df['content'].astype(str).str.lower()  # Ensure content is string and lowercase
df['tweet_cleaned'] = df['content'].apply(clean_tweet)
df['stem'] = df['tweet_cleaned'].apply(stem_text)  # Apply stemming (optional)

# Add new features to the dataframe based on the content
df['contains_link'] = df['content'].apply(contains_link)
df['tweet_length'] = df['content'].str.len()  # Character length of the tweet
df['num_hashtags'] = df['content'].str.split('#').str.len() - 1  # Number of hashtags (excluding the first occurrence)
df['num_mentions'] = df['content'].str.split('@').str.len() - 1  # Number of mentions (excluding the first occurrence)

# Search for simple keywords in the cleaned tweet text
df['contains_keyword'] = df['tweet_cleaned'].apply(simple_keyword_search)

# Calculate percentage and total count of tweets containing keywords
percent_contains_keyword = (df['contains_keyword'].sum() / len(df['contains_keyword'])) * 100
total_num_containing = df['contains_keyword'].sum()

# Print informative message about the analysis results
print(f"""
A Total of {total_num_containing} Tweets contain keywords that May indicate the content is Technical in Nature 
This accounts for {percent_contains_keyword:.2f}% of Total Tweets.
""")


A Total of 1987 Tweets contain keywords that May indicate the content is Technical in Nature 
This accounts for 0.40% of Total Tweets.



## Step 1: Identifying Potentially Technical Tweets with BERT Zero-Shot Classification

Our Dataset contains a lot of Tweets that are pretty spammy in nature & won't be super useful for our Model. I want to initially filter out Tweets that May be spammy in Nature (Containing links, or lots of Tags/Mentions) and use this higher Quality Dataframe with BERTopic's ZeroShot classification ot find Tweets that BERT model believes are Technical in nature:

* **Strategy:**
    1. **Filter High-Quality Tweets:** 
        * Create a subset of tweets focusing on potentially more informative content:
            * Exclude tweets containing links (potentially less textual information).
            * Select tweets with a minimum length of 20 characters (potentially more substance).
            * Limit tweets with excessive hashtags or mentions (potentially less focused content).
    2. **Feature Engineering:**
        * Extract two text representations for each tweet within the high-quality subset:
            * **Basic Processing:** Lowercase text, apply stemming, remove stop words, and tokenize.
            * **Advanced Processing:** Perform all steps from basic processing, additionally removing hashtags and mentions while preserving the overall sentence structure.
    3. **BERT Zero-Shot Classification:**
        * Utilize these text representations as input for BERT's Zero-Shot classification. 
        * This approach leverages cosine similarity to categorize text snippets based on predefined labels (e.g., "technical", "non-technical").
        * The goal is to identify the text representation (basic vs. advanced processing) that yields the most accurate classification for technical tweets.

By focusing on high-quality tweets and exploring different text representations, we aim to improve the accuracy of identifying potentially technical tweets using BERT Zero-Shot classification. 

In [6]:
# Define criteria for identifying higher quality tweets
df_higher_quality = df[(df['contains_link'] == 0) & (df['tweet_length'] > 20) & (df['num_hashtags'] < 5) & (df['num_mentions'] < 5)]

# Extract tweet IDs and dataframe indices for the higher quality tweets
ID_HIGH_QUALITY = df_higher_quality['id'].to_list()
INDEX_HIGHER_QUALITY_TWEETS = df_higher_quality.index.to_list()

# Set a random seed for reproducibility
random.seed(42)

# Get the total number of high-quality tweets
num_high_quality_tweets_ = len(INDEX_HIGHER_QUALITY_TWEETS)

# Randomly sample half of the high-quality tweets for BERT zero-shot task
INDEX_BERT_ZERO_SHOT = random.sample(INDEX_HIGHER_QUALITY_TWEETS, num_high_quality_tweets_ // 2)

# Create a list of indices for unused high-quality tweets (remaining half)
INDEX_UNUSED = [item for item in INDEX_HIGHER_QUALITY_TWEETS if item not in INDEX_BERT_ZERO_SHOT]

# Create a dataframe containing the tweets selected for BERT zero-shot task
df_bert_zeroshot = df.iloc[INDEX_BERT_ZERO_SHOT]

# Save the dataframe to make it easier to come back to. 
# df_bert_zeroshot.to_csv('bert_zeroshot.csv', index=True)
# df_higher_quality.to_csv('df_higher_quality.csv', index=True)

In [4]:
# Load the pre-processed data for BERT zero-shot classification
df_bert_zeroshot = pd.read_csv('/kaggle/input/edgetier-takehome/bert_zeroshot (1).csv')

# Extract the pre-processed text representations (cleaned and stemmed)
doc_stem = df_bert_zeroshot['stem']
doc_clean = df_bert_zeroshot['tweet_cleaned']

# Define a list of example technical issue categories used for zero-shot classification
technical_issue_categories_stem = [
    "login issue",
    "error message",
    "server issue",
    "website issue",
]

# Initialize BERTopic model with specific configurations
topic_model = BERTopic(
    embedding_model="thenlper/gte-small",  # Pre-trained language model for text embedding
    min_topic_size=15,  # Minimum number of documents per topic
    zeroshot_topic_list=technical_issue_categories_stem,  # List of categories for zero-shot classification
    zeroshot_min_similarity=.85,  # Minimum cosine similarity threshold for assigning a zero-shot label
    representation_model=KeyBERTInspired()  # Text representation model (optional)
)

# Fit the BERTopic model on the cleaned tweets
topics, _ = topic_model.fit_transform(doc_clean)

# Define a function to convert zero-shot labels to "Technical" or "Non-Technical"
def zero_shot_to_label(text):
    if text in technical_issue_categories_stem:
        text = 'Technical'
    else:
        text = 'Non-Technical'
    return text

# Apply the zero_shot_to_label function to create a new "label" column for classification
df_bert_zeroshot['zero_shot_label'] = topic_model.get_document_info(doc_clean)['Name']
df_bert_zeroshot['label'] = df_bert_zeroshot['zero_shot_label'].apply(zero_shot_to_label)

# Filter dataframes based on the assigned labels
df_bert_technical = df_bert_zeroshot[df_bert_zeroshot['label'] == 'Technical']
df_bert_non_technical = df_bert_zeroshot[df_bert_zeroshot['label'] == 'Non-Technical']

# Calculate and print informative statistics about the zero-shot classification results
total_sample = len(doc_clean)
total_labled_technical = df_bert_technical['label'].count()
percent_labled_technical = (len(df_bert_technical['label']) / len(df_bert_zeroshot['label'])) * 100

print(f"""
We fed BERT {total_sample} embeddings from our original 500K Tweet Dataset. 
A Total of {total_labled_technical} Tweets were categorised as Technical.
This accounts for {percent_labled_technical:.2f}% of Total Tweets in the Batch we fed the pre-trained Model.
""")

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid 


We fed BERT 92266 embeddings from our original 500K Tweet Dataset. 
A Total of 263 Tweets were categorised as Technical.
This accounts for 0.29% of Total Tweets in the Batch we fed the pre-trained Model.



In [18]:
# Save the classified dataframes to separate CSV files to make process a little easier

# DataFrame containing tweets classified as 'Technical' (potentially require further labeling)
# df_bert_technical.to_csv('bert_zero_shot_unlabled.csv', index=False)

# DataFrame containing tweets classified as 'Non-Technical'
# df_bert_non_technical.to_csv('bert_zero_shot_non-technical.csv', index=False)

## Step 2: Turn our Labled BERT Data into a Training / Test Set for Naive Bayes classifier

This section loads the dataframes required for training the Naive Bayes model and performs some initial cleaning steps.

* **Load Dataframes:** We load two dataframes:
    * `df_labled_active_learning`: This dataframe contains tweets that have been labeled as technical or non-technical (potentially through an initial active learning process).
    * `df_bert_non_technical`: This dataframe contains tweets identified as non-technical using BERT zero-shot classification.
* **Balance the Dataset:** We aim for a balanced dataset with an equal number of technical and non-technical tweets. The script calculates the number of technical tweets and the number of non-technical tweets needed to achieve this balance.
* **Sample Non-Technical Tweets:** Randomly samples tweets from the `df_bert_non_technical` dataframe to reach the desired balance.
* **Combine Data:** Creates a new dataframe `df_bayes` by combining the labeled data and the sampled non-technical tweets.
* **Optional Text Preprocessing:** Defines a function `preprocess` for text cleaning (tokenization, stop word removal, stemming). This step might be redundant depending on the pre-processing applied to `df_labled_active_learning` and `df_bert_non_technical`.  

In [15]:
# Load the updated dataframes after manual correction
#   * Active learning labeled data (potentially containing both technical and non-technical tweets)
df_labled_active_learning = pd.read_csv('/kaggle/input/edgetier-takehome/active_learning_labled - reinforced_learning.csv')

#   * Non-technical tweets identified using BERT zero-shot classification
df_bert_non_technical = pd.read_csv('/kaggle/input/edgetier-takehome/bert_zero_shot_non-technical.csv')

# Calculate the number of technical tweets in the active learning data
technical_count = (df_labled_active_learning['label'] == 'Technical').sum()

# Determine the number of non-technical tweets needed to balance the dataset
non_technical_needed = len(df_labled_active_learning) - technical_count

# Sample non-technical tweets randomly (set a seed for reproducibility)
random.seed(42)
random_sample_non_tech = random.sample(df_bert_non_technical['id'].to_list(), non_technical_needed)

# Create a dataframe containing the sampled non-technical tweets
df_bert_non_technical_ = df_bert_non_technical[df_bert_non_technical['id'].isin(random_sample_non_tech)]

# Combine data for training the Naive Bayes classifier
id_ = df_labled_active_learning['id'].to_list() + random_sample_non_tech
content_ = df_labled_active_learning['content'].to_list() + df_bert_non_technical_['content'].to_list()
tweet_clean_ = df_labled_active_learning['tweet_cleaned'].to_list() + df_bert_non_technical_['tweet_cleaned'].to_list()
stem_ = df_labled_active_learning['stem'].to_list() + df_bert_non_technical_['stem'].to_list()
label_ = df_labled_active_learning['label'].to_list() + df_bert_non_technical_['label'].to_list()

# Create a new dataframe (df_bayes) to store the combined data
df_bayes = pd.DataFrame(
{
    'id': id_,
    'content': content_,
    'tweet_cleaned': tweet_clean_,
    'stem': stem_,
    'label': label_
}
)

# Define a function for text pre-processing (optional, might be redundant)
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words and token.lower() != "'s"]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

# Apply pre-processing to the 'tweet_cleaned' column (might be redundant)
#  Finally! we have a dataset containing 50% Technical Tweets and 50% non-Technical we can Train a Naive Bayes classifier on
df_bayes['preprocess'] = df_bayes['tweet_cleaned'].apply(preprocess)

## Step 3: Train Initial Model on Labled Data from BERT & Manual Review

This section prepares the data for training, trains a Naive Bayes model, and evaluates its performance on an unseen test set.

* **Split Data:** Splits the combined data in `df_bayes['preprocess']` (preprocessed text) and `df_bayes['label']` (labels) into training and testing sets using `train_test_split`. The test size is set to 15%.
* **TF-IDF Vectorizer:** Creates a TF-IDF vectorizer to convert textual features into numerical features. TF-IDF considers both the frequency of a word in a document and its overall frequency in the corpus, potentially helping to highlight important terms.
* **Train Vectorizer:** Fits the vectorizer on the training data (`X_train_features`). This helps the vectorizer learn the importance of words in the context of classifying tweets as technical or non-technical.
* **Transform Data:** Transforms the training and testing data using the fitted vectorizer. This converts the text data into numerical features suitable for the Naive Bayes model.
* **Naive Bayes Model:** Creates a Multinomial Naive Bayes model for classification.
* **Train Model:** Trains the Naive Bayes model (`model_bayes`) using the TF-IDF features of the training data (`X_train_features`) and the corresponding labels (`y_train`).
* **Make Predictions:** Uses the trained model to predict labels for the unseen test data (`X_test_features`).
* **Evaluate Performance:** Calculates the accuracy score and classification report to assess the model's performance on the test set. The classification report provides a more detailed breakdown of the model's performance for each class (technical/non-technical).


In [18]:
# Split data into training and testing sets (85% train, 15% test)
X_train, X_test, y_train, y_test = train_test_split(df_bayes['preprocess'], df_bayes['label'], test_size=0.15, random_state=42)

# Create a Multinomial Naive Bayes model for classification
model_bayes = MultinomialNB()

# Create a TF-IDF vectorizer to convert text data to numerical features
vectorizer = TfidfVectorizer()

# Train the vectorizer on the training data (learns word importance)
X_train_features = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_features = vectorizer.transform(X_test)

# Train the Naive Bayes model using the TF-IDF features
model_bayes.fit(X_train_features, y_train)

# Make predictions on the unseen test data using the trained model
y_pred = model_bayes.predict(X_test_features)

# Evaluate model performance
accuracy_0 = accuracy_score(y_test, y_pred)
class_report_0 = classification_report(y_test, y_pred)

print(f"Accuracy of the Naive Bayes model: {accuracy_0:.4f}")
print("Classification Report:")
print(class_report_0)


Accuracy of the Naive Bayes model: 0.6400
Classification Report:
               precision    recall  f1-score   support

Non-Technical       1.00      0.25      0.40        24
    Technical       0.59      1.00      0.74        26

     accuracy                           0.64        50
    macro avg       0.80      0.62      0.57        50
 weighted avg       0.79      0.64      0.58        50



## Step 4: Build a Few functions to make Active Learning easier

**Functions for incorporating human feedback and iteratively improving model performance**

### `get_random_tweets_train(num_tweets)`

* **Purpose:** Retrieves a random sample of tweets from a higher quality dataframe for active learning.
* **Arguments:** 
    - `num_tweets`: Number of tweets to sample.
* **Steps:**
    1. Identifies available tweet IDs from the higher quality dataframe.
    2. Randomly samples the specified number of tweets.
    3. Creates a dataframe containing the sampled tweets.
    4. Applies text pre-processing to the `'tweet_cleaned'` column.
    5. Updates a global list of sampled IDs to avoid duplicates in future retrievals.
* **Return:** Dataframe containing randomly sampled tweets with preprocessed text.

### `run_prediction_new_df(df, confidence_score)`

* **Purpose:** Predicts labels (technical/non-technical) and confidence scores for a new dataframe of tweets.
* **Arguments:** 
    - `df`: Dataframe containing tweets for prediction.
    - `confidence_score`: Threshold for confidence in predictions.
* **Steps:**
    1. Preprocesses the `'tweet_cleaned'` column in the new dataframe.
    2. Converts preprocessed text to features using the fitted vectorizer.
    3. Predicts labels using the Naive Bayes model.
    4. Predicts probability scores for each class (technical/non-technical).
    5. Adds predicted labels and confidence scores as new columns in the dataframe.
    6. Identifies tweets with confidence scores below the threshold for either class.
* **Return:** Dataframe containing predictions, confidence scores, and tweets recommended for further labeling.

### `add_more_labeled_model(df)`

* **Purpose:** Retrain the model with newly labeled data and evaluate performance.
* **Arguments:** 
    - `df`: Dataframe containing newly labeled tweets.
* **Steps:**
    1. Preprocesses the `'tweet_cleaned'` column in the new labeled data.
    2. Incorporates the new preprocessed text and labels into the existing training data.
    3. Converts the combined training data into features using the fitted vectorizer.
    4. Retrains the Naive Bayes model using the updated training data.
    5. Makes predictions on the existing test set using the retrained model.
    6. Evaluates the retrained model's performance on the test set.
* **Return:** Accuracy score and classification report for the retrained model.


In [22]:
# Function to get random tweets for training (active learning)
def get_random_tweets_train(num_tweets):

    """
    This function retrieves a random sample of tweets from a higher quality dataframe 
    (potentially containing more reliable labels) for active learning.

    Args:
        num_tweets (int): The desired number of tweets to sample.

    Returns:
        pandas.DataFrame: A dataframe containing the randomly sampled tweets with preprocessed text.
    """

    global ids_sampled  # Assuming this variable keeps track of already sampled IDs

    # Identify available IDs from the higher quality data that haven't been used yet
    ids_available = [item for item in df_higher_quality['id'].to_list() 
                     if item not in df_bayes['id'].to_list() or item not in ids_sampled]

    # Sample the desired number of tweets randomly from the available IDs
    random_sample_high_quality = random.sample(ids_available, num_tweets)

    # Create a dataframe containing the sampled tweets
    df_random_tweets_train = df_higher_quality[df_higher_quality['id'].isin(random_sample_high_quality)]

    # Apply pre-processing to the 'tweet_cleaned' column in the new dataframe
    df_random_tweets_train['preprocess'] = df_random_tweets_train['tweet_cleaned'].apply(preprocess)

    # Update the list of sampled IDs to avoid duplicates in future retrievals
    ids_sampled += random_sample_high_quality

    return df_random_tweets_train


# Function to predict labels and confidence scores for a new dataframe
def run_prediction_new_df(df, confidence_score):

    """
    This function predicts labels (technical/non-technical) and confidence scores 
    for a new dataframe containing tweets.

    Args:
        df (pandas.DataFrame): The dataframe containing tweets for prediction.
        confidence_score (float): The threshold for confidence in predictions.

    Returns:
        pandas.DataFrame: A dataframe containing the predicted labels, confidence scores, 
                         and tweets requiring further labeling based on the confidence threshold.
    """

    # Apply pre-processing to the 'tweet_cleaned' column in the new dataframe
    df['preprocess'] = df['tweet_cleaned'].apply(preprocess)

    # Convert preprocessed text to features using the fitted vectorizer
    X_new_features_ = vectorizer.transform(df['preprocess'])

    # Predict labels (technical/non-technical) for the new tweets
    y_new_pred_ = model_bayes.predict(X_new_features_)

    # Predict probability scores for each class (technical/non-technical)
    y_new_proba_ = model_bayes.predict_proba(X_new_features_)
    # Convert the probability scores to a list
    y_new_proba_ = [item for item in y_new_proba_]

    # Add predicted labels and confidence scores as new columns in the dataframe
    df['prediction'] = y_new_pred_
    df['confidenct_non_tech'] = [item[0] for item in y_new_proba_]  # Confidence in non-technical
    df['confidenct_tech'] = [item[1] for item in y_new_proba_]  # Confidence in technical

    # Identify tweets where the confidence score is below the threshold for either class
    to_label_df = df[
        ((df['prediction'] == 'Technical') & (df['confidenct_tech'] < confidence_score)) |
        ((df['prediction'] == 'Non-Technical') & (df['confidenct_non_tech'] < confidence_score))
    ]

    # Return the dataframe containing predictions, confidence scores, and tweets for further labeling
    return to_label_df

def add_more_labeled_model(df):

    """
    This function incorporates newly labeled data into the training data 
    and retrains the Naive Bayes model. It also evaluates the model performance 
    on the existing test set after retraining.

    Args:
        df (pandas.DataFrame): The dataframe containing newly labeled tweets.

    Returns:
        float: The accuracy score of the retrained model on the test set.
        str: The classification report for the retrained model on the test set.
    """

    global current_X_train, current_y_train  # Assuming these store current training data

    # Apply text pre-processing to the 'tweet_cleaned' column in the new labeled data
    df['preprocess'] = df['tweet_cleaned'].apply(preprocess)

    # Concatenate the new preprocessed text with existing training features
    new_X_train = pd.concat([current_X_train, df['preprocess']], ignore_index=True)

    # Concatenate the new labels with existing training labels
    new_y_train = pd.concat([current_y_train, df['label']], ignore_index=True)

    # Convert the combined training data into features using the fitted vectorizer
    new_X_train_features = vectorizer.transform(new_X_train)

    # No need to transform the test data again, it's already transformed (assuming X_test_features is defined elsewhere)

    # Retrain the Naive Bayes model using the updated training data
    model_bayes.fit(new_X_train_features, new_y_train)

    # Update the global variables to store the new training data for future retraining
    current_X_train = new_X_train
    current_y_train = new_y_train

    # Make predictions on the existing test set using the retrained model
    y_pred = model_bayes.predict(X_test_features)

    # Evaluate the performance of the retrained model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Return the accuracy score and classification report for the retrained model
    return accuracy, class_report


## Step 5: Active Learning Iterations
**Use the trained Naive Bayes Model to predict Tweet categories on Random Tweets from the High Quality List, manually Label any Tweet with a low confidence score & plug that Tweet into the Model for retraining**

1. **Sample Tweets:** The `get_random_tweets_train` function retrieves a random sample of tweets from a higher quality dataframe (potentially containing more reliable labels). This helps focus on potentially informative examples, especially when dealing with imbalanced datasets.

2. **Predict Labels and Confidence Scores:** The `run_prediction_new_df` function predicts labels (technical/non-technical) and confidence scores for the sampled tweets. A confidence score threshold is used to identify tweets where the model is less certain about its prediction. These tweets are considered good candidates for human labeling as they have the potential to significantly improve the model's performance.

3. **Manual Labeling:** The tweets identified in step 2 (potentially saved to a CSV file) are presented for manual labeling. This human intervention helps provide more reliable labels for these uncertain tweets.

4. **Retrain Model:** The `add_more_labeled_model` function incorporates the newly labeled data from step 3 into the existing training data. This enriched training data is then used to retrain the Naive Bayes model.

5. **Evaluate Performance:** The performance of the retrained model is evaluated on the existing test set using metrics like accuracy and classification report. This helps assess the effectiveness of the active learning loop in improving the model's ability to classify tweets correctly.

In [20]:
# List to store IDs of tweets already sampled for active learning
ids_sampled = []

# Global variables to store current training data for the Naive Bayes model
current_X_train = X_train
current_y_train = y_train

In [23]:
# Sample 1000 tweets from the higher quality data for active learning
df_random_tweets_train_1 = get_random_tweets_train(1000)

# Predict labels and confidence scores for the sampled tweets
df_to_label_1 = run_prediction_new_df(df_random_tweets_train_1, 0.55)

# # Optionally, save the tweets requiring labeling for manual review (commented out)
# df_to_label_1.to_csv('to_label_1.csv')

# Read the manually labeled data (presumably from 'to_label_1.csv' or similar source)
df_labeled_1 = pd.read_csv('/kaggle/input/edgetier-takehome/labled_1.csv')

# Retrain the Naive Bayes model with the newly labeled data
accuracy_1, class_report_1 = add_more_labeled_model(df_labeled_1)

# Print the accuracy score and classification report for the retrained model
print(accuracy_1)
print(class_report_1)

0.72
               precision    recall  f1-score   support

Non-Technical       0.63      1.00      0.77        24
    Technical       1.00      0.46      0.63        26

     accuracy                           0.72        50
    macro avg       0.82      0.73      0.70        50
 weighted avg       0.82      0.72      0.70        50



In [24]:
# Sample 1000 tweets from the higher quality data for active learning (iteration 2)
df_random_tweets_train_2 = get_random_tweets_train(1000)

# Predict labels and confidence scores for the sampled tweets (iteration 2)
df_to_label_2 = run_prediction_new_df(df_random_tweets_train_2, 0.65)

# # Optionally, save the tweets requiring labeling for manual review (commented out)
# df_to_label_2.to_csv('to_label_2.csv')

# Read the manually labeled data (presumably from 'to_label_2.csv' or similar source)
df_labeled_2 = pd.read_csv('/kaggle/input/edgetier-takehome/labled_2.csv')

# Retrain the Naive Bayes model with the newly labeled data (iteration 2)
accuracy_2, class_report_2 = add_more_labeled_model(df_labeled_2)

# Print the accuracy score and classification report for the retrained model (iteration 2)
print(accuracy_2)
print(class_report_2)

0.66
               precision    recall  f1-score   support

Non-Technical       0.59      1.00      0.74        24
    Technical       1.00      0.35      0.51        26

     accuracy                           0.66        50
    macro avg       0.79      0.67      0.63        50
 weighted avg       0.80      0.66      0.62        50



In [25]:
# Sample 1000 tweets from the higher quality data for active learning (iteration 3)
df_random_tweets_train_3 = get_random_tweets_train(1000)

# Predict labels and confidence scores for the sampled tweets (iteration 3)
df_to_label_3 = run_prediction_new_df(df_random_tweets_train_3, 0.60)

# Save the tweets requiring labeling for manual review (iteration 3)
df_to_label_3.to_csv('to_label_3.csv')

# Read the manually labeled data (presumably from 'to_label_3.csv')
df_labeled_3 = pd.read_csv('/kaggle/input/edgetier-takehome/labled_3.csv')

# Retrain the Naive Bayes model with the newly labeled data (iteration 3)
accuracy_3, class_report_3 = add_more_labeled_model(df_labeled_3)

# Print the accuracy score and classification report for the retrained model (iteration 3)
print(accuracy_3)
print(class_report_3)

0.64
               precision    recall  f1-score   support

Non-Technical       0.57      1.00      0.73        24
    Technical       1.00      0.31      0.47        26

     accuracy                           0.64        50
    macro avg       0.79      0.65      0.60        50
 weighted avg       0.79      0.64      0.59        50



## Step 6: Save the Model the Model, Read it back it & test on some Tweets
**The Test set is relativley small, after a few iterations of Active Learning it looks like the Model is doing a pretty good job recognising Technical Tweets, Let's see how it performs on some fresh Tweets**

In [30]:
# Save the model to a file
joblib.dump(model_bayes, 'multinomial_nb_model.pkl')

# Read the model back in
loaded_model = joblib.load('/kaggle/input/edgetier-takehome/multinomial_nb_model.pkl')

['multinomial_nb_model.pkl']

In [61]:
def predict_category(text):
    model = loaded_model
    preprocessed_text = preprocess(text)
    vectorized_text = vectorizer.transform([preprocessed_text])
    category = model.predict(vectorized_text)
    
    return category[0]

tweets_non_technical = [
    "I'm constantly amazed by how cool ChatGPT is! It's like having a conversation with a genius.",
    "ChatGPT never fails to impress me with its capabilities. It's simply awesome!",
    "Just had the most amazing conversation with ChatGPT. It's mind-blowing how smart it is!",
    "Every time I use ChatGPT, I'm reminded of how advanced AI technology has become. Truly fascinating!",
    "ChatGPT is so cool! It's like having a virtual assistant that knows everything.",
    "I can't get over how cool ChatGPT is. It's like having a super-intelligent friend to chat with!",
    "Using ChatGPT feels like peeking into the future. It's incredible what AI can do!",
    "Just had an enlightening conversation with ChatGPT. It's seriously impressive!",
    "ChatGPT never ceases to amaze me. It's like having a personal AI companion.",
    "Can we just take a moment to appreciate how cool ChatGPT is? It's revolutionizing the way we interact with AI!"
]

tweets_technical = [
    "ChatGPT's server is down, so frustrating!",
    "Having trouble logging into ChatGPT. Is anyone else experiencing this issue?",
    "Is it just me or is ChatGPT server down right now? I need it for work!",
    "Tried logging into ChatGPT multiple times but it's not working. What's going on?",
    "Getting error messages when trying to access ChatGPT. This is frustrating.",
    "Unable to log in to ChatGPT. Keep getting a 'server error' message.",
    "ChatGPT seems to be server down. Can't access my conversations.",
    "Anyone else having issues with ChatGPT? I keep getting a connection timeout error.",
    "Need to use ChatGPT urgently but it's not loading. What's going on?",
    "Just when I need ChatGPT the most, it's down. Seriously frustrating!"
] 

for tweet in tweets_non_technical:
    print(predict_category(tweet))
    
for tweet in tweets_technical:
    print(predict_category(tweet))

Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Non-Technical
Technical
Technical
Technical
Technical
Technical
Technical
Technical
Non-Technical
Non-Technical
Non-Technical
