In [2]:
!pip install transformers ekphrasis datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
import os
import re
import string
import warnings

import mlflow
import nltk
import pandas as pd

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Show complete cell contents and every row when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Read the raw train and test splits from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

In [4]:
# Report (rows, columns) of each split, one per line.
print(train.shape, test.shape, sep="\n")

(7613, 5)
(3263, 4)


In [34]:
# Host of the MLflow tracking server. It can be overridden through the
# MLFLOW_TRACKING_SERVER_HOST environment variable so the notebook does not
# have to be edited when the server address changes; the default keeps the
# previously hard-coded address.
TRACKING_SERVER_HOST = os.environ.get("MLFLOW_TRACKING_SERVER_HOST", "34.232.18.202")
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

## Preprocess Text

In [9]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


Going over the dataset we find:

- The text contains punctuation, hashtags, numeric figures, mentions, URLs, dates, accented characters, and contractions.
- The text also contains character sequences like `&gt`, `&amp`, and `\n`.
- There are duplicated tweets, and some of the duplicates are inconsistent: the same tweet text appears with different labels.

In [10]:
# Duplicated tweets: keep=False flags every copy of a repeated text, and
# sorting by text places the copies next to each other.
# (Remove .head() to see more.)
train[train.text.duplicated(keep=False)].sort_values(by='text').head()

Unnamed: 0,id,keyword,location,text,target


Let's start by cleaning up the text.

### Cleaning up the text

In [7]:
# Drop exact duplicates: rows that share both the same text and the same target.
# The index is rebuilt so that it runs 0..n-1 afterwards.
train = train.drop_duplicates(subset=['text', 'target']).reset_index(drop=True)

# The duplicated texts that remain carry conflicting targets. After manually
# going through those tweets, assign a single target per tweet so that the
# copies become identical and the second drop below can remove them.
# NOTE(review): the numbers are index labels of the frame produced by the
# reset_index above, so they presumably only point at the intended rows on the
# first run against the raw CSV — re-running this cell would relabel other rows.
non_disaster =  [4253, 4182, 3212, 4249, 6535, 1190, 4239, 3936, 1214, 6018]
disaster = [4193, 2803, 4554, 4250, 1207, 4317, 620, 5573]
train.loc[non_disaster, 'target'] = 0
train.loc[disaster, 'target'] = 1

# Drop the duplicates again now that their targets agree.
train = train.drop_duplicates(subset=['text', 'target']).reset_index(drop=True)

In [22]:
# Cleaning the text: configure the ekphrasis pre-processor used for every tweet.
text_processor = TextPreProcessor(
    # Terms that will be omitted.
    omit=["url", "email"],
    # Terms that will be normalized.
    normalize=["percent", "money", "phone", "user", "time", "date", "number"],
    # Terms that will be annotated.
    annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
    # Fix HTML tokens.
    fix_html=True,
    # Corpus whose word statistics are used for word segmentation.
    segmenter="twitter",
    # Corpus whose word statistics are used for spell correction.
    corrector="twitter",
    # Perform word segmentation on hashtags.
    unpack_hashtags=True,
    # Unpack contractions (can't -> can not).
    unpack_contractions=True,
    # Spell correction for elongated words is switched off.
    spell_correct_elong=False,
    # Tokenizer callable: takes a string and returns a list of tokens.
    # SocialTokenizer is used here, but any such callable can be passed.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Dictionaries for replacing tokens extracted from the text with other
    # expressions; more than one dictionary may be passed.
    dicts=[emoticons],
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [None]:
# Cleaned text: run every tweet through the pre-processor and join the
# resulting tokens back into one space-separated string.
for frame in (train, test):
    frame['processed_text'] = frame['text'].apply(
        lambda tweet: " ".join(text_processor.pre_process_doc(tweet))
    )

## Converting text to vectors

There are many ways to convert text to vectors. We will use the TF-IDF method.

In [11]:
# TF-IDF vectorizer over unigrams and bigrams; English stop words are removed
# and terms are kept only if they occur in at least 2 documents and in at
# most 75% of them.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.75,
)

In [12]:
# Apply TF-IDF to the text data: the vocabulary and IDF weights are learned
# from the training tweets only, then reused to transform the test tweets.
train_vectorized = tfidf.fit_transform(train['processed_text'])
test_vectorized = tfidf.transform(test['processed_text'])

## Perform Stratified K-Fold Cross-Validation

In [20]:
# 5-fold stratified splitter, shuffled with a fixed seed.
scv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [46]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that cleans one text column of a DataFrame.

    Parameters
    ----------
    feature_name : str
        Name of the column in ``X`` that holds the raw text.
    text_processor : object
        Object exposing ``pre_process_doc(text)`` that returns an iterable of
        string tokens for one document.
    """

    def __init__(self, feature_name, text_processor):
        self.feature_name = feature_name
        self.text_processor = text_processor

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the cleaned text as a Series named ``processed_<feature_name>``.

        Each document is pre-processed and its tokens are joined with single
        spaces. The result keeps the index of ``X``.

        The input frame is NOT modified: writing the result back into ``X``
        would mutate the caller's data (and, during cross-validation, slices
        of it) as a side effect of calling ``transform``.
        """
        cleaned = X[self.feature_name].apply(
            lambda doc: ' '.join(self.text_processor.pre_process_doc(doc))
        )
        return cleaned.rename('processed_' + self.feature_name)

In [47]:
# End-to-end text classifier: clean -> TF-IDF -> classifier.
# The 'clf' step is a placeholder that is filled in later via set_params.
text_clf_pipeline = Pipeline(
    steps=[
        ('cleaner', TextCleaner(feature_name='text', text_processor=text_processor)),
        (
            'vectorizer',
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.75,
            ),
        ),
        ('clf', None),
    ]
)

## Modeling

In [14]:
def calculate_metrics(y_true, y_pred):
    """Compute the classification metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``accuracy``, ``f1``, ``recall``, ``precision`` and
        ``confusion_matrix`` mapped to the value of the corresponding
        scikit-learn metric function.
    """
    metric_functions = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "recall": recall_score,
        "precision": precision_score,
        "confusion_matrix": confusion_matrix,
    }
    return {
        metric_name: metric_fn(y_true, y_pred)
        for metric_name, metric_fn in metric_functions.items()
    }


In [33]:
# Vanilla models: default hyperparameters, keyed by the name used for the
# MLflow 'model' tag. The estimators that use randomness are seeded so that
# the cross-validation scores are reproducible from run to run.
models = {
    'logistic_regression': LogisticRegression(random_state=42),
    'random_forest': RandomForestClassifier(random_state=42),
    'xgboost': XGBClassifier(random_state=42),
    'multinomial_nb': MultinomialNB(),
    'svm': SVC()
}

In [35]:
# Select the MLflow experiment under which the vanilla-model runs are recorded.
EXPERIMENT_NAME = "vanilla-model-experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

2023/08/14 20:57:10 INFO mlflow.tracking.fluent: Experiment with name 'vanilla-model-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-zc-ta-dev-model-registry/1', creation_time=1692046630442, experiment_id='1', last_update_time=1692046630442, lifecycle_stage='active', name='vanilla-model-experiment', tags={}>

In [37]:
# Cross-validate every vanilla model and record one MLflow run per model.
for model in models:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'sagar')
        mlflow.set_tag('model', model)

        # Plug the current classifier into the shared pipeline.
        text_clf_pipeline.set_params(clf=models[model])

        scores = cross_val_score(text_clf_pipeline, train, train.target, cv=scv, scoring='f1')
        mean_f1, std_f1 = scores.mean(), scores.std()
        mlflow.log_metric('f1', mean_f1)
        mlflow.log_metric('f1_std', std_f1)

        print(f'{model} F1: {mean_f1:.3f} +/- {std_f1:.3f}')

        # cross_val_score fits clones of the pipeline, so the pipeline object
        # itself is still unfitted here. Fit it on the full training set so
        # that the logged artifact can actually be used for prediction.
        text_clf_pipeline.fit(train, train.target)
        mlflow.sklearn.log_model(text_clf_pipeline, 'models')

logistic_regression F1: 0.736 +/- 0.011
random_forest F1: 0.689 +/- 0.018
xgboost F1: 0.698 +/- 0.019
multinomial_nb F1: 0.729 +/- 0.008
svm F1: 0.731 +/- 0.010


The `Logistic Regression`, `Multinomial NB`, and `SVM` models performed similarly, with `Logistic Regression` slightly ahead. Let's tune these three models further to see if we can improve performance.

### Hyperparameter Tuning of Vanilla Models

In [51]:
# Search spaces for RandomizedSearchCV, keyed by model name. Parameter names
# carry the 'clf__' prefix because they address the 'clf' pipeline step.
params = {
    # Logistic regression uses a list of grids because not every solver
    # supports every penalty: 'sag' only works with 'l2', whereas 'liblinear'
    # handles both 'l1' and 'l2'. A single combined grid would sample the
    # invalid 'sag' + 'l1' pair and waste search iterations on failed fits.
    'logistic_regression': [
        {
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__penalty': ['l2'],
            'clf__solver': ['sag']
        },
        {
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__penalty': ['l2', 'l1'],
            'clf__solver': ['liblinear']
        }
    ],
    'multinomial_nb': {
        'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    'svm': {
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': ['scale', 'auto']
    }
}

In [39]:
# Switch to a separate MLflow experiment for the hyperparameter-tuning runs.
EXPERIMENT_NAME = "hyperparameter-tuning-experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

2023/08/14 21:30:50 INFO mlflow.tracking.fluent: Experiment with name 'hyperparameter-tuning-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-zc-ta-dev-model-registry/2', creation_time=1692048650611, experiment_id='2', last_update_time=1692048650611, lifecycle_stage='active', name='hyperparameter-tuning-experiment', tags={}>

In [53]:
# Number of parameter settings sampled per model.
n_iter_search = 10

# Randomized hyperparameter search for every model that has a search space,
# recorded as one MLflow run per model.
for model in params:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'sagar')
        mlflow.set_tag('model', model)

        # Plug the current classifier into the shared pipeline.
        text_clf_pipeline.set_params(clf=models[model])

        random_search = RandomizedSearchCV(
            text_clf_pipeline,
            param_distributions=params[model],
            n_iter=n_iter_search,
            cv=scv,
            scoring='f1',
            random_state=42,
        )
        random_search.fit(train, train.target)

        mlflow.log_params(random_search.best_params_)
        # Log the best cross-validated F1 under the same metric name as the
        # vanilla runs, so that runs can be compared in the tracking server.
        mlflow.log_metric('f1', random_search.best_score_)
        mlflow.sklearn.log_model(random_search.best_estimator_, 'models')

        print('Model: {}'.format(model))
        print('Best score: {}'.format(random_search.best_score_))
        print('Best params: {}'.format(random_search.best_params_))
        print('')


Model: logistic_regression
Best score: 0.7363363734175435
Best params: {'clf__solver': 'liblinear', 'clf__penalty': 'l2', 'clf__C': 1}

Model: multinomial_nb
Best score: 0.7376485560625254
Best params: {'clf__alpha': 0.1}

Model: svm
Best score: 0.7446399970494169
Best params: {'clf__kernel': 'sigmoid', 'clf__gamma': 'scale', 'clf__C': 1}



The cross-validation scores for the models are very similar. `SVM` has the highest score, but it is not significantly higher than the other models. For simplicity, speed, and better interpretability, I will use `MultinomialNB` for the final model.

## Predict on the Test Set

In [54]:
# Final model: Multinomial Naive Bayes with the alpha picked by the search.
clf_NB = MultinomialNB(alpha=0.1)

# Plug the classifier into the pipeline, fit on the whole training set and
# predict the labels of the test tweets.
y_pred = (
    text_clf_pipeline
    .set_params(clf=clf_NB)
    .fit(train, train.target)
    .predict(test)
)

In [55]:
# Submission frame: the test ids next to the predicted targets.
submission = test[['id']].assign(target=y_pred)

In [56]:
# Write the submission file without the DataFrame index column.
submission.to_csv('../data/submission.csv', index=False)

In [57]:
!kaggle competitions submit -c nlp-getting-started -f ../data/submission.csv -m "First submission - NB"

100%|██████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 47.6kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets

Multinomial Naive Bayes Scores:
- F1 Score on train set (mean over the 5 cross-validation folds): 0.73764
- F1 Score on test set: 0.79834

The model performance does not indicate overfitting: the F1 score on the test set is higher than the cross-validated F1 score on the train set, so the model is performing well on the test set.

## Transformer Models (Future Scope)

In [16]:
import torch

from tqdm import tqdm
from transformers import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss

In [17]:
# The number of classes in the dataset.
num_classes = 2
# Use the appropriate BERT model name.
model_name = "bert-base-uncased"

# Load the matching tokenizer and a sequence-classification model with one
# output label per class.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Create a dataset from the pandas dataframe, keeping only the cleaned text
# and the target column.
train_df = Dataset.from_pandas(train[['processed_text', 'target']])

In [19]:
batch_size = 64

# Tokenize input text: special tokens are added, sequences are padded and
# truncated, and the result is returned as PyTorch tensors.
encoded_data_train = tokenizer.batch_encode_plus(
    train_df['processed_text'],
    add_special_tokens=True,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Create DataLoader: bundle token ids, attention masks and targets into one
# tensor dataset and serve it in shuffled batches.
train_dataset = TensorDataset(
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
    torch.tensor(train_df['target']),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

In [22]:
learning_rate = 2e-5

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW over all model parameters, plus a cross-entropy loss object.
optimizer = AdamW(params=model.parameters(), lr=learning_rate)
loss_fn = CrossEntropyLoss()

In [24]:
num_epochs = 3

# Fine-tune the model for a fixed number of passes over the training data.
for epoch in range(num_epochs):
    model.train()
    # Iterate over the tqdm wrapper itself (not the bare loader): the bar only
    # advances when batches are pulled through it. The context manager closes
    # the bar even if a training step raises.
    with tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False) as progress_bar:
        for batch in progress_bar:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            # Passing the labels makes the model return the loss directly.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Show the loss of the most recent batch next to the bar.
            progress_bar.set_postfix({'Loss': loss.item()}, refresh=True)



In [25]:
# Save the fine-tuned model to a local directory.
model.save_pretrained('./fine_tuned_model')

In [None]:
# Giskard
# Validation of ML models
# Product manager
# Try to find things not working properly
# giskard