In [11]:
!pip install snorkel datasets scikit-learn vaderSentiment -q


In [12]:
import re
import random
import pandas as pd
from datasets import load_dataset

from snorkel.augmentation import transformation_function
from snorkel.augmentation import ApplyEachPolicy, PandasTFApplier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [13]:
dataset = load_dataset("imdb")

df_train = pd.DataFrame(dataset["train"]).sample(2000, random_state=42).reset_index(drop=True)
df_test  = pd.DataFrame(dataset["test"]).sample(2000, random_state=42).reset_index(drop=True)

df_train.head()


Unnamed: 0,text,label
0,"Dumb is as dumb does, in this thoroughly unint...",0
1,I dug out from my garage some old musicals and...,1
2,After watching this movie I was honestly disap...,0
3,This movie was nominated for best picture but ...,1
4,Just like Al Gore shook us up with his painful...,1


In [14]:
df_train_small = df_train.sample(1000, random_state=0).reset_index(drop=True)
df_train_small.label.value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,511
1,489


In [15]:
synonym_map_positive = {
    "good": "great",
    "great": "fantastic",
    "amazing": "incredible",
    "love": "adore",
    "liked": "enjoyed",
    "nice": "pleasant",
}

synonym_map_negative = {
    "bad": "awful",
    "boring": "dull",
    "terrible": "horrible",
    "hate": "despise",
    "worst": "lousiest",
}

def replace_words(text, mapping):
    for k, v in mapping.items():
        text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
    return text


In [18]:
import pandas as pd
import random
import re
from snorkel.augmentation import transformation_function

def replace_words(text, mapping):
    for k, v in mapping.items():
        text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
    return text


@transformation_function()
def tf_synonym_positive(x):
    if x.label == 1:
        new_text = replace_words(x.text, synonym_map_positive)
        return pd.Series({"text": new_text, "label": x.label})
    return None


@transformation_function()
def tf_synonym_negative(x):
    if x.label == 0:
        new_text = replace_words(x.text, synonym_map_negative)
        return pd.Series({"text": new_text, "label": x.label})
    return None


@transformation_function()
def tf_add_emoji(x):
    if x.label == 1:
        return pd.Series({"text": x.text + " ðŸ˜Š", "label": x.label})
    elif x.label == 0:
        return pd.Series({"text": x.text + " ðŸ˜¡", "label": x.label})
    return None


@transformation_function()
def tf_random_dropout(x, drop_prob=0.1):
    words = x.text.split()
    new_words = [w for w in words if random.random() > drop_prob]

    if len(new_words) == 0:
        return None

    return pd.Series({"text": " ".join(new_words), "label": x.label})


In [19]:
tfs = [
    tf_synonym_positive,
    tf_synonym_negative,
    tf_add_emoji,
    tf_random_dropout
]

policy = ApplyEachPolicy(n_tfs=len(tfs))
tf_applier = PandasTFApplier(tfs=tfs, policy=policy)

augmented_df = tf_applier.apply(df_train_small)

augmented_df.head()


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:02<00:00, 393.61it/s]


Unnamed: 0,text,label
0,This is one of the most hilariously bad movies...,0
0,This is one of the most hilariously awful movi...,0
1,This is one of the most hilariously bad movies...,0
2,This is one of most hilariously bad movies I h...,0
1,"OK, I overrated it just a bit to offset at lea...",1


In [20]:
df_augmented = pd.concat(
    [df_train_small[["text", "label"]], augmented_df[["text", "label"]]],
    ignore_index=True
)

print("Original:", len(df_train_small))
print("Augmented:", len(augmented_df))
print("Total:", len(df_augmented))


Original: 1000
Augmented: 4000
Total: 5000


In [21]:
vectorizer_base = TfidfVectorizer(max_features=10000)
X_train_base = vectorizer_base.fit_transform(df_train_small.text)
y_train_base = df_train_small.label

clf_base = LogisticRegression(max_iter=400)
clf_base.fit(X_train_base, y_train_base)

X_test = vectorizer_base.transform(df_test.text)
y_test = df_test.label

acc_base = accuracy_score(y_test, clf_base.predict(X_test))
print("Baseline accuracy:", acc_base)


Baseline accuracy: 0.791


In [22]:
vectorizer_aug = TfidfVectorizer(max_features=10000)
X_train_aug = vectorizer_aug.fit_transform(df_augmented.text)
y_train_aug = df_augmented.label

clf_aug = LogisticRegression(max_iter=400)
clf_aug.fit(X_train_aug, y_train_aug)

X_test_aug = vectorizer_aug.transform(df_test.text)
acc_aug = accuracy_score(y_test, clf_aug.predict(X_test_aug))

print("Accuracy after augmentation:", acc_aug)
print("Improvement:", acc_aug - acc_base)


Accuracy after augmentation: 0.815
Improvement: 0.02399999999999991
