# Set up

In [2]:
!pip install snorkel

In [None]:
import pandas as pd
import numpy as np

In [None]:
user = pd.read_csv('./users.tsv', sep='\t')
ratings = pd.read_csv('./ratings.tsv', sep='\t')
watching = pd.read_csv('./watching.tsv', sep='\t')
user.head()

Unnamed: 0,user_id,age,occupation,gender
0,567044,33,executive/managerial,M
1,541929,33,homemaker,M
2,538609,33,sales/marketing,F
3,344886,31,sales/marketing,M
4,612606,28,college/grad student,F


In [None]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,505690,tracers+2015,3
1,347367,dylan+moran+monster+2004,4
2,687004,leave+her+to+heaven+1945,3
3,137351,seasons+greetings+1996,4
4,492141,in+dreams+1999,3


In [None]:
# Define the label mappings for convenience
ABSTAIN = -1
NOT_SPAM = 0
SPAM = 1

In [None]:
from snorkel.labeling import labeling_function

# Define rating thresholds
LIKE_THRESHOLD = 4
DISLIKE_THRESHOLD = 2

# Labeling functions based on user ratings
@labeling_function()
def lf_like_high_rating(x):
    """Label as like if user gave a high rating."""
    if x.rating >= LIKE_THRESHOLD:
        return SPAM
    else:
        return NOT_SPAM 

@labeling_function()
def lf_dislike_low_rating(x):
    """Label as dislike if user gave a low rating."""
    if x.rating <= DISLIKE_THRESHOLD:
        return ABSTAIN
    else:
        return NOT_SPAM 

@labeling_function()
def lf_neutral_rating(x):
    """Label as neutral if user gave a rating between DISLIKE_THRESHOLD and LIKE_THRESHOLD."""
    if x.rating > DISLIKE_THRESHOLD and x.rating < LIKE_THRESHOLD:
        return ABSTAIN
    elif x.rating == 5 or x.rating == 4:
        return SPAM
    else:
        return NOT_SPAM

In [None]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier, LFAnalysis

# Define the set of labeling functions (LFs)
lfs = [lf_like_high_rating, lf_dislike_low_rating, lf_neutral_rating]
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(ratings)
# print(L_train.shape)
# (5351, 3) L_train is an NumPy ndarray that has a shape of 
#(n, len(lfs)), where n is the number of examples in the dataset 
#and len(lfs) is the number of labeling functions.

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
ratings["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

100%|██████████| 5351/5351 [00:00<00:00, 22142.28it/s]
100%|██████████| 500/500 [00:00<00:00, 1189.33epoch/s]


In [None]:
df_train_ratings = ratings[ratings.label != ABSTAIN]
df_train_ratings.head()

Unnamed: 0,user_id,movie_id,rating,label
0,505690,tracers+2015,3,0
1,347367,dylan+moran+monster+2004,4,1
2,687004,leave+her+to+heaven+1945,3,0
3,137351,seasons+greetings+1996,4,1
4,492141,in+dreams+1999,3,0


In [None]:
import random
import nltk
from nltk.corpus import wordnet as wn
from snorkel.augmentation import transformation_function

nltk.download("wordnet", quiet=True)

def get_synonyms(word):
    """Get the synonyms of word from Wordnet."""
    words = word.split("+")
    synonyms = set()
    for w in words:
        lemmas = set().union(*[s.lemmas() for s in wn.synsets(w)])
        synonyms.update(set(l.name().lower().replace("_", " ") for l in lemmas) - {w})
    return list(synonyms)

@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace a random word in the movie title with a synonym."""
    words = x.movie_id.lower().split("+")
    idx = random.choice(range(len(words)))
    synonyms = get_synonyms(words[idx])
    if len(synonyms) > 0:
        x.movie_id = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1 :])
        return x

In [None]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train_ratings)
df_train_augmented.head()

100%|██████████| 5351/5351 [00:03<00:00, 1420.98it/s]


Unnamed: 0,user_id,movie_id,rating,label
0,505690,tracers+2015,3,0
0,505690,tracer 2015,3,0
1,347367,dylan+moran+monster+2004,4,1
1,347367,bob dylan moran monster 2004,4,1
2,687004,leave+her+to+heaven+1945,3,0


In [None]:
for i, row in df_train_ratings.iterrows():
    words = row.movie_id.split("+")
    for idx in range(len(words)):
        # print(words)
        # print(row)
        synonyms = get_synonyms(words[idx])
        #print(words[idx])
        if len(synonyms) > 0:
            # Test the transformation function on this row
            row_after = tf_replace_word_with_synonym(row)
            print()
            if row_after is not None:
                print("Original movie_id: ", row.movie_id)
                print("Augmented movie_id: ", row_after.movie_id)
                break
    if row_after is not None:
        break


Original movie_id:  tracers+2015
Augmented movie_id:  tracer 2015


In [None]:
df_train_augmented.head()

Unnamed: 0,user_id,movie_id,rating,label
0,505690,tracers+2015,3,0
0,505690,tracer 2015,3,0
1,347367,dylan+moran+monster+2004,4,1
1,347367,bob dylan moran monster 2004,4,1
2,687004,leave+her+to+heaven+1945,3,0


In [None]:
from snorkel.slicing import slicing_function

@slicing_function()
def slice_high_rating(df):
    return df[df.rating >= 4].index

# Apply the slicing function to the DataFrame
df_sliced = df_train_augmented.loc[slice_high_rating(df_train_ratings)].reset_index(drop=True)
df_sliced.head(20)

Unnamed: 0,user_id,movie_id,rating,label
0,347367,dylan+moran+monster+2004,4,1
1,347367,bob dylan moran monster 2004,4,1
2,137351,seasons+greetings+1996,4,1
3,137351,temper greetings 1996,4,1
4,137351,seasons recognise 1996,4,1
5,295260,blow+out+1981,4,1
6,295260,burn out out 1981,4,1
7,295260,burn out out 1981,4,1
8,328775,devil+times+five+1974,4,1
9,328775,devil time five 1974,4,1
