In [91]:
import json
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

sys.path.insert(0, str(Path.cwd().parent / 'src'))

from utils.params import get_topn_param_sets

In [5]:
# Locations of the hyper-parameter search results and of the cleaned datasets.
# The defaults are the original author's local directories; set the
# CYBERBULLYING_PARAMS_PATH / CYBERBULLYING_DATA_PATH environment variables to
# run the notebook on another machine without editing this cell.
PARAMS_PATH = Path(os.environ.get(
    'CYBERBULLYING_PARAMS_PATH',
    r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\official\hyp_search\results',
))
DATA_PATH = Path(os.environ.get(
    'CYBERBULLYING_DATA_PATH',
    r'C:\Users\rooty\UWEC\Research\CyberBullyingML\cyberbullyingml\cyberbullying-ml\data\en_only',
))

# File names of the three cleaned tweet datasets inside DATA_PATH.
DATASET_1_NAME = '48000_cyberbullying_tweets_basic_clean.csv'
DATASET_2_NAME = 'hatespeech_tweets_basic_clean.csv'
DATASET_3_NAME = 'onlineHarassmentDataset_basic_clean.csv'

# Seed passed to every train_test_split call so the splits are repeatable.
RANDOM_SEED = 115

In [6]:
# Read the three cleaned tweet datasets.
d1, d2, d3 = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in (DATASET_1_NAME, DATASET_2_NAME, DATASET_3_NAME)
)

# Identical clean-up for each frame: drop rows with missing values, drop
# exact duplicate rows, then renumber the index from zero.
for frame in (d1, d2, d3):
    frame.dropna(axis=0, inplace=True)
    frame.drop_duplicates(inplace=True)
    frame.reset_index(drop=True, inplace=True)

# Collapse each dataset's own labelling scheme to a binary 0/1 target:
#   d1: 'notcb' -> 0, anything else -> 1
#   d2: class 0 -> 1, anything else -> 0
#   d3: 'H'     -> 1, anything else -> 0
d1['label'] = d1['label'].apply(lambda label: 0 if label == 'notcb' else 1)
d2['class'] = d2['class'].apply(lambda label: 1 if label == 0 else 0)
d3['Code'] = d3['Code'].apply(lambda label: 1 if label == 'H' else 0)

# Give all three frames the same column names: 'text' and 'label'.
d1.rename(columns={'tweet': 'text'}, inplace=True)
d2.rename(columns={'tweet': 'text', 'class': 'label'}, inplace=True)
d3.rename(columns={'Code': 'label', 'Tweet': 'text'}, inplace=True)

In [87]:
# Seeded train/test split of dataset 1; no test_size is given, so
# train_test_split's default proportions apply.
x_train, x_test, y_train, y_test = train_test_split(
    d1['text'],
    d1['label'],
    random_state=RANDOM_SEED,
)

In [None]:
# Default-configured TF-IDF vectorizer.
# NOTE(review): `tfidfv` is not referenced by any other cell visible here.
tfidfv = TfidfVectorizer()

In [None]:
# Bag-of-words encoding of the split. The matrices get their own names so that
# x_train / x_test keep holding the raw text: the MyVectorizer cells further
# down fit and transform those same variables and need strings, not matrices.
cv = CountVectorizer()
x_train_counts = cv.fit_transform(x_train)
x_test_counts = cv.transform(x_test)

In [86]:
from collections import defaultdict
import array
import scipy.sparse as sp

def _make_int_array():
    """Construct an array.array of a type suitable for scipy.sparse indices."""
    return array.array(str("i"))


class MyVectorizer(CountVectorizer):
    """CountVectorizer that also tallies in- and out-of-vocabulary tokens.

    ``_count_vocab`` is overridden with a copy of the counting loop that, when
    run against a fixed vocabulary, classifies every analysed token as either
    present in the vocabulary (``iv_token_count``) or absent from it
    (``oov_token_count``). Tokens are counted with multiplicity.

    The tallies accumulate over successive fixed-vocabulary passes and are
    reset to zero whenever a new vocabulary is learned, so they never mix
    counts measured against two different vocabularies.

    NOTE(review): this override appears to mirror scikit-learn's private
    ``CountVectorizer._count_vocab``; confirm it still matches the installed
    scikit-learn version after upgrades.
    NOTE(review): scikit-learn estimators normally list their parameters
    explicitly in ``__init__``; with ``**kwargs``, ``get_params()``/``clone()``
    may not round-trip the configuration. Confirm before using this class in a
    Pipeline or a parameter search.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to CountVectorizer and zero the tallies."""
        super().__init__(**kwargs)
        self.oov_token_count = 0  # tokens not found in the fixed vocabulary
        self.iv_token_count = 0   # tokens found in the fixed vocabulary

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False.

        Args:
            raw_documents: iterable of documents handed to the analyzer.
            fixed_vocab: when true, look tokens up in ``self.vocabulary_`` and
                update the in-/out-of-vocabulary tallies; when false, learn a
                new vocabulary and reset the tallies.

        Returns:
            Tuple ``(vocabulary, X)`` where ``vocabulary`` maps token -> column
            index and ``X`` is a CSR matrix of per-document token counts.

        Raises:
            ValueError: if a vocabulary is being learned and ends up empty.
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # A new vocabulary is about to be learned, so tallies measured
            # against any previous vocabulary are stale.
            self.oov_token_count = 0
            self.iv_token_count = 0
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = [0]
        values = _make_int_array()

        # Tallied locally and committed only after the whole pass succeeds, so
        # a failure part-way through leaves the public counters untouched.
        iv_seen = 0
        oov_seen = 0

        for doc in raw_documents:
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                except KeyError:
                    # Only reachable with a fixed vocabulary (the defaultdict
                    # used while learning never raises): the token is ignored
                    # in the matrix but counted as out-of-vocabulary.
                    oov_seen += 1
                    continue
                iv_seen += 1
                feature_counter[feature_idx] = feature_counter.get(feature_idx, 0) + 1

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if fixed_vocab:
            self.iv_token_count += iv_seen
            self.oov_token_count += oov_seen
        else:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError(
                    "empty vocabulary; perhaps the documents only contain stop words"
                )

        # Use 64-bit indices only when the number of stored entries does not
        # fit in int32 (2**31 - 1). No check is made for 32-bit Python builds.
        if indptr[-1] > np.iinfo(np.int32).max:
            indices_dtype = np.int64
        else:
            indices_dtype = np.int32
        j_indices = np.asarray(j_indices, dtype=indices_dtype)
        indptr = np.asarray(indptr, dtype=indices_dtype)
        values = np.frombuffer(values, dtype=np.intc)

        X = sp.csr_matrix(
            (values, j_indices, indptr),
            shape=(len(indptr) - 1, len(vocabulary)),
            dtype=self.dtype,
        )
        X.sort_indices()
        return vocabulary, X

In [88]:
# Fit the counting vectorizer on the training text (max_df=.9 is forwarded to
# CountVectorizer).
# NOTE(review): x_train is rebound from raw text to the resulting matrix, so
# this cell cannot be re-run without first re-creating the split.
v = MyVectorizer(max_df=.9)
x_train = v.fit_transform(x_train)

In [72]:
# Encode the test text with the fitted vocabulary.
# NOTE(review): x_test is rebound from raw text to the resulting matrix, so
# running this cell a second time would pass a matrix to transform.
x_test = v.transform(x_test)

In [55]:
# Tokens found in the vocabulary during v's fixed-vocabulary passes so far.
v.iv_token_count

250373

In [56]:
# Tokens missing from the vocabulary during v's fixed-vocabulary passes so far.
v.oov_token_count

7631

In [57]:
# Fraction of all tallied tokens that were out-of-vocabulary.
total = v.iv_token_count + v.oov_token_count
ratio = v.oov_token_count / total

print(
    f'total token count = {total}',
    f'ratio of oov to total = {ratio}',
    sep='\n',
)

total token count = 258004
ratio of oov to total = 0.029577060820762467


In [81]:
# Re-create the dataset-1 split with the same seed under new names for the
# text halves; y_train / y_test are rebound as well.
x_train1, x_test1, y_train, y_test = train_test_split(
    d1['text'],
    d1['label'],
    random_state=RANDOM_SEED,
)


In [66]:
# Element-wise check that the re-created test split matches x_test.
# NOTE(review): only meaningful while x_test still holds the raw text, i.e.
# before the cell that rebinds x_test to v.transform(x_test) has run.
all(x_test1 == x_test)

True

In [79]:
# Sum of the stored counts in row 0 of x_test
# (expects x_test to be the transformed sparse matrix at this point).
sum(x_test[0].data)

50

In [82]:
# Replace each test document by the sum of its stored counts, transforming
# one document at a time.
# NOTE(review): every v.transform call here presumably runs v's
# fixed-vocabulary counting again, so v.iv_token_count / v.oov_token_count
# keep growing as a side effect of this cell; confirm before reading the
# tallies afterwards.
x_test1 = x_test1.apply(lambda doc: sum(v.transform([doc])[0].data))
x_test1

43401    50
12549    30
32707    24
14919    22
10689    15
         ..
37513    20
31       28
8799     41
5568     26
31500     8
Name: text, Length: 10919, dtype: int64

In [83]:
# Grand total of the per-document count sums held in x_test1.
x_test1.sum()

250373

In [88]:
# Number of explicitly stored entries in x_test
# (expects x_test to be the transformed sparse matrix at this point).
len(x_test.data)

221521

In [77]:
from scipy.sparse._csr import csr_matrix
from numpy import array

In [85]:
# Toy example: build a document-term count matrix incrementally in CSR form.
docs = [
    ["hello", "world", "hello", "friend"],
    ["goodbye", "cruel", "world"],
]
vocabulary = {}   # term -> column index, assigned in order of first appearance
indices = []      # column index of every term occurrence
data = []         # one entry of 1 per term occurrence
indptr = [0]      # row boundaries into `indices` / `data`

for d in docs:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
    data.extend([1] * len(d))
    indptr.append(len(indices))

# Repeated (row, column) entries are added together when the matrix is
# densified, which is how "hello" ends up with a count of 2 in the first row.
csr_matrix((data, indices, indptr), dtype=int).toarray()


array([[2, 1, 1, 0, 0],
       [0, 1, 0, 1, 1]])

In [76]:
# Display the sparse test matrix summary (the empty subscript `x_test[]` was a syntax error).
x_test

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 221521 stored elements and shape (10919, 38727)>

In [92]:
# For every (train dataset, test dataset) pair and every top-10 parameter set
# of each model, fit MyVectorizer on the training text with that parameter
# set's vectorizer settings, transform the other dataset's text, and record
# the share of test tokens that fall outside the learned vocabulary.
# The ratios are written, in loop order, as a flat JSON list.
results_file = Path.cwd() / 'oov_ratios.json'
results = []

# Weighting options accepted by TfidfVectorizer but not by CountVectorizer.
# MyVectorizer is CountVectorizer-based and would reject them; they do not
# influence which tokens are in the vocabulary, so they are skipped.
TFIDF_ONLY_PARAMS = {'norm', 'use_idf', 'smooth_idf', 'sublinear_tf'}

snowballer = SnowballStemmer('english')


def pp_SnowballStemmer(text):
    """Return ``text`` with every whitespace-separated word Snowball-stemmed."""
    return ' '.join(snowballer.stem(word) for word in text.split())


dataset_pairs = [
    ('d1', 'd2', d1, d2),
    ('d1', 'd3', d1, d3),
    ('d2', 'd1', d2, d1),
    ('d2', 'd3', d2, d3),
    ('d3', 'd1', d3, d1),
    ('d3', 'd2', d3, d2),
]

for idx, (tr_ds_name, te_ds_name, tr_df, te_df) in enumerate(dataset_pairs):
    print(f'Starting run {idx}')

    # The split depends only on the training frame and the seed, so it is
    # computed once per dataset pair rather than once per parameter set.
    x_train, x_val, y_train, y_val = train_test_split(
        tr_df['text'],
        tr_df['label'],
        test_size=0.1,
        shuffle=True,
        stratify=tr_df['label'],
        random_state=RANDOM_SEED
    )
    x_test = te_df['text']

    for model in ['catb', 'xgb']:
        param_results: pd.DataFrame = get_topn_param_sets(PARAMS_PATH, algo=model, dataset=tr_ds_name, n=10, sort_condition='f1_macro_mean')
        param_sets = param_results['params']

        for params in param_sets:
            # Pull the 'vectorizer__<name>' entries out of the parameter set.
            vect_params = {}
            for k, value in params.items():
                if 'vectorizer__' not in k:
                    continue
                p_name = k.split('__')[1]
                if p_name in TFIDF_ONLY_PARAMS:
                    continue
                if p_name == 'ngram_range':
                    # Stored as a list; the vectorizer expects a tuple.
                    vect_params[p_name] = tuple(value)
                elif p_name == 'preprocessor':
                    # Any string value selects the stemming preprocessor.
                    vect_params[p_name] = pp_SnowballStemmer if isinstance(value, str) else None
                else:
                    vect_params[p_name] = value

            print("Get OOV ratio")

            myv = MyVectorizer(**vect_params)
            myv.fit_transform(x_train)
            myv.transform(x_test)

            total = myv.oov_token_count + myv.iv_token_count
            # No tokens analysed at all: record null instead of dividing by zero.
            ratio = myv.oov_token_count / total if total else None

            results.append(ratio)

with open(results_file, 'w') as f:
    json.dump(results, f)

Starting run 0
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Starting run 1
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Starting run 2
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Starting run 3
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Get OOV ratio
Ge

In [93]:
# Sanity check: number of ratios stored in the results file.
with open('oov_ratios.json', 'r') as f:
    saved_ratios = json.load(f)

print(len(saved_ratios))

120
