In [22]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from tqdm import tqdm
import multiprocessing 
from time import sleep
import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

In [23]:
import sys
!{sys.executable} -m pip install multiprocessing

Collecting multiprocessing
  Using cached https://files.pythonhosted.org/packages/b8/8a/38187040f36cec8f98968502992dca9b00cc5e88553e01884ba29cbe6aac/multiprocessing-2.6.2.1.tar.gz
    Complete output from command python setup.py egg_info:
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "/tmp/pip-install-q7s30hnr/multiprocessing/setup.py", line 94
        print 'Macros:'
                      ^
    SyntaxError: Missing parentheses in call to 'print'. Did you mean print('Macros:')?
    
    ----------------------------------------
[31mCommand "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-q7s30hnr/multiprocessing/[0m
[33mYou are using pip version 10.0.1, however version 20.3b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Select your output and input Path

In [24]:
output_path="/home/alip/PycharmProjects/VolunteerismTransfer/data/new/result/weak-supervison/new/"
base_path="/home/alip/PycharmProjects/VolunteerismTransfer/data/new/"

# Load Data
We have 5 Different Inputs:
    Labeled data ,
    Labeled data + NGO accounts(labeled as  "volunteer"),
    Labeled data + NGO accounts(labeled by label prop ),
    Labeled data + random Data(labeled by label prop),
    Labeled data + NGO (labeled by label prop) + random Data(labeled by label prop)

In [25]:
data = pd.read_json(base_path+'data/new/PR_NGO+Labeled Data.json', orient='records', lines=True)

# Get hierarchical Event Category

In [13]:
def get_sample_weight(event_type, target):

    if event_type == target:
        return 10
    elif get_sample_category(event_type) == get_sample_category(target):
        return 6
    elif get_sample_category(event_type)=='general':
        return 1
    else:
        return 3

In [14]:
def get_sample_category(event_type):
    if 'earthquake' in event_type or "hurricane/typhoon/cyclone/tornado" in event_type or "flood" in event_type or "wildfire/bushfire" in event_type or "outbreak" in event_type:
        return 'natural'
    elif "bombing" in event_type or "shooting" in event_type or "explosion" in event_type or "collapse" in event_type:
        return "manmade"
    else:
        return "general"


In [15]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier(loss='hinge')),
])

# Train SVM Model 

In [19]:
def get_different_sampling_strategy(data,seed,heldout_event):
    performance_result={}  
    sampling_strategies=['none','up','down','up-with-same-eventtype','up-with-same-eventCategory']
    for sampling_strategy in tqdm(sampling_strategies):
        if sampling_strategy== 'none' or sampling_strategy=='up' or sampling_strategy=='down':
            performance_result[sampling_strategy+'-without-upweight']=evaluate_model(data,heldout_event,sampling_strategy,up_weighting=False,seed_number=seed)
            performance_result[sampling_strategy+'-with-upweight']=evaluate_model(data,heldout_event,sampling_strategy,up_weighting=True,seed_number=seed)
        else:
            performance_result[sampling_strategy]=evaluate_model(data,heldout_event,sampling_strategy,up_weighting=False,seed_number=seed)
    return performance_result

In [20]:
def evaluate_model(data,heldout_event, sampling_strategy,up_weighting,seed_number,label_spreading=False):
    #Decide with field to use for training as a gold-label
    if label_spreading== True:
        lab='label_spread'
    else:
        lab='label'
    
    # Split Test and trainig data based on heldout_event/heldout_eventType  and get the resample training data     
    main_training=data[data[groupby_col] != heldout_event]
    test = data[data[groupby_col] == heldout_event]  
    training=get_sample_data(main_training,seed_number,label_spreading)
        
    #Randomely give weights to each sample,then set higher weights to event of same-type
    if up_weighting == True:
        training['sample_weight'] = 100 * np.abs(np.random.randn(training.shape[0]))
        training['sample_weight'] = np.where(training['event_type'] ==  test.iloc[0].event_type,
                                training['sample_weight'] * 10,
                                training['sample_weight'])
    #Split training data to volunteer and non-volunterrs for further use in applying sampling strategies
    vol = training.loc[training[lab] == 1]
    non_vol = training.loc[training[lab] == 0]
    
    #Upsampling Strategy
    if sampling_strategy == 'up':
        training = non_vol.append(vol.sample(n=len(non_vol), replace=True), ignore_index=True)

    #Downsampling Strategy
    elif sampling_strategy == 'down':
        training = vol.append(non_vol.sample(n=len(vol), replace=False), ignore_index=True)

    #when up-sampling, resample primarily from events of the same type
    elif sampling_strategy== "up-with-same-eventtype":
        training['sample_weight'] = 1
        training['sample_weight'] = np.where(training['event_type'] == test.iloc[0].event_type,
                                                 training['sample_weight'] * 10,
                                                 training['sample_weight'])
        vol = training.loc[training[lab] == 1]
        non_vol = training.loc[training[lab] == 0]
        training = non_vol.append(vol.sample(n=len(non_vol),weights='sample_weight', replace=True), ignore_index=True)
    #when up-sampling, resample primarily from events of the same “kind” of event (manmade vs. natural) 
    elif sampling_strategy== "up-with-same-eventCategory":
        training['sample_weight'] = [get_sample_weight(x,test.iloc[0].event_type) for x in training['event_type']]
        vol = training.loc[training[lab] == 1]
        non_vol = training.loc[training[lab] == 0]
        training = non_vol.append(vol.sample(n=len(non_vol),weights='sample_weight', replace=True), ignore_index=True)


    X_train =[str(x) for x in training['processed_text']]
    X_test = [str(x) for x in test['processed_text']]
    y_train = training[lab]
    y_test = test['label']
    if up_weighting== True:
        model = pipeline_sgd.fit(X_train, y_train,nb__sample_weight=training['sample_weight'])
    else:
        model = pipeline_sgd.fit(X_train, y_train)


    y_predict = model.predict(X_test)
    recall =  recall_score(y_test, y_predict,zero_division=0)
    precision = precision_score(y_test, y_predict,zero_division=0)
    from sklearn.metrics import f1_score
    f1_score=f1_score(y_test, y_predict,zero_division=0)
    return {groupby_col: test.iloc[0].eventid+'-'+str(seed_number),'src': test.iloc[0].src, 'precision': precision, 'recall': recall, 'f1_score': f1_score}
    

In [21]:
def get_sample_data(data, seed_number,label_spreading=False):
        
    labeled_data = data.query("src == 'trec' or src == 'crisis_nlp'")
    other = data.query("src != 'trec' and src != 'crisis_nlp'")
    N=len(labeled_data)
    sample_data=labeled_data.sample(n=N, random_state=seed_number, replace=True)
    sample_data=sample_data.append(other)
    if label_spreading==True:
        sample_data['ft_features'] = [vectorize(str(s)) for s in sample_data["processed_text"]]
        sample_data['l']=sample_data.apply(lambda row: row['ft_features'].size, axis=1)
        sample_data=sample_data.query("l >1")
        sample_data.drop( columns=['l'],inplace=True)
        X_data = sample_data['ft_features']
        y_labels = sample_data['label'].copy()
        y_labels[sample_data['src'].isin(['top-accounts', 'random'])] = -1
        label_prop_model = LabelSpreading(kernel='knn', n_jobs=-1)
        y_learned = label_prop_model.fit(X_data.tolist(), y_labels).transduction_
        sample_data["label_spread"] = y_learned
        sample_data=sample_data.query("src == 'trec' or src=='crisis_nlp' or ((src=='top-accounts' or src=='random') and label_spread==1 )")
    return sample_data 
    
    

In [None]:
import gensim.downloader as api
print(api.load('glove-twitter-200', return_path=True))

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

In [None]:
path = os.path.join(base_dir, 'glove-twitter-100', 'glove-twitter-100.gz')
model_gensim = KeyedVectors.load_word2vec_format(path)

wvs = model_gensim.wv

In [None]:
vectorizer = TfidfVectorizer(
    use_idf=True,
    smooth_idf=False,
    norm=None,  # Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=4,
    max_df=0.501
)

analyzer = vectorizer.build_analyzer()

In [None]:
def vectorize(sentence):

    tokenized = [t for t in analyzer(sentence)]

    wv_vecs = []
    for t in tokenized:

        try:
            v = wvs[t]
            norm = np.linalg.norm(v)
            normed_v = (v / norm)
            wv_vecs.append(normed_v)
        except:
            continue

    m = np.array(wv_vecs)
    normed_m = np.mean(m, axis=0)

    normed_m = np.nan_to_num(normed_m)


    return normed_m



In [None]:
manager = multiprocessing.Manager()

# Define a list (queue) for tasks and computation results
tasks = manager.Queue()
results = manager.Queue()
num_processes = 13
pool = multiprocessing.Pool(processes=num_processes)
processes = []
for event in events:

    if test.iloc[0].src=='trec' and test.shape[0] > 30:
        # Set process name
        process_name = event

        # Create the process, and connect it to the worker function
        for seed in seeds:
            new_process = multiprocessing.Process(target=get_different_sampling_strategy, args=((data,seed_number=seed,heldout_event=event))

        # Add new process to the list of processes
        processes.append(new_process)

        # Start the process
        new_process.start('spawn')

In [32]:

# Specify how to split training and test data by chooisng value from ['eventid','event_type']
#Only consider events with more than/equal to 30 volunteer tweets as a held-out event
groupby_col='eventid'
seeds=np.arange(1,21)
all_events=data.groupby([groupby_col]).groups.keys()
events=[]
for event in tqdm(all_events):
    training=data[data['eventid'] == event]
    if training.iloc[0].src != 'trec':
        continue;
    vol=training.query("label == 1 ")
    if vol.shape[0]>=30:
               events.append(event)
               


100%|██████████| 62/62 [00:00<00:00, 80.78it/s]


In [None]:
#return dictiony contains result for applying different sampling strategies on sample of training data with initail seed 
#Key of the dictionary is the sampling strategy
#this should be called  number of heldout_event * number of seeds
result=get_different_sampling_strategy(data,seed_number=1,heldout_event='philippinesEarthquake2019')

In [None]:
#return dictiony contains result for sample of training data with initail seed and specified sampling strategy
#Label Spreading is applied if  label_spreading is set to True otherwise it is false by default
result=evaluate_model(data,heldout_event='philippinesEarthquake2019',sampling_strategy='up',up_weighting=True,seed_number=1,label_spreading=True)