In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, cohen_kappa_score, precision_score, recall_score, \
    precision_recall_curve

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from tqdm import tqdm

import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir

In [None]:
# Install pandas with the interpreter that runs this kernel (sys.executable), so the
# package lands in the kernel's own environment rather than whichever `pip` is on PATH.
# NOTE(review): pandas is already imported by the first cell of this notebook, so this
# cell only helps when it is executed before that import on a fresh environment.
import sys
!{sys.executable} -m pip install pandas

# Select your output and input paths

In [None]:
# Both locations live under the same project directory. Deriving them from a single
# root keeps them consistent: the output path was previously rooted at
# "/home/VolunteerismTransfe/" (missing the final "r"), unlike the input path.
project_root="/home/VolunteerismTransfer/"
# Directory that receives the evaluation results written at the end of the notebook.
output_path=project_root+"out/model/fasttext/weaksupervison/"
# Directory prefix used when loading the input data.
base_path=project_root+"data/"

# Load Data
We have 5 different inputs:

- Labeled data
- Labeled data + NGO accounts (labeled as "volunteer")
- Labeled data + NGO accounts (labeled by label propagation)
- Labeled data + random data (labeled by label propagation)
- Labeled data + NGO accounts (labeled by label propagation) + random data (labeled by label propagation)

In [None]:
# Read the labeled dataset (JSON Lines: one record per line) into a DataFrame.
# NOTE(review): base_path as configured in this notebook already ends in "data/", so
# this resolves to ".../data/data/FT_Labeled.json" -- confirm the nesting is intended.
data = pd.read_json(
    base_path + 'data/FT_Labeled.json',
    lines=True,
    orient='records',
)

# Get hierarchical Event Category

In [None]:
def get_sample_weight(event_type, target):
    """Return the sampling weight of `event_type` relative to the `target` event type.

    Weights, from closest to farthest match:
        10 -- `event_type` equals `target`
         6 -- both map to the same category according to `get_sample_category`
         1 -- `event_type` maps to the 'general' category (and `target` does not)
         3 -- every other combination
    """
    if event_type == target:
        return 10

    # Only categorise once the exact-match shortcut has been ruled out.
    own_category = get_sample_category(event_type)
    if own_category == get_sample_category(target):
        return 6

    return 1 if own_category == 'general' else 3

In [None]:
def get_sample_category(event_type):
    """Map an event type to a coarse category: 'natural', 'manmade' or 'general'.

    The decision is a containment test (`marker in event_type`) against fixed marker
    strings. Natural-disaster markers are checked first, then man-made ones; anything
    that matches neither list is 'general'.
    """
    natural_markers = (
        'earthquake',
        "hurricane/typhoon/cyclone/tornado",
        "flood",
        "wildfire/bushfire",
        "outbreak",
    )
    manmade_markers = ("bombing", "shooting", "explosion", "collapse")

    if any(marker in event_type for marker in natural_markers):
        return 'natural'
    if any(marker in event_type for marker in manmade_markers):
        return "manmade"
    return "general"


# Train SVM Model 

In [None]:
def train_model(X_train,Y_train,sample_weight=None):
    """Fit an SGDClassifier with hinge loss and return the fitted pipeline.

    Parameters:
        X_train: Series whose values are the per-sample feature vectors.
        Y_train: labels, looked up by the index labels of `X_train`.
        sample_weight: optional per-sample weights forwarded to the classifier's
            `fit`; None leaves the samples unweighted.

    Returns:
        The fitted single-step Pipeline (the step is named 'nb').
    """
    # Materialise features and labels as plain lists, pairing them by index label.
    features = [vector for _, vector in X_train.items()]
    labels = [Y_train[idx] for idx, _ in X_train.items()]

    classifier = Pipeline([('nb', SGDClassifier(loss='hinge'))])
    # The "nb__" prefix routes the fit parameter to the pipeline step named 'nb'.
    return classifier.fit(features, labels, nb__sample_weight=sample_weight)

In [None]:
# Column used to split the data into training and test sets (one group is held out
# at a time). Choose one of: 'eventid', 'event_type'.
groupby_col='eventid'

# Rebalancing strategy. Choose one of:
# 'none', 'up', 'down', 'up-with-same-eventtype', 'up-with-same-eventCategory'.
sampling_strategy='up'

# Whether to up-weight training samples that share the held-out event's event type.
up_weighting=True

# Group by the bare column name rather than a one-element list: grouping by a list of
# length 1 can produce 1-tuple keys on newer pandas versions, which would break the
# scalar comparisons made against each key later on.
events=data.groupby(groupby_col).groups.keys()
# Empty results table: one row per held-out group is added by the evaluation loop.
result=pd.DataFrame(columns=[groupby_col,'src','precision', 'recall', 'f1_score'])
# Placeholder string that is printed just before the results are saved.
skip=""

In [None]:
# Leave-one-group-out evaluation: every group in `events` is held out once as the test
# set while a model is trained on all remaining rows.
fold_rows = []  # one metrics record per held-out group; becomes `result` after the loop
for event in tqdm(events):
    if event == 'general':
        continue
    # .copy() yields an independent frame, so adding the 'sample_weight' column below
    # never writes into a slice of `data` (avoids SettingWithCopyWarning).
    training = data[data[groupby_col] != event].copy()
    test = data[data[groupby_col] == event]

    if up_weighting:
        # NOTE(review): the base weights are unseeded random draws (100 * |N(0, 1)|), so
        # runs are not reproducible -- confirm this is intended rather than a constant.
        base_weight = 100 * np.abs(np.random.randn(training.shape[0]))
        # Rows that share the held-out event's type weigh ten times their base weight.
        training['sample_weight'] = np.where(training['event_type'] == test.iloc[0].event_type,
                                             base_weight * 10,
                                             base_weight)

    vol = training.loc[training.label == 1]
    non_vol = training.loc[training.label == 0]

    # Rebalance the two label classes by sampling with replacement.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    if sampling_strategy == 'up':
        training = pd.concat([non_vol, vol.sample(n=len(non_vol), replace=True)],
                             ignore_index=True)

    elif sampling_strategy == 'down':
        training = pd.concat([vol, non_vol.sample(n=len(vol), replace=True)],
                             ignore_index=True)

    elif sampling_strategy in ("up-with-same-eventtype", "up-with-same-eventCategory"):
        # Both strategies overwrite 'sample_weight' (including any weights assigned by
        # the up-weighting step above) and use it to bias which label-1 rows are drawn.
        held_out_type = test.iloc[0].event_type
        if sampling_strategy == "up-with-same-eventtype":
            training['sample_weight'] = np.where(training['event_type'] == held_out_type, 10, 1)
        else:
            training['sample_weight'] = [get_sample_weight(x, held_out_type)
                                         for x in training['event_type']]
        vol = training.loc[training.label == 1]
        non_vol = training.loc[training.label == 0]
        training = pd.concat(
            [non_vol, vol.sample(n=len(non_vol), weights='sample_weight', replace=True)],
            ignore_index=True)

    if training.shape[0] == 0:
        # Nothing to train on: report the group and record zero scores.
        print(event)
        precision = recall = f1 = 0
    else:
        fit_weights = training['sample_weight'] if up_weighting else None
        model = train_model(training['ft_features'], training['label'],
                            sample_weight=fit_weights)

        y_test = test['label']
        y_predict = model.predict(test['ft_features'].tolist())
        recall = recall_score(y_test, y_predict)
        precision = precision_score(y_test, y_predict)
        # roc = roc_auc_score(y_test, y_predict)
        # Stored under a separate local name so the imported f1_score function is not
        # overwritten by its own result.
        f1 = f1_score(y_test, y_predict)

    fold_rows.append({groupby_col: event, 'src': test.iloc[0].src,
                      'precision': precision, 'recall': recall, 'f1_score': f1})

# Build the results table once instead of growing a DataFrame row by row; re-running
# this cell therefore replaces the table rather than appending duplicate rows.
result = pd.DataFrame(fold_rows, columns=[groupby_col, 'src', 'precision', 'recall', 'f1_score'])


# Save Result with proper filename

In [None]:
# Build an output file name that records the split column, the rebalancing strategy and
# whether up-weighting was enabled, then write the per-group metrics as CSV.
# (The fallback was previously an unterminated string literal, i.e. a syntax error.)
up_weight_label='_upweight' if up_weighting else ""
fileName="labeled + NGO _"
filepath=output_path+fileName+groupby_col+"_sampling_strategy-"+sampling_strategy+up_weight_label
print(skip)
result.to_csv(filepath+".csv")
# result.to_json(filepath+".json", orient='records', lines=True)