In [1]:
import pandas as pd
import sys
sys.path.append('../../')
from config import Config
from experiments_utils import *
import pickle as pkl
from IPython.display import display, HTML

## Read CSV containing comments and features from all datasets

In [2]:
# Path of the features CSV, taken from the project-level Config object.
training_feats_file = Config.ALL_FEATURES_FILE_PATH

In [3]:
# Load the features CSV; later cells read its 'pp_comment_text' and 'source' columns.
training_feats_df = pd.read_csv(training_feats_file)

In [4]:
# Whitespace-token count of each preprocessed comment.
# Missing or non-string comment text is treated as empty (count 0) instead of
# raising AttributeError from calling .split() on a NaN/float value.
training_feats_df['comment_len'] = (
    training_feats_df['pp_comment_text']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

## `SOCC_df`: annotated SOCC instances that use the new annotation scheme (`source == 'SOCC'`)

In [5]:
# Keep only the rows whose 'source' value is exactly 'SOCC'.
SOCC_df = training_feats_df.loc[training_feats_df['source'].eq('SOCC')]

## `all_SOCC_df`: all annotated SOCC instances (every row whose `source` ends in `SOCC`)

In [6]:
# Keep every row whose 'source' value ends with 'SOCC'.
# na=False makes rows with a missing source evaluate to False; without it the
# mask would contain NaN and boolean indexing would raise.
all_SOCC_df = training_feats_df[
    training_feats_df['source'].str.endswith('SOCC', na=False)
]

## Feature sets 

In [7]:
# Feature-group names, organised by family. The experiments below select
# their feature set from these lists.
text_feats = [
    'text_feats',
]

# Groups this notebook treats as length-dependent.
len_dependent_feats = [
    'length_feats',
    'argumentation_feats',
    'COMMENTIQ_feats',
    'named_entity_feats',
]

# Defined here but not included in all_feats below.
crowd_annotated_feats = [
    'constructiveness_chars_feats',
    'non_constructiveness_chars_feats',
    'toxicity_chars_feats',
]

# NOTE(review): 'perspecitive_toxicity_feats' is misspelled but kept
# byte-for-byte; presumably it matches the key expected by experiments_utils
# -- confirm before renaming.
perspective_feats = [
    'perspective_content_value_feats',
    'perspective_aggressiveness_feats',
    'perspecitive_toxicity_feats',
]

# Text + length-dependent + perspective groups, in that order.
all_feats = [*text_feats, *len_dependent_feats, *perspective_feats]

In [8]:
def pretty_print_results(avg_results_dict):
    """Render averaged experiment results as one HTML table per test subset.

    Args:
        avg_results_dict: mapping from a test-subset label to a dict that
            ``pd.DataFrame.from_dict(..., orient='index')`` can consume; the
            keys of that inner dict become the table's row labels.

    Returns:
        None. Everything is emitted through IPython's ``display``.
    """
    # Function-scope import: keeps this cell runnable without touching the
    # notebook's import cell.
    from html import escape

    for test_subset, res in avg_results_dict.items():
        # str() tolerates non-string labels; escape() stops characters such as
        # '<' or '&' in a label from being interpreted as markup.
        display(HTML('<h2>' + escape(str(test_subset)) + '</h2>'))
        results_df = pd.DataFrame.from_dict(res, orient='index')
        display(HTML(results_df.to_html()))

## Only length-dependent features

In [9]:
# Call run_n_experiments on all annotated SOCC rows with only the
# length-dependent feature groups; the result is rendered in the next cell.
# NOTE(review): the recorded outputs show different train/test sizes for each
# run_n_experiments call, so the split appears to be re-drawn per call --
# confirm before comparing feature sets against each other.
feature_set = len_dependent_feats
avg_results_len_dict = run_n_experiments(all_SOCC_df, feature_set) 


-----------------------------
EXPERIMENT:  1
-----------------------------

Training samples:  10408
Test samples:  2627

TRAIN SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  4689 	Constructive ( 2795 ) 	Non constructive ( 1894 )
None
Distribution in hard samples: 
Size of the data:  1201 	Constructive ( 1014 ) 	Non constructive ( 187 )
None

TEST SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  1254 	Constructive ( 760 ) 	Non constructive ( 494 )
None
Distribution in hard samples: 
Size of the data:  324 	Constructive ( 278 ) 	Non constructive ( 46 )
None

TRAINING...

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['length_feats', 'argu

Model trained and pickled in file:  /home/vkolhatk/data/Constructiveness_public/models/tmp_model.pkl

TESTING ON ALL TEST SAMPLES

Size of the data:  2646 	Constructive ( 1335 ) 	Non constructive ( 1311 )
Accuracy:  0.9176114890400605
Precision-recall for each class:  (array([0.93135011, 0.90411985]), array([0.9051149, 0.9306091]), array([0.91804511, 0.91717325]), array([1349, 1297]))
                  precision    recall  f1-score   support

non-constructive       0.91      0.93      0.92      1311
    constructive       0.93      0.90      0.92      1335

     avg / total       0.92      0.92      0.92      2646

Results: 
micro_average => (0.9176114890400605, 0.9176114890400605, 0.9176114890400605, None)
weighted_average => (0.9180025510357935, 0.9176114890400605, 0.9176177495652095, None)
macro_average => (0.917734982301871, 0.917861998922072, 0.9176091825307952, None)
<class 'dict'>

TESTING ON BALANCED TEST SAMPLES

Size of the data:  1244 	Constructive ( 766 ) 	Non constructive 

In [10]:
# Show one HTML table per test subset for the length-dependent run.
pretty_print_results(avg_results_len_dict)

Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.912945,0.912823,0.912908
micro_average,0.912838,0.912838,0.912838
weighted_average,0.912838,0.912839,0.913049


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.827094,0.828634,0.830972
micro_average,0.835404,0.835404,0.835404
weighted_average,0.835404,0.83494,0.835226


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.369758,0.311697,0.27182
micro_average,0.449106,0.449106,0.449106
weighted_average,0.449106,0.378634,0.329482


## only perspective features

In [11]:
# Call run_n_experiments on all annotated SOCC rows with only the
# perspective feature groups; the result is rendered in the next cell.
feature_set = perspective_feats
avg_results_perspective_dict = run_n_experiments(all_SOCC_df, feature_set) 


-----------------------------
EXPERIMENT:  1
-----------------------------

Training samples:  10423
Test samples:  2612

TRAIN SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  4751 	Constructive ( 2819 ) 	Non constructive ( 1932 )
None
Distribution in hard samples: 
Size of the data:  1210 	Constructive ( 1012 ) 	Non constructive ( 198 )
None

TEST SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  1192 	Constructive ( 736 ) 	Non constructive ( 456 )
None
Distribution in hard samples: 
Size of the data:  315 	Constructive ( 280 ) 	Non constructive ( 35 )
None

TRAINING...

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['perspective_content_

Size of the data:  4748 	Constructive ( 2823 ) 	Non constructive ( 1925 )
None
Distribution in hard samples: 
Size of the data:  1222 	Constructive ( 1040 ) 	Non constructive ( 182 )
None

TEST SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  1195 	Constructive ( 732 ) 	Non constructive ( 463 )
None
Distribution in hard samples: 
Size of the data:  303 	Constructive ( 252 ) 	Non constructive ( 51 )
None

TRAINING...

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
Size of the training data:  10439 	Constructive ( 5180 ) 	Non constructive ( 5259 )
Model trained and p

In [12]:
# Show one HTML table per test subset for the perspective-only run.
pretty_print_results(avg_results_perspective_dict)

Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.8795,0.878143,0.878556
micro_average,0.878214,0.878214,0.878214
weighted_average,0.878214,0.878288,0.880127


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.76289,0.751203,0.745921
micro_average,0.769015,0.769015,0.769015
weighted_average,0.769015,0.772883,0.782276


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.463623,0.438619,0.441018
micro_average,0.610181,0.610181,0.610181
weighted_average,0.610181,0.562804,0.538478


## only text features

In [13]:
# Call run_n_experiments on all annotated SOCC rows with only the
# text feature group; the result is rendered in the next cell.
feature_set = text_feats
avg_results_text_dict = run_n_experiments(all_SOCC_df, feature_set)


-----------------------------
EXPERIMENT:  1
-----------------------------

Training samples:  10430
Test samples:  2605

TRAIN SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  4786 	Constructive ( 2837 ) 	Non constructive ( 1949 )
None
Distribution in hard samples: 
Size of the data:  1222 	Constructive ( 1031 ) 	Non constructive ( 191 )
None

TEST SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  1157 	Constructive ( 718 ) 	Non constructive ( 439 )
None
Distribution in hard samples: 
Size of the data:  303 	Constructive ( 261 ) 	Non constructive ( 42 )
None

TRAINING...

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['text_feats']
Size of

Model trained and pickled in file:  /home/vkolhatk/data/Constructiveness_public/models/tmp_model.pkl

TESTING ON ALL TEST SAMPLES

Size of the data:  2597 	Constructive ( 1268 ) 	Non constructive ( 1329 )
Accuracy:  0.7443203696572969
Precision-recall for each class:  (array([0.5131678 , 0.98659306]), array([0.97567954, 0.65911486]), array([0.67258383, 0.79027164]), array([ 699, 1898]))
                  precision    recall  f1-score   support

non-constructive       0.98      0.51      0.67      1329
    constructive       0.66      0.99      0.79      1268

     avg / total       0.82      0.74      0.73      2597

Results: 
micro_average => (0.7443203696572969, 0.7443203696572969, 0.7443203696572969, None)
weighted_average => (0.8591674688869095, 0.7443203696572969, 0.7585951713734086, None)
macro_average => (0.7498804276358734, 0.817397199974071, 0.7314277312819516, None)
<class 'dict'>

TESTING ON BALANCED TEST SAMPLES

Size of the data:  1143 	Constructive ( 675 ) 	Non constructi

Accuracy:  0.7657342657342657
Precision-recall for each class:  (array([0.        , 0.93191489]), array([0.        , 0.81111111]), array([0.        , 0.86732673]), array([ 16, 270]))
                  precision    recall  f1-score   support

non-constructive       0.00      0.00      0.00        51
    constructive       0.81      0.93      0.87       235

     avg / total       0.67      0.77      0.71       286

Results: 
micro_average => (0.7657342657342657, 0.7657342657342657, 0.7657342657342657, None)
weighted_average => (0.8797797946734117, 0.7657342657342657, 0.8188049574188186, None)
macro_average => (0.46595744680851064, 0.40555555555555556, 0.4336633663366336, None)
<class 'dict'>


In [None]:
# Show one HTML table per test subset for the text-only run.
pretty_print_results(avg_results_text_dict)

Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.818301,0.737847,0.754739
micro_average,0.749786,0.749786,0.749786
weighted_average,0.749786,0.762913,0.857338


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.728231,0.479447,0.548883
micro_average,0.630866,0.630866,0.630866
weighted_average,0.630866,0.72942,0.937047


Unnamed: 0,mean_P,mean_F1,mean_R
macro_average,0.41701,0.439046,0.463729
micro_average,0.782784,0.782784,0.782784
weighted_average,0.782784,0.824272,0.870753


## Only non-length features (text + perspective)

In [None]:
# Call run_n_experiments on all annotated SOCC rows with the text and
# perspective groups combined (i.e. without the length-dependent groups).
feature_set = text_feats + perspective_feats
avg_results_non_len_dict = run_n_experiments(all_SOCC_df, feature_set) 


-----------------------------
EXPERIMENT:  1
-----------------------------

Training samples:  10437
Test samples:  2598

TRAIN SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  4720 	Constructive ( 2834 ) 	Non constructive ( 1886 )
None
Distribution in hard samples: 
Size of the data:  1202 	Constructive ( 1015 ) 	Non constructive ( 187 )
None

TEST SET DISTRIBUTIONS: 
Distribution in balanced samples: 
Size of the data:  1223 	Constructive ( 721 ) 	Non constructive ( 502 )
None
Distribution in hard samples: 
Size of the data:  323 	Constructive ( 277 ) 	Non constructive ( 46 )
None

TRAINING...

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['text_feats', 'perspe

Model trained and pickled in file:  /home/vkolhatk/data/Constructiveness_public/models/tmp_model.pkl

TESTING ON ALL TEST SAMPLES

Size of the data:  2634 	Constructive ( 1319 ) 	Non constructive ( 1315 )
Accuracy:  0.880030372057707
Precision-recall for each class:  (array([0.84790875, 0.91205459]), array([0.90576767, 0.85744833]), array([0.87588374, 0.88390889]), array([1231, 1403]))
                  precision    recall  f1-score   support

non-constructive       0.91      0.85      0.88      1315
    constructive       0.86      0.91      0.88      1319

     avg / total       0.88      0.88      0.88      2634

Results: 
micro_average => (0.880030372057707, 0.880030372057707, 0.880030372057707, None)
weighted_average => (0.8820760253193347, 0.880030372057707, 0.8801583357462275, None)
macro_average => (0.8799816660276683, 0.8816079967899818, 0.8798963148602091, None)
<class 'dict'>

TESTING ON BALANCED TEST SAMPLES

Size of the data:  1179 	Constructive ( 715 ) 	Non constructive (

In [None]:
# Show one HTML table per test subset for the text + perspective run.
pretty_print_results(avg_results_non_len_dict)

## All features (text + length-dependent + perspective; crowd-annotated features are not included)

In [None]:
# Call run_n_experiments on all annotated SOCC rows with all_feats
# (text + length-dependent + perspective groups).
feature_set = all_feats
avg_results_all_dict = run_n_experiments(all_SOCC_df, feature_set) 

In [None]:
# Show one HTML table per test subset for the all_feats run.
pretty_print_results(avg_results_all_dict)

In [None]:
#train_set = create_numeric_representation_of_text_and_labels(test_set_df,
#                                                             text_col = 'pp_comment_text',
#                                                             target_col = 'constructive')
#trainX, trainY, validationX, validationY = get_preprocessed_and_padded_train_validation_splits(train_set)


In [None]:
#bilstm_classifier = BiLSTMConstructivenessClassifier(mode = 'test', model_path = Config.BILSTM_MODEL_PATH)

In [None]:
#bilstm_classifier.predict(trainX, trainY)