In [1]:
import pandas as pd
import sys
sys.path.append('../../')
from config import Config
from experiments_utils import *

In [2]:
# Path of the features file, taken from the project Config; it is loaded with
# pd.read_csv in the next cell.
training_feats_file = Config.ALL_FEATURES_FILE_PATH

In [3]:
# Read the features file into a DataFrame. The experiment loop further down
# selects rows from it through its 'source' and 'constructive' columns.
training_feats_df = pd.read_csv(training_feats_file)

In [4]:
# Corpus combinations to run the experiments on. An entry names one or more
# sources joined by '+'; the experiment loop splits each entry on '+' to pick
# the matching rows.
data_sources = [
    'SOCC',
    'NYTPicks+YNACC',
    'SOCC+NYTPicks+YNACC',
    'SOCC+NYTPicks',
]

In [7]:
# Feature groups derived from crowd annotations. The experiment loop adds
# these only when the data source is exactly 'SOCC'.
crowd_annotated_feature_set = [
    'constructiveness_chars_feats',
    'non_constructiveness_chars_feats',
    'toxicity_chars_feats',
]

# Feature groups that are included for every data source.
automatically_extracted_feature_set = [
    # 'text_feats',  (currently disabled)
    'length_feats',
    'argumentation_feats',
    'COMMENTIQ_feats',
    'named_entity_feats',
]

# Perspective feature groups, also included for every data source.
# NOTE(review): 'perspecitive_toxicity_feats' is misspelled but is kept
# verbatim; presumably it has to match the key used by experiments_utils --
# confirm before renaming.
perspective_feature_set = [
    'perspective_content_value_feats',
    'perspective_aggressiveness_feats',
    'perspecitive_toxicity_feats',
]

In [8]:
# Seed for the NYTPicks down-sampling below, so that the 'SOCC+NYTPicks'
# training set (and therefore its cross-validation scores) is the same on
# every run of this cell.
SAMPLING_SEED = 42

# Run the cross-validation experiments once per entry of data_sources and
# print the returned results.
for data_source in data_sources:
    print('----------------------')
    print('DATA SOURCE: ', data_source)
    print('----------------------')

    # Crowd-annotated features are added only when the data source is exactly
    # 'SOCC'; every other combination uses the automatically extracted and
    # perspective features alone.
    if data_source == 'SOCC':
        feature_set = crowd_annotated_feature_set + automatically_extracted_feature_set + perspective_feature_set
    else:
        feature_set = automatically_extracted_feature_set + perspective_feature_set

    print('CROSS VALIDATION EXPERIMENTS: ')
    sources = data_source.split('+')

    if data_source == 'SOCC+NYTPicks':
        # Pair the SOCC rows labelled constructive == 0 with an equally sized
        # random sample of NYTPicks rows (used as the positive examples).
        subset_df = training_feats_df[training_feats_df['source'].isin(sources)]
        SOCC_neg_df = subset_df[(subset_df['source'] == 'SOCC') & (subset_df['constructive'] == 0)]
        NYTPicks_df = subset_df[(subset_df['source'] == 'NYTPicks')]
        # random_state makes the sample reproducible. Note that sample() raises
        # ValueError if there are fewer NYTPicks rows than SOCC negatives.
        NYTPicks_df_sample = NYTPicks_df.sample(n=SOCC_neg_df.shape[0],
                                                random_state=SAMPLING_SEED)
        train_df = pd.concat([SOCC_neg_df, NYTPicks_df_sample])
    else:
        # Use every row whose source is one of the requested sources.
        train_df = training_feats_df[training_feats_df['source'].isin(sources)]

    results = run_cross_validation_experiments(train_df, feature_set)
    print('\nCross-validation results: ',)
    for key, val in results.items():
        print(key, '=>', val)
    print('----------------------')

    # NOTE(review): save_path is not used anywhere in this cell; presumably it
    # was meant for the training-size plot -- confirm before removing.
    save_path = data_source + '.png'

    print('TRAIN SIZE EXPERIMENTS: ')
    # The training-size experiments are currently disabled.
    #run_training_size_experiments(train_df, feature_set)

    print('\n----------------------------\n')

----------------------
DATA SOURCE:  SOCC
----------------------
CROSS VALIDATION EXPERIMENTS: 
Size of the training data:  12000 	Constructive ( 5906 ) 	Non constructive ( 6094 )
Cross validation folds:  10
Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['constructiveness_chars_feats', 'non_constructiveness_chars_feats', 'toxicity_chars_feats', 'text_feats', 'length_feats', 'argumentation_feats', 'COMMENTIQ_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']

Cross-validation results: 
variance => 3.767298979762616e-05
scores => [0.92857143 0.93929174 0.92527288 0.92917369 0.91794872 0.93887946
 0.93548387 0.92

KeyboardInterrupt: 