In [1]:
import pandas as pd
import sys
sys.path.append('../../')
from config import Config
from experiments_utils import *

# Experiments with feature sets 

# Find correlation of each feature column with the target column 

In [2]:
training_feats_file = Config.ALL_FEATURES_FILE_PATH
training_feats_df = pd.read_csv(training_feats_file)
SOCC_df = training_feats_df[training_feats_df['source'].isin(['SOCC'])]

In [3]:
annotation_cols = ['constructive']
annotation_df = SOCC_df[annotation_cols]

# Correlation with length features

In [4]:
length_feats_cols = ['length',
                      'average_word_length',
                      'nSents', 
                      'avg_words_per_sent'
                     ]

In [5]:
get_corr_df(SOCC_df, length_feats_cols, annotation_cols)

Unnamed: 0,constructive
length,0.622955
average_word_length,-0.041142
nSents,0.533597
avg_words_per_sent,0.350866
constructive,1.0


# Correlation with argumentation features

In [6]:
argumentation_feats_cols = ['has_conjunctions_and_connectives',
                            'has_stance_adverbials', 
                            'has_reasoning_verbs', 
                            'has_modals', 
                            'has_shell_nouns']

In [7]:
get_corr_df(SOCC_df, argumentation_feats_cols, annotation_cols)

Unnamed: 0,constructive
has_conjunctions_and_connectives,0.246693
has_stance_adverbials,0.30445
has_reasoning_verbs,0.386248
has_modals,0.414188
has_shell_nouns,0.331579
constructive,1.0


# Correlation with COMMENTIQ features

In [8]:
COMMENTIQ_feats_cols = ['readability_score', 
                        'personal_exp_score']                                     

In [9]:
get_corr_df(SOCC_df, COMMENTIQ_feats_cols, annotation_cols)

Unnamed: 0,constructive
readability_score,0.721989
personal_exp_score,-0.025497
constructive,1.0


# Correlation with named-entity features

In [10]:
named_entity_feats_cols = ['named_entity_count']

In [11]:
get_corr_df(SOCC_df, named_entity_feats_cols, annotation_cols)

Unnamed: 0,constructive
named_entity_count,0.52895
constructive,1.0


# Correlation with crowd-annotated constructiveness characteristics

In [12]:
constructiveness_chars_feats_cols = ['specific_points', 
                                     'dialogue', 
                                     'evidence', 
                                     'personal_story', 
                                     'solution', 
                                     'no_con']

In [13]:
get_corr_df(SOCC_df, constructiveness_chars_feats_cols, annotation_cols)

Unnamed: 0,constructive
specific_points,0.430854
dialogue,0.416232
evidence,0.348249
personal_story,0.192803
solution,0.400866
no_con,-0.642889
constructive,1.0


# Correlation with crowd-annotated non-constructiveness characteristics

In [14]:
non_constructiveness_chars_feats = ['no_respect', 
                                    'provocative', 
                                    'sarcastic', 
                                    'non_relevant', 
                                    'unsubstantial', 
                                    'no_non_con']

In [15]:
get_corr_df(SOCC_df, non_constructiveness_chars_feats, annotation_cols)

Unnamed: 0,constructive
no_respect,-0.24679
provocative,-0.120312
sarcastic,-0.097141
non_relevant,-0.430287
unsubstantial,-0.465158
no_non_con,0.288171
constructive,1.0


# Correlation with crowd-annotated toxicity characteristics

In [16]:
toxicity_chars_feats = ['personal_attack', 
                        'teasing', 
                        'abusive', 
                        'embarrassment', 
                        'inflammatory', 
                        'no_toxic']

In [17]:
get_corr_df(SOCC_df, toxicity_chars_feats, annotation_cols)

Unnamed: 0,constructive
personal_attack,-0.032756
teasing,-0.042614
abusive,-0.037423
embarrassment,-0.008899
inflammatory,-0.01075
no_toxic,0.032251
constructive,1.0


# Correlation with perspective content value features

In [18]:
            
perspective_content_value_feats = ['OFF_TOPIC:probability', 
                                   'SPAM:probability',
                                   'UNSUBSTANTIAL:probability',
                                   'INCOHERENT:probability'
                                  ]

In [19]:
get_corr_df(SOCC_df, perspective_content_value_feats, annotation_cols)

Unnamed: 0,constructive
OFF_TOPIC:probability,-0.291174
SPAM:probability,0.060843
UNSUBSTANTIAL:probability,-0.737758
INCOHERENT:probability,-0.147813
constructive,1.0


# Correlation with perspective agressiveness features

In [20]:
perspective_aggressiveness_feats = ['ATTACK_ON_AUTHOR:probability',
            'ATTACK_ON_COMMENTER:probability', 'ATTACK_ON_PUBLISHER:probability',
    
]

In [21]:
get_corr_df(SOCC_df, perspective_aggressiveness_feats, annotation_cols)

Unnamed: 0,constructive
ATTACK_ON_AUTHOR:probability,-0.060131
ATTACK_ON_COMMENTER:probability,-0.240236
ATTACK_ON_PUBLISHER:probability,-0.083863
constructive,1.0


# Correlation with perspective toxicity features

In [22]:
perspecitive_toxicity_feats = ['SEVERE_TOXICITY:probability', 
                               'SEXUALLY_EXPLICIT:probability',
                               'TOXICITY:probability', 
                               'TOXICITY_IDENTITY_HATE:probability',
                               'TOXICITY_INSULT:probability',
                               'TOXICITY_OBSCENE:probability',
                               'TOXICITY_THREAT:probability', 
                               'INFLAMMATORY:probability',
                               'LIKELY_TO_REJECT:probability', 
                               'OBSCENE:probability',
                              ]

In [23]:
get_corr_df(SOCC_df, perspecitive_toxicity_feats, annotation_cols)

Unnamed: 0,constructive
SEVERE_TOXICITY:probability,-0.031688
SEXUALLY_EXPLICIT:probability,0.061247
TOXICITY:probability,-0.026158
TOXICITY_IDENTITY_HATE:probability,0.205361
TOXICITY_INSULT:probability,0.015615
TOXICITY_OBSCENE:probability,-0.007402
TOXICITY_THREAT:probability,0.139481
INFLAMMATORY:probability,-0.091781
LIKELY_TO_REJECT:probability,-0.386832
OBSCENE:probability,-0.044699


# Length features experiments

In [24]:
MIN_LEN = 35
MAX_LEN = 65

In [25]:
SOCC_df['comment_len'] = SOCC_df['pp_comment_text'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [26]:
SOCC_df_subset = SOCC_df[ (SOCC_df['comment_len'] > MIN_LEN) & (SOCC_df['comment_len'] < MAX_LEN)]

In [27]:
SOCC_df_subset.shape

(2701, 53)

In [28]:
SOCC_df_subset['constructive'].value_counts()

0    1436
1    1265
Name: constructive, dtype: int64

In [29]:
run_cross_validation_experiments(SOCC_df_subset, Config.FEATURE_SETS)

Size of the training data:  2701 	Constructive ( 1265 ) 	Non constructive ( 1436 )
Cross validation folds:  10
Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['text_feats', 'length_feats', 'argumentation_feats', 'COMMENTIQ_feats', 'named_entity_feats', 'constructiveness_chars_feats', 'non_constructiveness_chars_feats', 'toxicity_chars_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']


{'mean_score': 0.7976293254131059,
 'scores': array([0.808     , 0.8       , 0.76190476, 0.76724138, 0.77821012,
        0.808     , 0.78294574, 0.77235772, 0.85057471, 0.84705882]),
 'variance': 0.0008898250210476555}

# Error analysis 

# Get wrong predictions and write them in a CSV for error analysis

In [30]:
predicted, X_train, y_train = wrong_predictions_from_cross_validation(SOCC_df, Config.FEATURE_SETS)

Classifier:  SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=0.0001, verbose=0, warm_start=False)
Feature set:  ['text_feats', 'length_feats', 'argumentation_feats', 'COMMENTIQ_feats', 'named_entity_feats', 'constructiveness_chars_feats', 'non_constructiveness_chars_feats', 'toxicity_chars_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']


In [31]:
se = pd.Series(predicted)

In [32]:
X_train['Connstructive_gold'] = y_train

In [33]:
X_train['Constructive_predicted'] = se.values

In [34]:
X_train = X_train.sort_values(['Connstructive_gold', 'Constructive_predicted'], ascending=[1, 1])

In [35]:
X_train_wrong = X_train[X_train['Connstructive_gold'] != X_train['Constructive_predicted']]

In [36]:
X_train_wrong = X_train_wrong.sort_values(['Connstructive_gold', 'Constructive_predicted'], ascending=[1, 1])

In [39]:
cols = ['pp_comment_text', 'Connstructive_gold', 'Constructive_predicted', 'source', 'crowd_toxicity_level'] + Config.ALL_FEATURES 

In [40]:
X_train_wrong.to_csv(Config.RESULTS_PATH + 'SVM_wrong_predictions.csv', columns = cols, index = False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  self.obj = self.obj.loc[:, cols]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [41]:
X_train.to_csv(Config.RESULTS_PATH + 'SVM_predictions.csv', columns = cols, index = False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  self.obj = self.obj.loc[:, cols]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
