In [1]:
import numpy as np

In [2]:
import pandas as pd
import numpy as np
import sys, os
import importlib
import textmining.cluster_analysis as cluster_analysis
import textmining.lexicons as lexicons
import textmining.machine_learning as machine_learning
import textmining.significance_testing as significance_testing
import iesta.loader as loader
import iesta.properties as prop  
import iesta.processor as proc  
import iesta.feature_extractor as fe

from collections import Counter

# Re-execute the project modules imported above so that edits made on disk since
# the kernel started are picked up without a restart.
for project_module in (
    cluster_analysis,
    lexicons,
    machine_learning,
    significance_testing,
    loader,
    prop,
    proc,
    fe,
):
    importlib.reload(project_module)


import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

# Start from seaborn's stock settings, then apply the notebook-wide default figure size.
sns.reset_defaults()
sns.set(rc={'figure.figsize': (6, 4)})  # optionally add style="white" for a nicer layout

In [None]:

# Processor from iesta.processor; used below to build the per-ideology argument frames.
processor = proc.Process()

In [None]:
# Quick look at the first rows of the flat voter/effect parquet file.
# The result is only displayed, not stored.
pd.read_parquet("../data/flat_voter_w_effect.parquet").head()

In [None]:

# Build one argument frame per ideology. Each call returns two values; the second
# (bound to *_not_found) is not used in the cells visible below.
liberal_arguments_df, liberal_not_found = processor.get_ideology_based_voter_participant_df(prop.LIBERAL_IDEOLOGY)
conservative_arguments_df, conservative_not_found = processor.get_ideology_based_voter_participant_df(prop.CONSERVATIVE_IDEOLOGY)

# Display the liberal frame as the cell output.
liberal_arguments_df

# Extract Features

In [None]:
# NOTE(review): this cell only displays the type of datetime.now(); no later cell
# in view uses `datetime`, so it looks like leftover scratch work -- confirm and remove.
from datetime import datetime
type(datetime.now())

### LIWC

In [None]:
# LIWC features, extracted once per ideology (selected via the prop.* ideology constant).
# These frames are later merged on their index with the NRC features.
liwc_liberal_df = fe.extract_liwc(prop.LIBERAL_IDEOLOGY)
liwc_conservative_df = fe.extract_liwc(prop.CONSERVATIVE_IDEOLOGY)

### NRC emotion

In [None]:
# NRC emotion features, extracted once per ideology.
# These frames are the left side of the feature merge in the cluster-analysis section.
nrc_liberal_df = fe.extract_nrc_emotion(prop.LIBERAL_IDEOLOGY)
nrc_conservative_df = fe.extract_nrc_emotion(prop.CONSERVATIVE_IDEOLOGY)

### MPQA Arg

In [None]:
# MPQA argument-lexicon features, extracted once per ideology.
# NOTE(review): these frames are not referenced by any later cell in view.
mpqa_arg_liberal_df = fe.extract_mpqa_arg(prop.LIBERAL_IDEOLOGY)
mpqa_arg_conservative_df = fe.extract_mpqa_arg(prop.CONSERVATIVE_IDEOLOGY)

### EMPATH

In [None]:
# Empath features, extracted once per ideology.
# NOTE(review): these frames are not referenced by any later cell in view.
empath_liberal_df = fe.extract_empath(prop.LIBERAL_IDEOLOGY)
empath_conservative_df = fe.extract_empath(prop.CONSERVATIVE_IDEOLOGY)

In [None]:
# Empath "ideology" features (a separate extractor from extract_empath), once per ideology.
empath_ideology_liberal_df = fe.extract_empath_ideology(prop.LIBERAL_IDEOLOGY)
empath_ideology_conservative_df = fe.extract_empath_ideology(prop.CONSERVATIVE_IDEOLOGY)

In [None]:
# Summary statistics of the conservative Empath-ideology features (display only).
empath_ideology_conservative_df.describe()

# Cluster Analysis

In [None]:
import imblearn

def undersample(df, strategy='.5'):
    """Randomly undersample ``df`` using its last column as the class label.

    Every column except the last is treated as a feature; the last column is
    the label handed to imblearn's ``RandomUnderSampler``.

    Args:
        df: frame whose final column holds the class label.
        strategy: currently ignored -- the ``sampling_strategy`` argument is
            commented out, so the sampler runs with its default strategy.

    Returns:
        The resampled feature columns with the resampled label appended as the
        last column. The per-class counts are printed as a side effect.
    """
    column_names = df.columns.tolist()
    feature_cols, label_col = column_names[:-1], column_names[-1]
    features = df.loc[:, feature_cols]
    labels = df.loc[:, label_col]

    # Fixed seed so the selected rows are reproducible between runs.
    sampler = imblearn.under_sampling.RandomUnderSampler(random_state=0)  # , sampling_strategy=strategy)
    features_under, labels_under = sampler.fit_resample(features, labels)
    print(Counter(labels_under))
    return pd.concat([features_under, labels_under], axis=1)

In [None]:
# Index values of the argument frames; used below to restrict the feature frames
# to rows that also exist in the corresponding argument frame.
cons_valid_indices = conservative_arguments_df.index.values.tolist()
lib_valid_indices = liberal_arguments_df.index.values.tolist()

In [None]:
# Join the NRC and LIWC features on their index, keep only rows whose index also
# appears in the ideology's argument frame, and drop the 'argument' column.
# Written as a single non-mutating chain: an in-place drop on a boolean-filtered
# slice can trigger pandas' SettingWithCopyWarning.
conservative_features_df = (
    nrc_conservative_df
    .merge(liwc_conservative_df, how='inner', left_index=True, right_index=True)
    .loc[lambda merged: merged.index.isin(cons_valid_indices)]
    .drop(columns=['argument'])
)
liberal_features_df = (
    nrc_liberal_df
    .merge(liwc_liberal_df, how='inner', left_index=True, right_index=True)
    .loc[lambda merged: merged.index.isin(lib_valid_indices)]
    .drop(columns=['argument'])
)

# Number of conservative rows that survived the join and filter (cell output).
len(conservative_features_df)


In [None]:
def prepare_data(df,  normalize=True, normalizing_method="sqrt"):  # only_numeric for clustering
    """Return a cleaned copy of ``df``: NaNs zero-filled, outliers clipped, optionally normalised.

    Args:
        df: feature frame; it is not modified.
        normalize: when true, pass the clipped frame through
            ``machine_learning.normalize``.
        normalizing_method: forwarded to ``machine_learning.normalize``.

    Returns:
        The processed frame.
    """
    # fillna without inplace already returns a new frame, so the input stays untouched.
    prepared = df.fillna(0.0)
    # No test frame is supplied: clipping (1st/99th percentile) and normalisation
    # are applied to this single frame only; the helpers' second return value is discarded.
    prepared, _ = machine_learning.clip_outliers(prepared, df_test=None, lower_percentile=1, upper_percentile=99)
    if not normalize:
        return prepared
    prepared, _ = machine_learning.normalize(prepared, None, normalizing_method=normalizing_method)
    return prepared

In [None]:
def run_cluster_analysis(df,desc = "default"):
    """Cluster the rows of ``df`` and test how the features differ between clusters.

    Pipeline: standard-normalise via ``prepare_data``; drop duplicated columns
    and columns whose 75th percentile is not above zero; choose ``k`` with
    ``cluster_analysis.elbow``; fit the 'cosine_kmeans' algorithm; run
    ``significance_testing.significance`` with 'cluster' as independent variable.

    Args:
        df: feature frame to cluster.
        desc: not used by this function at the moment.

    Returns:
        Tuple ``(significancy_result, predicted_df)``: the output of the
        significance test and the output of ``Analyzer.predict_labels``.
    """
    # --- Preparation -------------------------------------------------------
    features = prepare_data(df, normalize=True, normalizing_method="standard")

    print(features.index.name)
    print('filtering our features wich has 75% of entries without a values')
    features = features.loc[:, ~features.columns.duplicated()]
    # Keep only columns whose third quartile is strictly positive.
    third_quartile = features.describe().T['75%']
    kept_columns = third_quartile[third_quartile > 0].index.values
    features = features[kept_columns]
    print(len(features))

    # --- Cluster analysis --------------------------------------------------
    optimal_k = cluster_analysis.elbow(features, normalize=True, visualize=True, k_range=range(2, 10))
    print('optimal_k: ', optimal_k)
    analyzer = cluster_analysis.Analyzer(features)
    analyzer.set_algorithm(algorithm='cosine_kmeans', algorithm_args={'n_clusters': optimal_k})
    analyzer.evaluate_silhouette_score(metric="cosine")

    # --- Label prediction --------------------------------------------------
    # presumably apply=True stores the labels on analyzer.df as 'cluster' -- TODO confirm
    predicted_df = analyzer.predict_labels('numeric_id', apply=True)
    print('Plotting for:')
    print(analyzer.top_vars('numeric_id', top=4, plot=True))

    # --- Significance testing ----------------------------------------------
    # H0: a given feature (e.g. LIWC tone) does not differ significantly
    # across the clusters found for this one data set.
    significancy_result = significance_testing.significance(
        analyzer.df,
        independent_var='cluster',
    )

    return significancy_result, predicted_df

### Conservatives

In [None]:
# Cluster the conservative feature frame; keeps both the significance-test result
# and the frame returned by the label prediction.
conservative_significancy_result, conservative_predicted_df = run_cluster_analysis(conservative_features_df)

In [None]:
# Attach the 'effect' column of the argument frame to the cluster predictions
# (inner join on the index), so effect and cluster can be compared per row.
conservative_effect_cluster_df = conservative_predicted_df.merge(conservative_arguments_df[['effect']], 
                                how='inner', left_index= True, right_index=True)
# Disabled: bar chart of the cluster distribution within each effect class.
#pd.crosstab(conservative_effect_cluster_df['effect'], conservative_effect_cluster_df['cluster'], normalize='index').plot(kind='bar')




### Liberals

In [None]:
# Cluster the liberal feature frame; keeps both the significance-test result
# and the frame returned by the label prediction.
liberal_significancy_result, liberal_predicted_df = run_cluster_analysis(liberal_features_df)

In [None]:

# Attach the 'effect' column of the argument frame to the cluster predictions
# (inner join on the index), so effect and cluster can be compared per row.
liberal_effect_cluster_df = liberal_predicted_df.merge(liberal_arguments_df[['effect']], 
                                how='inner', left_index= True, right_index=True)
# Disabled: bar chart of the cluster distribution within each effect class.
#pd.crosstab(liberal_effect_cluster_df['effect'], liberal_effect_cluster_df['cluster'], normalize='index').plot(kind='bar')

# PCA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# PREPARE DATA FOR pca


#X_norm = (X - X.min())/(X.max() - X.min())
#X_standardized = StandardScaler().fit_transform(X)
def plot_pca(df, ideology):
    """Plot the first two principal components of ``df``'s features, coloured by effect.

    Two figures are drawn: a matplotlib scatter with one series per effect
    class, and a seaborn scatter of the same points with ``hue='effect'``.

    Args:
        df: frame whose last column is the label (excluded from the PCA input)
            and which contains an 'effect' column used for colouring. All
            other columns must be numeric, since they are standard-scaled.
        ideology: text used as the title of both figures.

    Returns:
        None; the figures are created on the current matplotlib backend.
    """
    pc_cols = ['principal component 1', 'principal component 2']
    fallback_colors = ['b', 'g', 'y', 'm']

    # Same convention as before: every column except the last one is a feature.
    feature_cols = df.columns.tolist()[:-1]
    scaled = StandardScaler().fit_transform(df.loc[:, feature_cols].values)
    components = PCA(n_components=2).fit_transform(scaled)

    # Attach 'effect' positionally. The PCA output has no index of its own, so an
    # index-aligned concat against df would mismatch rows and fill them with NaN.
    finalDf = pd.DataFrame(components, columns=pc_cols, index=df.index)
    finalDf = finalDf.assign(effect=df['effect'].values)
    targets = list(df['effect'].unique())

    # --- Figure 1: one matplotlib series per effect class ---
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(ideology, fontsize=20)
    for position, target in enumerate(targets):
        in_class = finalDf['effect'] == target
        ax.scatter(
            finalDf.loc[in_class, pc_cols[0]],
            finalDf.loc[in_class, pc_cols[1]],
            # Cycle the colours so more than four classes are still drawn.
            c=fallback_colors[position % len(fallback_colors)],
            s=50,
            label=str(target),
        )

    # Styling is applied once, after the loop. A bare ax.grid() toggles the grid,
    # so calling it per class made the final state depend on the class count.
    # White text: presumably chosen for a dark notebook theme -- TODO confirm.
    ax.xaxis.label.set_color('white')
    ax.yaxis.label.set_color('white')
    ax.title.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    if targets:
        ax.legend()
    ax.grid(True)

    # --- Figure 2: seaborn view of all points ---
    # Plots the full frame; the earlier version passed only the last class's rows.
    fig_sns, ax_sns = plt.subplots(figsize=(16, 10))
    sns.scatterplot(
        x=pc_cols[0], y=pc_cols[1],
        hue="effect",
        palette="hls",  # seaborn sizes the palette to the number of hue levels
        data=finalDf,
        legend="full",
        alpha=0.3,
        ax=ax_sns,
    )
    ax_sns.set_title(ideology)
    
    
    
    


In [None]:
# Standard-normalised features with the 'effect' label appended as the last column
# (inner join on the index), one frame per ideology.
conservative_features_w_effect_df = prepare_data(
    conservative_features_df, normalizing_method="standard"
).merge(
    conservative_arguments_df[['effect']],
    how='inner', left_index=True, right_index=True,
)

liberal_features_w_effect_df = prepare_data(
    liberal_features_df, normalizing_method="standard"
).merge(
    liberal_arguments_df[['effect']],
    how='inner', left_index=True, right_index=True,
)

# One pair of PCA plots per ideology.
plot_pca(conservative_features_w_effect_df, 'Conservative')
plot_pca(liberal_features_w_effect_df, 'Liberal')

In [None]:

# Example of random undersampling: prints the per-class counts after resampling
# and displays the resampled conservative frame.

undersample(conservative_features_w_effect_df)

# Significance between Effects


In [None]:
def calc_sign_effects(df, ideology):
    """Run the significance tests on ``df`` with 'effect' as the independent variable.

    Args:
        df: feature frame that includes an 'effect' column.
        ideology: label inserted into the ``desc`` path passed to the test
            helper (called with ``save=True``).

    Returns:
        Whatever ``significance_testing.significance`` returns.
    """
    desc_template = '../data/significant_test/{}_effect'
    result = significance_testing.significance(
        df,
        save=True,
        desc=desc_template.format(ideology),
        independent_var='effect',
    )
    return result


# Standard-normalised features with the 'effect' label appended (inner join on the
# index). Same construction as in the PCA section; the names are simply rebound.
conservative_features_w_effect_df = prepare_data(
    conservative_features_df, normalizing_method="standard"
).merge(
    conservative_arguments_df[['effect']],
    how='inner', left_index=True, right_index=True,
)

liberal_features_w_effect_df = prepare_data(
    liberal_features_df, normalizing_method="standard"
).merge(
    liberal_arguments_df[['effect']],
    how='inner', left_index=True, right_index=True,
)

In [None]:
# Significance tests between effect classes, run on the undersampled frames.
# The second argument ends up in the desc path that calc_sign_effects passes on.
conservative_significance_effect = calc_sign_effects(undersample(conservative_features_w_effect_df), 'conservative_undersampled')
liberal_significance_effect = calc_sign_effects(undersample(liberal_features_w_effect_df), 'liberal_undersampled')

In [None]:
# Row labels of the conservative result whose 'effective ineffective' value exceeds
# 0.1, largest first, joined into one comma-separated string.
# NOTE(review): the meaning of that column and the 0.1 cut-off are not documented here.
cons_discriminative_vals = ', '.join(
    conservative_significance_effect
    .loc[lambda result: result['effective ineffective'] > 0.1]
    .sort_values(by=['effective ineffective'], ascending=False)
    .index.values.tolist()
)
cons_discriminative_vals

#'liwc_netspeak, liwc_AllPunc, liwc_WC, 
#liwc_Dash, liwc_OtherP, nrc_anger, nrc_disgust, nrc_fear, nrc_negative, liwc_number'

In [None]:
# Row labels of the liberal result whose 'effective ineffective' value exceeds
# 0.09, largest first, joined into one comma-separated string.
# NOTE(review): this cut-off (0.09) differs from the conservative one (0.1) -- confirm intent.
lib_discriminative_vals = ', '.join(
    liberal_significance_effect
    .loc[lambda result: result['effective ineffective'] > 0.09]
    .sort_values(by=['effective ineffective'], ascending=False)
    .index.values.tolist()
)
lib_discriminative_vals

# 'liwc_AllPunc, liwc_shehe, liwc_OtherP'