In [None]:
pip install lime

In [None]:
from IPython.display import Markdown, display
import pandas as pd
import numpy as np
import nltk 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame
from collections import OrderedDict
from sklearn.model_selection import StratifiedKFold 

import seaborn as sns
from collections import Counter

# The private `tqdm._tqdm_notebook` module is deprecated and warns on import;
# `tqdm.notebook` is the public location of the same class.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

import lime
import lime.lime_tabular
from lime.lime_text import LimeTextExplainer

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from pandas import Panel


Reference notebook (statistical tests from the original paper's repository): https://github.com/shresthaanu/ECIR21TextualCharacteristicsOfFakeNews/blob/main/Code/step4_statistical_test.ipynb

# Read data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive') #, force_remount = True

Mounted at /content/gdrive


In [None]:
# Read from the absolute Drive mount point so the load does not depend on the
# notebook's current working directory.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle("/content/gdrive/MyDrive/IR_Lab/Datasets/final_features_combined.pkl")

In [None]:
# Columns dropped before the title/body feature columns are collected.
non_numeric_cols = [
    'title', 'body', 'lang',
    'title_token', 'body_token',
    'title_processed', 'body_processed',
    'title_lemma', 'body_lemma',
    'label',
]
print("non-numeric cols",len(non_numeric_cols))
X = df.drop(columns=non_numeric_cols)

# Split the remaining columns into title and body groups by name prefix.
title_columns = {name for name in X.columns if name.startswith('title')}
print('title cols',len(title_columns))
body_columns = {name for name in X.columns if name.startswith('body')}
print('body cols',len(body_columns))

print("Total cols",len(df.columns))

non-numeric cols 10
title cols 241
body cols 243
Total cols 494


In [None]:
print(df.label.value_counts())

# Balance the classes by down-sampling both to the size of the smaller class
# (label 0 in the counts printed above). The size is computed rather than
# assumed, and the sampling is seeded so the balanced frame is reproducible.
RANDOM_STATE = 42
n_per_class = df.label.value_counts().min()
df_true = df[df.label == 1].sample(n_per_class, random_state=RANDOM_STATE)
df_fake = df[df.label == 0].sample(n_per_class, random_state=RANDOM_STATE)
# pd.concat keeps every column's dtype; np.concatenate on DataFrames goes through
# a single object array and turns all columns into object dtype.
df_final = pd.concat([df_true, df_fake], ignore_index=True)

print(df_final.label.value_counts())

1    33229
0    27907
Name: label, dtype: int64
1    27907
0    27907
Name: label, dtype: int64


## Stylistic features

-  Text statistics like word count,  words_per_sentence etc.
-  POS features

In [None]:
# POS-tag feature columns for the title, one row per tag family.
title_pos_features = [
    "title_''",
    'title_CC', 'title_CD', 'title_DT', 'title_EX', 'title_FW', 'title_IN',
    'title_JJ', 'title_JJR', 'title_JJS', 'title_MD',
    'title_NN', 'title_NNP', 'title_NNPS', 'title_NNS',
    'title_PDT', 'title_POS', 'title_PRP', 'title_PRP$',
    'title_RB', 'title_RBR', 'title_RBS', 'title_RP',
    'title_SYM', 'title_TO', 'title_UH',
    'title_VB', 'title_VBD', 'title_VBG', 'title_VBN', 'title_VBP', 'title_VBZ',
    'title_WDT', 'title_WP', 'title_WP$', 'title_WRB',
]

In [None]:
# Basic text-statistics columns for the title.
title_text_statistics = [
    'title_length',
    'title_sent_count',
    'title_word_count',
]

In [None]:
# Stylistic features = POS features plus text statistics (duplicates removed).
title_stylistic_features = list(set(title_pos_features) | set(title_text_statistics))

In [None]:
# POS-tag feature columns for the body, one row per tag family.
body_pos_features = [
    'body_$', "body_''",
    'body_CC', 'body_CD', 'body_DT', 'body_EX', 'body_FW', 'body_IN',
    'body_JJ', 'body_JJR', 'body_JJS', 'body_MD',
    'body_NN', 'body_NNP', 'body_NNPS', 'body_NNS',
    'body_PDT', 'body_POS', 'body_PRP', 'body_PRP$',
    'body_RB', 'body_RBR', 'body_RBS', 'body_RP',
    'body_SYM', 'body_TO', 'body_UH',
    'body_VB', 'body_VBD', 'body_VBG', 'body_VBN', 'body_VBP', 'body_VBZ',
    'body_WDT', 'body_WP', 'body_WP$', 'body_WRB',
    'body_``',
]

In [None]:
# Basic text-statistics columns for the body.
body_text_statistics = [
    'body_length',
    'body_sent_count',
    'body_word_count',
]

In [None]:
# Stylistic features = POS features plus text statistics (duplicates removed).
body_stylistic_features = list(set(body_pos_features) | set(body_text_statistics))

## Psychology features

- Sentiment analysis features from Vader, Textblob etc
- Emotion features from Empath

In [None]:
# Sentiment columns are listed explicitly; emotion columns are selected by
# their 'title_emo' name prefix.
title_sentiment_features = [
    'title_compound_sentiment',
    'title_polarity',
    'title_subjectivity',
]
title_emotion_features = list(filter(lambda name: name.startswith('title_emo'), title_columns))

In [None]:
# Psychology features = sentiment plus emotion features (duplicates removed).
title_psychology_features = list(set(title_sentiment_features) | set(title_emotion_features))

In [None]:
# Sentiment columns are listed explicitly; emotion columns are selected by
# their 'body_emo' name prefix.
body_sentiment_features = [
    'body_polarity',
    'body_subjectivity',
]
body_emotion_features = list(filter(lambda name: name.startswith('body_emo'), body_columns))

In [None]:
# Psychology features = sentiment plus emotion features (duplicates removed).
body_psychology_features = list(set(body_sentiment_features) | set(body_emotion_features))

## Complexity Features
- Common readability measures (Flesch reading ease, Flesch-Kincaid grade, Gunning fog)
- Type-token ratio (body only; not computed for titles)
- Average word length and average sentence length

In [None]:
# Complexity columns for the title: average lengths and readability scores.
title_complexity_features = [
    'title_avg_sent_len',
    'title_avg_word_len',
    'title_flesch_kincaidgrade',
    'title_flesch_readability',
    'title_gunning_fog',
]

In [None]:
# Complexity columns for the body: average lengths, readability scores and TTR.
body_complexity_features = [
    'body_avg_sent_len',
    'body_avg_word_len',
    'body_flesch_kincaidgrade',
    'body_flesch_readability',
    'body_gunning_fog',
    'body_TTR',
]

In [None]:
# Sanity check: the three title feature groups should together account for
# every title column. The expected total is computed from the groups instead
# of being hard-coded, so the comparison stays meaningful if the data changes.
title_group_sizes = [
    len(title_complexity_features),
    len(title_psychology_features),
    len(title_stylistic_features),
]
for group_size in title_group_sizes:
  print(group_size)
print(len(title_columns),'==',sum(title_group_sizes))

5
197
39
241 == 241


In [None]:
# Sanity check: the three body feature groups should together account for
# every body column. The expected total is computed from the groups instead
# of being hard-coded, so the comparison stays meaningful if the data changes.
body_group_sizes = [
    len(body_complexity_features),
    len(body_psychology_features),
    len(body_stylistic_features),
]
for group_size in body_group_sizes:
  print(group_size)
print(len(body_columns),'==',sum(body_group_sizes))

6
196
41
243 == 243


# Statistical tests

In [None]:
## statistical T-test
from scipy.stats import ttest_ind

def t_test(df, alpha=0.05):
  """Compare every feature between the two label groups with a two-sample t-test.

  Rows with ``label == 1`` are treated as real news and rows with
  ``label == 0`` as fake news. ``scipy.stats.ttest_ind`` is called with its
  default settings for each feature column.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``label`` column. Every other column except ``news_id``,
      ``news_title`` and ``news_text`` is tested as a feature.
  alpha : float, default 0.05
      Significance threshold; only features with ``Pvalue < alpha`` are kept.

  Returns
  -------
  pandas.DataFrame
      Columns ``feature``, ``statistics``, ``Pvalue`` and ``feature_differ``,
      sorted by ascending p-value. ``feature_differ`` is ``'Real > Fake'`` when
      the t statistic is positive and ``'Fake > Real'`` otherwise.
  """
  real_news = df[df.label == 1]
  fake_news = df[df.label == 0]

  print(real_news.shape, fake_news.shape)

  excluded = {'label', 'news_id', 'news_title', 'news_text'}
  selected_features = [col for col in df.columns if col not in excluded]

  rows = []
  for feature in selected_features:
      t_stat, p_value = ttest_ind(real_news[feature], fake_news[feature])
      rows.append([feature, t_stat, p_value])

  t_test_result = pd.DataFrame(rows, columns=['feature','statistics','Pvalue'])
  t_test_result = t_test_result.sort_values(by='Pvalue', ascending=True)

  # Copy the filtered rows so adding a column below writes to an independent
  # frame instead of a slice (this avoids pandas' SettingWithCopyWarning).
  statistical_sig_result = t_test_result[t_test_result['Pvalue'] < alpha].copy()

  # Vectorised labelling; unlike a row-wise apply it also works when no
  # feature is significant and the frame is empty.
  statistical_sig_result['feature_differ'] = (
      statistical_sig_result['statistics']
      .gt(0)
      .map({True: 'Real > Fake', False: 'Fake > Real'})
  )
  return statistical_sig_result

# Title

In [None]:
# Sort the column names so the column order passed to t_test (and with it the
# row index and tie order of the result) does not depend on set iteration order.
stat_title = t_test(df_final[sorted(title_columns) + ['label']])

(27907, 242) (27907, 242)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Significant title features returned by t_test, sorted by ascending p-value.
stat_title

Unnamed: 0,feature,statistics,Pvalue,feature_differ
163,title_emo_musical,-74.890700,0.000000,Fake > Real
28,title_emo_social_media,-70.824469,0.000000,Fake > Real
147,title_length,-71.632562,0.000000,Fake > Real
105,title_avg_sent_len,-68.911895,0.000000,Fake > Real
139,title_emo_messaging,-72.126693,0.000000,Fake > Real
...,...,...,...,...
206,title_emo_stealing,-2.216519,0.026660,Fake > Real
125,title_emo_farming,2.100470,0.035692,Real > Fake
51,title_emo_blue_collar_job,-2.062186,0.039195,Fake > Real
140,title_FW,2.003095,0.045172,Real > Fake


# Body

In [None]:
# Sort the column names so the column order passed to t_test (and with it the
# row index and tie order of the result) does not depend on set iteration order.
stat_body = t_test(df_final[sorted(body_columns) + ['label']])

(27907, 244) (27907, 244)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Significant body features returned by t_test, sorted by ascending p-value.
stat_body

Unnamed: 0,feature,statistics,Pvalue,feature_differ
49,body_emo_negative_emotion,-48.391989,0.000000,Fake > Real
162,body_emo_messaging,-42.595888,0.000000,Fake > Real
231,body_emo_real_estate,40.233389,0.000000,Real > Fake
177,body_emo_swearing_terms,-39.249110,0.000000,Fake > Real
214,body_emo_government,71.078243,0.000000,Real > Fake
...,...,...,...,...
110,body_emo_fun,-2.353325,0.018610,Fake > Real
153,body_emo_crime,2.320045,0.020342,Real > Fake
51,body_TO,-2.308327,0.020985,Fake > Real
139,body_emo_eating,-2.123810,0.033690,Fake > Real


In [None]:
# Feature names used to filter the t-test results: first the names taken from
# the original paper, then this notebook's own title/body feature names.
features_used_in_paper = ['Analytic','insight','cause','discrep','tentat','certain','differ','affiliation','power','reward','risk','work','leisure',
                            'money','relig','Tone','affect','WC','WPS','num_nouns','num_propernouns','num_personalnouns','num_ppssessivenouns',
                            'num_whpronoun','num_determinants','num_whdeterminants','num_cnum','num_adverb','num_interjections','num_verb','num_adj',
                            'num_vbd','num_vbg','num_vbn','num_vbp','num_vbz','focuspast','focusfuture','i','we','you','shehe','quant','compare','Exclam',
                            'negate','swear','netspeak','interrog','count_uppercased','percentage_stopwords','AllPunc','Quote', 'lexical_diversity','wlen',
                            'gunning_fog_index','smog_index','flesch_kincaid_grade_level',
                            'Anger','Anticipation','Disgust','Fear','Joy', 'Sadness', 'Surprise', 'Trust','neg','pos','posemo','negemo','anx',
                            # Our feature names
                            'title_length', 'title_sent_count', 'title_word_count','body_length', 'body_sent_count', 'body_word_count',
                            'title_avg_sent_len','title_avg_word_len','title_flesch_kincaidgrade','title_flesch_readability','title_gunning_fog',
                            'body_avg_sent_len', 'body_avg_word_len', 'body_flesch_kincaidgrade', 'body_flesch_readability', 'body_gunning_fog','body_TTR',
                            'title_compound_sentiment', 'title_polarity', 'title_subjectivity','body_polarity', 'body_subjectivity',
                            ]

def num_sig(df, features=None):
  """Return the rows of ``df`` whose ``feature`` value is one of ``features``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a ``feature`` column, such as the result of ``t_test``.
  features : collection of str, optional
      Feature names to keep. When omitted, the module-level
      ``features_used_in_paper`` is looked up at call time, which is the
      behaviour the one-argument calls in this notebook rely on.

  Returns
  -------
  pandas.DataFrame
      The matching rows of ``df``, in their original order.
  """
  if features is None:
    features = features_used_in_paper
  return df[df.feature.isin(features)]

In [None]:

# Restrict the significant body and title results to the listed feature names.
df_body, df_title = (num_sig(result) for result in (stat_body, stat_title))

# complexity, statistics, sentiment

In [None]:
# Significant title features restricted to the names in features_used_in_paper.
df_title

Unnamed: 0,feature,statistics,Pvalue,feature_differ
147,title_length,-71.632562,0.0,Fake > Real
105,title_avg_sent_len,-68.911895,0.0,Fake > Real
89,title_word_count,-76.477412,0.0,Fake > Real
88,title_subjectivity,-37.372058,6.07963e-302,Fake > Real
221,title_flesch_kincaidgrade,-19.301883,9.649231e-83,Fake > Real
99,title_compound_sentiment,18.807574,1.14754e-78,Real > Fake
18,title_polarity,13.406088,6.445475999999999e-41,Real > Fake
37,title_gunning_fog,-6.757271,1.419913e-11,Fake > Real
29,title_avg_word_len,3.066877,0.002164111,Real > Fake


In [None]:
# Significant body features restricted to the names in features_used_in_paper.
df_body

Unnamed: 0,feature,statistics,Pvalue,feature_differ
12,body_subjectivity,-72.652087,0.0,Fake > Real
77,body_avg_word_len,56.712293,0.0,Real > Fake
203,body_TTR,-13.798719,3.055684e-43,Fake > Real
98,body_word_count,10.66929,1.5031469999999998e-26,Real > Fake
238,body_length,10.533655,6.392612e-26,Real > Fake
144,body_flesch_readability,-9.780161,1.428379e-22,Fake > Real
178,body_flesch_kincaidgrade,8.934099,4.226816e-19,Real > Fake
151,body_avg_sent_len,8.920227,4.790445e-19,Real > Fake
138,body_gunning_fog,8.781992,1.6505e-18,Real > Fake
191,body_sent_count,5.307482,1.115735e-07,Real > Fake


# POS

Features used in original paper: 

https://github.com/shresthaanu/ECIR21TextualCharacteristicsOfFakeNews/blob/main/Code/step4_statistical_test.ipynb

In [None]:
# POS feature names as they appear in the original paper's feature list.
og_pos = [
    'num_nouns', 'num_propernouns', 'num_personalnouns', 'num_ppssessivenouns',
    'num_whpronoun', 'num_determinants', 'num_whdeterminants',
    'num_cnum', 'num_adverb', 'num_interjections',
    'num_verb', 'num_adj',
    'num_vbd', 'num_vbg', 'num_vbn', 'num_vbp', 'num_vbz',
]

Corresponding NLTK (Penn Treebank) POS tag names:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
# NLTK POS tags chosen to correspond to the original paper's POS features.
# NOTE(review): the paper's list has 'num_cnum' (presumably cardinal numbers, tag CD),
# but CD is not listed here while NNS is; confirm the mapping is intentional.
corr_nltk_pos = [
    'NN', 'NNS', 'NNP',
    'PRP', 'PRP$', 'WP',
    'DT', 'WDT',
    'RB', 'UH', 'JJ',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
]

In [None]:
# Prefix each selected tag with "title_" and keep the names that are title POS features.
title_corr_nltk_pos = [
    column
    for column in ("title_"+tag for tag in corr_nltk_pos)
    if column in title_pos_features
]
# The two counts match when every selected tag has a title column.
print(len(title_corr_nltk_pos), len(corr_nltk_pos))
title_corr_nltk_pos

17 17


['title_NN',
 'title_NNS',
 'title_NNP',
 'title_PRP',
 'title_PRP$',
 'title_WP',
 'title_DT',
 'title_WDT',
 'title_RB',
 'title_UH',
 'title_JJ',
 'title_VB',
 'title_VBD',
 'title_VBG',
 'title_VBN',
 'title_VBP',
 'title_VBZ']

In [None]:
# Prefix each selected tag with "body_" and keep the names that are body POS features.
body_corr_nltk_pos = [
    column
    for column in ("body_"+tag for tag in corr_nltk_pos)
    if column in body_pos_features
]
# The two counts match when every selected tag has a body column.
print(len(body_corr_nltk_pos), len(corr_nltk_pos))
body_corr_nltk_pos

17 17


['body_NN',
 'body_NNS',
 'body_NNP',
 'body_PRP',
 'body_PRP$',
 'body_WP',
 'body_DT',
 'body_WDT',
 'body_RB',
 'body_UH',
 'body_JJ',
 'body_VB',
 'body_VBD',
 'body_VBG',
 'body_VBN',
 'body_VBP',
 'body_VBZ']

In [None]:
# Title and body POS columns selected above.
# NOTE: this rebinds the features_used_in_paper name assigned earlier in the
# notebook, so one-argument num_sig calls made after this cell filter on POS columns.
features_used_in_paper = set(body_corr_nltk_pos) | set(title_corr_nltk_pos)

def num_sig(df, features=None):
  """Return the rows of ``df`` whose ``feature`` value is one of ``features``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a ``feature`` column, such as the result of ``t_test``.
  features : collection of str, optional
      Feature names to keep. When omitted, the module-level
      ``features_used_in_paper`` is looked up at call time.

  Returns
  -------
  pandas.DataFrame
      The matching rows of ``df``, in their original order.
  """
  if features is None:
    features = features_used_in_paper
  return df[df.feature.isin(features)]

In [None]:

# Restrict the significant body and title results to the selected POS columns.
df_body_pos, df_title_pos = (num_sig(result) for result in (stat_body, stat_title))

In [None]:
# Significant body features restricted to the selected POS columns.
df_body_pos

Unnamed: 0,feature,statistics,Pvalue,feature_differ
235,body_VBD,46.733568,0.0,Real > Fake
159,body_PRP,-26.681251,7.424770000000001e-156,Fake > Real
43,body_NNS,22.832839,7.291385999999999e-115,Real > Fake
213,body_VBZ,-18.112236,4.136549e-73,Fake > Real
127,body_DT,-16.354982,5.53016e-60,Fake > Real
167,body_RB,-14.685411,9.860204e-49,Fake > Real
123,body_NN,13.683157,1.49975e-42,Real > Fake
22,body_NNP,12.862985,8.230092999999999e-38,Real > Fake
85,body_VBN,11.695357,1.4654650000000001e-31,Real > Fake
108,body_WDT,-11.324311,1.071719e-29,Fake > Real


In [None]:
# Significant title features restricted to the selected POS columns.
df_title_pos

Unnamed: 0,feature,statistics,Pvalue,feature_differ
181,title_VBG,-37.23894,7.756053999999999e-300,Fake > Real
196,title_VBD,-33.400883,3.29537e-242,Fake > Real
232,title_RB,-28.761816,1.351181e-180,Fake > Real
148,title_PRP,-24.615283,4.459437e-133,Fake > Real
87,title_NN,-22.747591,5.016353e-114,Fake > Real
84,title_VBN,-18.239686,4.106751e-74,Fake > Real
193,title_NNS,11.325051,1.062719e-29,Real > Fake
223,title_DT,-9.670649,4.1820550000000004e-22,Fake > Real
119,title_VBZ,6.05056,1.452586e-09,Real > Fake
77,title_VBP,-5.382373,7.380594e-08,Fake > Real


Background on using t-tests with large, non-normal samples:

https://stats.stackexchange.com/questions/9573/t-test-for-non-normal-when-n50