In [10]:
import csv
import pandas as pd
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords', 'punkt', 'wordnet')
import re 
from textblob import TextBlob
from collections import Counter, defaultdict
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat.textstat import textstatistics
import textstat
from ANEW_util import analyze_line
import spacy

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/hzhu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hzhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hzhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hzhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hzhu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [11]:
def open_file(path1):
    out = []
    with open(path1,"r", encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        
        for line in reader:
            out.append(line)
    df = pd.DataFrame(out, columns=["time", "text"])
    return df

In [34]:
path1 = './emma/name_text/POTUS_President Biden_1349149096909668363.csv'
df = open_file(path1)
sid = SentimentIntensityAnalyzer()

## Text clean

In [13]:
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())


In [35]:
df['text'] = df['text'].apply(clean_tweet)

## Sentiment Analysis 

In [15]:
def analize_sentiment_textblob(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob. 
    '''
    analysis = TextBlob(clean_tweet(tweet))
    '''
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1
    '''
    return float(analysis.sentiment.polarity)


def vader_index(tweets):
    sub_dict = defaultdict(list)
    keys = ['neg', 'neu', 'pos', 'compound']
    for text in tweets.values:
        score = sid.polarity_scores(text)
        for key in keys:
            sub_dict[key].append(score[key]) 
        if score['compound'] >= 0.05:
            cl = 1
        elif 0.05<score['compound']<0.05:
            cl = 0
        else:
            cl = -1
        sub_dict["sentiment_class_vader"].append(cl)
    return pd.DataFrame(sub_dict)

In [36]:
df = pd.concat([df, vader_index(df['text'])], axis=1)

## Readability 
(from https://www.geeksforgeeks.org/readability-index-pythonnlp/)
### The Dale Chall Formula:
Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365  
Here,  
PDW = Percentage of difficult words not on the Dale–Chall word list.
ASL = Average sentence length
### The Gunning fog Formula
Grade level= 0.4 * ( (average sentence length) + (percentage of Hard Words) )  
Here, Hard Words = words with more than two syllables.
### Smog Formula
SMOG grading = 3 + √(polysyllable count).  
Here, polysyllable count = number of words of more than two syllables in a   
sample of 30 sentences.
### Flesch Formula
Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)  
Here,  
ASL = average sentence length (number of words divided by number of sentences)  
ASW = average word length in syllables (number of syllables divided by number of words)  


In [None]:
# Splits the text into sentences, using
# Spacy's sentence segmentation which can

def break_sentences(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	return list(doc.sents)

# Returns Number of Words in the text
def word_count(text):
	sentences = break_sentences(text)
	words = 0
	for sentence in sentences:
		words += len([token for token in sentence])
	return words

# Returns the number of sentences in the text
def sentence_count(text):
	sentences = break_sentences(text)
	return len(sentences)

# Returns average sentence length
def avg_sentence_length(text):
	words = word_count(text)
	sentences = sentence_count(text)
	average_sentence_length = float(words / sentences)
	return average_sentence_length

# Textstat is a python package, to calculate statistics from
# text to determine readability,
# complexity and grade level of a particular corpus.

def syllables_count(word):
	return textstatistics().syllable_count(word)

# Returns the average number of syllables per
# word in the text
def avg_syllables_per_word(text):
	syllable = syllables_count(text)
	words = word_count(text)
	ASPW = float(syllable) / float(words)
	return legacy_round(ASPW, 1)

# Return total Difficult Words in a text
def difficult_words(text):
	
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	# Find all words in the text
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [str(token) for token in sentence]

	# difficult words are those with syllables >= 2
	# easy_word_set is provide by Textstat as
	# a list of common words
	diff_words_set = set()
	
	for word in words:
		syllable_count = syllables_count(word)
		if word not in nlp.Defaults.stop_words and syllable_count >= 2:
			diff_words_set.add(word)

	return len(diff_words_set)

# A word is polysyllablic if it has more than 3 syllables
# this functions returns the number of all such words
# present in the text
def poly_syllable_count(text):
	count = 0
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [token for token in sentence]
	

	for word in words:
		syllable_count = syllables_count(word)
		if syllable_count >= 3:
			count += 1
	return count


def flesch_reading_ease(text):
	"""
		Implements Flesch Formula:
		Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
		Here,
		ASL = average sentence length (number of words
				divided by number of sentences)
		ASW = average word length in syllables (number of syllables
				divided by number of words)
	"""
	FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
		float(84.6 * avg_syllables_per_word(text))
	return legacy_round(FRE, 2)


def gunning_fog(text):
	per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
	grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
	return grade


def smog_index(text):
	"""
		Implements SMOG Formula / Grading
		SMOG grading = 3 + ?polysyllable count.
		Here,
		polysyllable count = number of words of more
		than two syllables in a sample of 30 sentences.
	"""

	if sentence_count(text) >= 3:
		poly_syllab = poly_syllable_count(text)
		SMOG = (1.043 * (30*(poly_syllab / sentence_count(text)))**0.5) \
				+ 3.1291
		return legacy_round(SMOG, 1)
	else:
		return 0


def dale_chall_readability_score(text):
	"""
		Implements Dale Challe Formula:
		Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365
		Here,
			PDW = Percentage of difficult words.
			ASL = Average sentence length
	"""
	words = word_count(text)
	# Number of words not termed as difficult words
	count = word_count - difficult_words(text)
	if words > 0:

		# Percentage of words not on difficult word list

		per = float(count) / float(words) * 100
	
	# diff_words stores percentage of difficult words
	diff_words = 100 - per

	raw_score = (0.1579 * diff_words) + \
				(0.0496 * avg_sentence_length(text))
	
	# If Percentage of Difficult Words is greater than 5 %, then;
	# Adjusted Score = Raw Score + 3.6365,
	# otherwise Adjusted Score = Raw Score

	if diff_words > 5:	

		raw_score += 3.6365
		
	return legacy_round(raw_score, 2)


In [37]:
def read_info(df):
    df["flesch_reading"] = df["text"].apply(textstat.flesch_reading_ease)
    df["smog_index"] = df["text"].apply(textstat.smog_index)
    df["flesch_kincaid"] = df["text"].apply(textstat.flesch_kincaid_grade) # like others, can also get from textacy
    df["coleman_liau"] = df["text"].apply(textstat.coleman_liau_index)
    df["automated_readability_index"] = df["text"].apply(textstat.automated_readability_index)
    df["dale_chall_readability"] = df["text"].apply(textstat.dale_chall_readability_score)

    return df

In [38]:
df = read_info(df)

In [39]:
#small scale test
df = pd.concat([df, analyze_line(df['text'].values[:5], mode='mean')], axis=1)

In [40]:
df.head()

Unnamed: 0,time,text,neg,neu,pos,compound,sentiment_class_vader,flesch_reading,smog_index,flesch_kincaid,...,automated_readability_index,dale_chall_readability,Valence,Arousal,Dominance,Average VAD,Sentiment Label,# Words Found,Found Words,All Words
0,2022-12-31T23:41:20.000Z,In 2022 we took on some of our nation s greate...,0.0,0.667,0.333,0.6705,1,74.19,0.0,6.4,...,5.8,11.1,5.6925,4.825,5.2475,5.255,neutral,4 out of 5,"[take, nation, challenge, deliver]","[take, nation, greatest, challenge, deliver]"
1,2022-12-31T19:30:14.000Z,Jill and I join Catholics and others around th...,0.052,0.746,0.202,0.8176,1,37.65,0.0,20.4,...,24.7,12.12,5.534706,3.774118,5.247647,4.852157,neutral,17 out of 24,"[join, catholic, world, mourn, pope, pope, rem...","[jill, join, catholic, others, around, world, ..."
2,2022-12-31T17:42:18.000Z,Barbara Walters has always been an example of ...,0.0,0.758,0.242,0.9153,1,25.8,0.0,20.8,...,24.8,11.83,6.233571,4.408571,5.782857,5.475,positive,14 out of 22,"[example, bravery, truth, break, barrier, driv...","[barbara, walter, always, example, bravery, tr..."
3,2022-12-31T17:00:00.000Z,Just 12 hours until many of the cost saving pr...,0.0,1.0,0.0,0.0,-1,60.65,0.0,9.5,...,9.9,10.4,5.312222,4.362222,5.408889,5.027778,neutral,9 out of 10,"[hour, cost, save, provision, inflation, reduc...","[hour, many, cost, save, provision, inflation,..."
4,2022-12-30T22:45:00.000Z,Jill and I send our deepest and heartfelt cond...,0.134,0.776,0.09,-0.1717,-1,53.89,0.0,14.2,...,17.2,9.97,5.634167,4.213333,5.575,5.140833,neutral,12 out of 17,"[send, heartfelt, prime, minister, loss, mothe...","[jill, send, deepest, heartfelt, condolence, p..."


## Data Process Pipeline

In [5]:
import os
from tqdm import tqdm
import pandas as pd
import csv

In [8]:
data_folder = '/ivi/ilps/personal/hzhu/dataset/emma/name_text/'
out_path = '/ivi/ilps/personal/hzhu/dataset/emma/extracted_features'
candidators = ['Barack', 'Mitt', 'Hillary', 'Donald', 'Joe', 'POTUS']
result_set = []
for name in os.listdir(data_folder):
    sub_result = sum([name.startswith(item) for item in candidators])
    if sub_result>0:
        result_set.append(name)

def data_pipeline(name_list, main_path, save_path, mode):
    for name in tqdm(name_list):
        path = os.path.join(main_path, name)
        df = open_file(path)
        df['text'] = df['text'].apply(clean_tweet)
        df = pd.concat([df, vader_index(df['text'])], axis=1)
        df = read_info(df)
        df = pd.concat([df, analyze_line(df['text'].values[:5], mode=mode)], axis=1)

        out_path = os.path.join(save_path, name)
        df.to_csv(out_path, index=False)
data_pipeline(result_set, data_folder, out_path, mode='mean')

