In [81]:
import re
import string

import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud

from topic_utils import *

## Functions on strings

In [171]:
def preprocess_tweet(string):
    """Remove symbols; replace urls, hashtags, and user 
       mentions with a placeholder token.
    """
    # "rt" ("retweet") 
    string = re.sub('rt', '', string.lower())
    
    # @-mentions
    string = re.sub(r'@\w+', '<-@->', string)
    
    # hyperlinks
    string = re.sub(r'http\S+', '<-URL->', string)
    string = re.sub(r'www.[^ ]+', '<-URL->', string)
    
    # hashtags
    string = re.sub(r'#\w+', '<-#->', string)
    
    # digits
    string = re.sub(r'[0-9]+', '', string)
    
    # symbols
    string = re.sub(r'[!"$%&()*+,./:;=?[\]^_`{|}~]', '', string)
    
    return string

In [172]:
def tokenize_string(string):
    tokens = (TweetTokenizer(preserve_case=False, 
                            reduce_len=True, 
                            strip_handles=False)
              .tokenize(string))
    # remove symbol-only tokens
    tokens = [t for t in tokens if not t in string.punctuation]
    return tokens

## Functions on dataframes
For use with ```.pipe()``` and ```.apply()```

In [76]:
def preprocess_tweet(df):
    """Remove symbols; replace urls, hashtags, and user 
       mentions with a placeholder token.
    """
    # "rt" ("retweet") 
    df = df.str.lower().replace('rt', '')
    
    # @-mentions
    df = df.str.replace(r'@\w+', '<-@->')
    
    # hyperlinks
    df = df.str.replace('http\S+', '<-URL->')
    df = df.str.replace('www.[^ ]+', '<-URL->')
    
    # hashtags
    df = df.str.replace(r'#\w+', '<-#->')
    
    # digits
    df = df.str.replace(r'[0-9]+', '')
    
    # symbols
    df = df.str.replace(r'[!"$%&()*+,./:;=?[\]^_`{|}~]', '')
    
    return df

In [86]:
def tweet_tokenize(df):
    """
    Convert `in_string` of text to a list of tokens using NLTK's TweetTokenizer
    """
    tokenizer = TweetTokenizer(preserve_case=False,
                               reduce_len=True,
                               strip_handles=False)
    df = df.map(tokenizer.tokenize)
    return df

def tokenize(df):
    """Apply nltk tokenizing function to a dataframe,
       removing single-character tokens.
    """
    df = df.map(word_tokenize)
    df = df.apply(lambda x: [x for token in x if len(token)>2])
    return df

def remove_stopwords(df):
    """Remove stop words, based on nltk list."""
    
    cache = set(stopwords.words())
    df = df.apply(lambda x: [word for word in x 
                                    if word.lower() not in cache])
    return df

def lemmatize(df):
    """Lemmatize using nltk WordNet method."""
    lemmatizer = WordNetLemmatizer()
    
    df = df.apply(lambda x: [lemmatizer.lemmatize(word)
                                  for word in x
                                  if len(lemmatizer.lemmatize(w))>3])
    return df

def preprocess_words(df, tweet_tokenizer=True, lemmatize=False):
    """Apply word-level preprocessing."""
    
    if tweet_tokenizer:
        token_fn = tweet_tokenize
    else: token_fn = tokenize
        
    if lemmatize:
        df = df.pipe(token_fn).pipe(remove_stopwords).pipe(lemmatize)
        return df
    
    df = df.pipe(token_fn).pipe(remove_stopwords)
    return df

def preprocess_text(self, tweet_tokenize, lemmatize):
    """Pre-process text in a dataframe column.

    Usage:

        df['processed_text'] = preprocess_data(df['not_processed_text'])

    """
    return df.pipe(preprocess_string).pipe(preprocess_words)

In [24]:
def all_text(list_of_strings):
    """
    Concatenate a list of strings into a single string.
    """
    return ' '.join([string for string in list_of_strings])

def all_tokens(list_of_lists):
    """
    Concatenate items from multiple lists into a single list.
    """
    return list(itertools.chain(*list_of_lists))

In [238]:
def tokenize_iter(df_of_lists):
    """
    Usage: df.pipe(tokenize_df)
    """
    # Concatenate into a long string to be tokenized
    long_string = all_text(all_tokens(df_of_lists))
    preprocessed = preprocess_tweet(long_string)
    
    tokens = (TweetTokenizer(preserve_case=False, 
                            reduce_len=True, 
                            strip_handles=False)
              .tokenize(preprocessed))
    # remove symbol-only tokens
    tokens = [t for t in tokens if not t in string.punctuation]
    return tokens

In [239]:
def tokenize_list(list_of_strings):
    """
    Usage: df.map(tokenize_list)
    """
    long_string = all_text(list_of_strings)
    preprocessed = preprocess_tweet(long_string)
    
    tokens = (TweetTokenizer(preserve_case=False, 
                            reduce_len=True, 
                            strip_handles=False)
              .tokenize(preprocessed))
    # remove symbol-only tokens
    tokens = [t for t in tokens if not t in string.punctuation]
    return tokens

### testing

list of strings:

In [242]:
text_df = df.loc[:]['tweet_text'].head(50)
text_df.head(10)

tweetid
1331706590525874184    Правительство Франции заявляет, что не признае...
1100358276435398656    Interview w/ Ayaz Mutalibov, first President o...
1100389340914569216    Dana Mazalova, Czech Journalist, is the author...
724982683118358528     Baku Declaration of the 7th UNAOC Global Forum...
728142042765742080     FM Mammadyarov: #Azerbaijan is &amp; will rema...
727245477867974656     Azerbaijani army is and will defend its citize...
728581480545325057     Terter'de Ermenilerin sivilleri hedef alması s...
650550450320601088     newsazerbaijan.ru У берегов Испании спасены ок...
661914640331333632     newsazerbaijan.ru РФ подтвердила защиту взаимн...
535744142437933056     newsazerbaijan.ru В 2014 году в Азербайджане з...
Name: tweet_text, dtype: string

In [243]:
tokenized_text = tokenize_list(text_df)
tokenized_text

['правительство',
 'франции',
 'заявляет',
 'что',
 'не',
 'признает',
 'карабах',
 '<-url->',
 '<-#->',
 '<-#->',
 '<-#->',
 '<-url->',
 'interview',
 'w',
 'ayaz',
 'mutalibov',
 'first',
 'president',
 'of',
 '<-#->',
 '“',
 'the',
 'corridor',
 'by',
 'which',
 'people',
 'could',
 'escape',
 'had',
 'nonetheless',
 'been',
 'left',
 'by',
 'the',
 'armenians',
 'so',
 'why',
 'would',
 'they',
 'fire',
 '”',
 '<-#->',
 '<-#->',
 '<-url->',
 'dana',
 'mazalova',
 'czech',
 'journalist',
 'is',
 'the',
 'author',
 'of',
 'the',
 'famous',
 'interview',
 'with',
 'ayaz',
 'mutalibov',
 'published',
 'in',
 '“',
 'nezavisimaya',
 'gazeta',
 '”',
 'where',
 'mutalibov',
 'acknowledges',
 'the',
 'existence',
 'of',
 'the',
 'humanitarian',
 'corridor',
 'left',
 'by',
 'the',
 'armenian',
 'side',
 '<-#->',
 '<-#->',
 '<-url->',
 'baku',
 'declaration',
 'of',
 'the',
 'th',
 'unaoc',
 'global',
 'forum',
 '<-url->',
 '<-url->',
 'fm',
 'mammadyarov',
 '<-#->',
 'is',
 'amp',
 'will',


df/list of list of strings:

In [240]:
list_df = (df
           .groupby('userid')[['tweet_text']]
           .agg(lambda x: list(x))
          ).head(50)
list_df

Unnamed: 0_level_0,tweet_text
userid,Unnamed: 1_level_1
+JkWMulEtCyTrcFDRO2XLv9EOdGHDl0GB9cdZUWgtA=,[America and the way its government treats its...
+fwTi4Wv1fs5sua3wZXqtBWBMMAy5IKNd5euWlP8Kuk=,[@ManotoNews اگه ایران به یمن و مقاومت کمک نکن...
0+2DdcWQlF1LIe31q4foyvQMZYObIOeoh5woH5+4ySo=,[A group belonging to the HTS kidnapped Yasser...
07CPSEe0H6QwZcanuJd4G4sYBjmx+NtXpcj2NdAAmr0=,[#США #Япония #Россия #Китай #Корея #Дяоюйдао ...
0T+oJ4XBPG6ZbvgO0NQ+c+u6aQ5oDuzGtyT8lMLPEFM=,"[Вот уж не думал, что Рашка сможет, Самое ""уда..."
0gTQ2cDCHFpYXKO+G367F1HBrPLupiuPjXuvmp9UL+w=,[People are not silent! This structure must ch...
0hVjtURHlBEHZhn22rNDf98r+8VUXV3gi1bxvAhrZo=,[El pueblo indígena Kayapo cerró una important...
0zCl5U0pYu0gEmK3JtjO5fbnxEj6pO9GUgH52Q6yg0E=,[@VoteMarsha Why Women Prefer to Vote for an r...
1067814896706994176,[Inilah bentuk rasionalitas islam dan ketiadaa...
1091795789263921152,[اگه امشب مولانا توییتر میومد از همه مون راضی ...


In [241]:
list_df = list_df.loc[:]['tweet_text']
tokenized_iter = tokenize_iter(list_df)
tokenized_iter

['america',
 'and',
 'the',
 'way',
 'its',
 'government',
 'treats',
 'its',
 'people',
 'must',
 'be',
 'reformed',
 '<-url->',
 '<-url->',
 'poland',
 'police',
 'bureau',
 'is',
 'preparing',
 'for',
 'a',
 'variety',
 'of',
 'mass',
 'gathering',
 'events',
 'being',
 'planned',
 'for',
 'saturday',
 'september',
 'at',
 'this',
 'time',
 'one',
 'group',
 'has',
 'been',
 'announced',
 'they',
 'will',
 'hold',
 'an',
 'event',
 'at',
 'delta',
 'park',
 'at',
 'noon',
 '<-#->',
 '<-#->',
 '<-#->',
 '<-#->',
 '<-url->',
 'why',
 'lies',
 'why',
 'hypocrisy',
 'why',
 'security',
 'forces',
 'why',
 'racism',
 'why',
 'cut',
 'budgets',
 'to',
 'deceive',
 'people',
 'no',
 'trump',
 'no',
 'no',
 'no',
 '<-#->',
 '<-#->',
 '<-#->',
 '<-#->',
 '<-url->',
 'difference',
 '<-#->',
 '<-url->',
 'he',
 'has',
 'destroyed',
 'every',
 'busines',
 '🐂',
 '🐃',
 '🐃',
 '🐃',
 '<-#->',
 '<-url->',
 'even',
 'if',
 'the',
 'power',
 'to',
 'negotiate',
 'deals',
 'is',
 'handed',
 'over',
 'to

# Text processing

## Pipeline:
We are starting with a series of strings representing individual tweets. We can perform our analysis on
- individual tweets
- all tweets for a grouping of the dataframe (e.g. group by userid)
- all tweets in the corpus

In addition,
- hashtags and urls are already in a separate column.

In general, we want to convert tweet text into tokens for analysis. We will make utility functions which anticipate tokenizing for these levels of analysis.

In [8]:
users = UsersData('../data/users')
tweets = TweetsData('../data/tweets')

In [16]:
df = tweets.df.loc[:][:]

In [17]:
campaign = df['campaign'] == 'iran202012'
iran = df[campaign]

Aggregation for tweet text is defined as joining tweet strings into a list of strings:

In [257]:
user_tweets = (iran
               .groupby('userid')[['tweet_text']]
               .agg(lambda x: list(x))
              )

In [258]:
iter_of_lists = user_tweets
iter_of_lists

Unnamed: 0_level_0,tweet_text
userid,Unnamed: 1_level_1
+JkWMulEtCyTrcFDRO2XLv9EOdGHDl0GB9cdZUWgtA=,[America and the way its government treats its...
+fwTi4Wv1fs5sua3wZXqtBWBMMAy5IKNd5euWlP8Kuk=,[@ManotoNews اگه ایران به یمن و مقاومت کمک نکن...
0gTQ2cDCHFpYXKO+G367F1HBrPLupiuPjXuvmp9UL+w=,[People are not silent! This structure must ch...
0hVjtURHlBEHZhn22rNDf98r+8VUXV3gi1bxvAhrZo=,[El pueblo indígena Kayapo cerró una important...
0zCl5U0pYu0gEmK3JtjO5fbnxEj6pO9GUgH52Q6yg0E=,[@VoteMarsha Why Women Prefer to Vote for an r...
...,...
y3KkURpZFjT+WeW9e6BcxBYRg311F8fz1eJ647ahQc=,[Take a good look at #Trump and tweet me if yo...
z3nCVBEHiIbcBhhxU2mOz5iWK4a7sUdGmRSPFM16G0=,[Hapus zionisme hapus penjajahan.\n#Palestine\...
zFlH+vHUhiZD2qvvCLYyiU76qOha9+iYxCn1NVmzw=,[Karl Marx is the leader of These Do nothing D...
zTUtu8WZ3RwxnwgMsYXnTU107UXsn4MQU5wrg8IDOU=,[I am sick and tired of being lied to by #real...


In [262]:
print(iter_of_lists.iloc[0][0])

['America and the way its government treats its people must be reformed https://t.co/DupN3cE2dd', 'https://t.co/ee5KZJMm5R', 'Portland Police Bureau is preparing for a variety of mass gathering events being planned for Saturday, September 26, 2020. At this time, one group has been announced they will hold an event at Delta Park at noon.  #PoliceLivesMatter  #Polizeigewalt  #Portland  #PortlandProtests https://t.co/3JSoa0UcOL', 'Why lies ..... why hypocrisy .... why security forces .... why racism ..... why cut budgets .... to deceive people    No Trump       No No   No #ARMY  #COVIDー19  #Biden #Trump https://t.co/v1WYXyidzJ', 'Difference #Biden https://t.co/Vl5lB7q2l6', 'He has destroyed every busines......🐂🐃🐃🐃🐃🐃 #TrumpMeltdown https://t.co/tKrPEdujCZ', 'Even if the power to negotiate deals is handed over to the presidency, Congress will still have the final say, and judging by the current political climate, Johnson will not fire him this time #Trump  #Covid_19  #DemocracyDay https://t

In [266]:
all_tokens(iter_of_lists['tweet_text'])[.head()]

AttributeError: 'list' object has no attribute 'head'

In [None]:
print(preprocess_tweet(all_tokens(iter_of_lists)))

## Frequency

In [None]:
def word_frequency(iterable_of_lists):
    """
    
    Args:
        iterable_of_lists: e.g. a dataframe or list of lists [of strings]
    Return:
        dict of {'word':'frequency'} sorted by frequency (high to low)
    N.B.:
        list(itertools.chain(*iterable_of_lists_of_strings))
        # vs
        list(*itertools.chain.from_iterable(iterable_of_lists_of_strings))
    """
    
    all_words = list(*itertools.chain.from_iterable(iterable_of_lists))
    word_dict = dict()

    for word in all_words:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1

    # counts = collections.Counter(iterable_of_lists_of_strings)
    
    return sorted(word_dict.items(), key=lambda item: item[1], reverse=True)

In [256]:
tokenized_df = tokenize_iter(iter_of_lists)
tokenized_df

TypeError: tokenize_iter() takes 1 positional argument but 209 were given

In [None]:
freq_dict = word_frequency(tokenized_df)
freq_dict

## Vectorizing

Using sklearn's CountVectorizer to turn the corpus into a text-term matrix allows us to easily count tokens. CountVectorizer can count n-grams as well as tokens.

In [None]:
def ngram_freq_matrix(list_of_strings,
                      stop_words=None,
                      ngram_range=(1,2)):
    """
    Args: 
       list_of_strings: iterable of strings
       stop_words: a list of stop words or the string 'english' to use a
                   built-in English language stop word list.
                   Default: no stop words
       ngram_range: a single int, or a 2 tuple representing the range of ngrams to count.
                    Default: (1,2); counts 1- and 2- grams.
    Return:
       dataframe of counts indexed by n-gram
    """
    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=tweet_tokenizer,
                                 stop_words=stop_words,
                                 ngram_range=ngram_range
                                )
    
    ngram_freq_matrix = count_vectorizer.fit_transform(list_of_strings)
    ngrams = count_vectorizer.get_feature_names()

    return ngram_freq_matrix, ngrams

def count_freq_matrix(ngram_freq_matrix):
    """
    """
    ngram_frequencies = term_freq_matrix.sum(axis=0).tolist()[0]
    freq_dict = dict(zip(terms, term_frequencies))
    
    return (pd.DataFrame(freq_dict, 
                         index_column='ngram'
                         columns=['ngram', 'count'])
                        .sort_values("count", ascending=False))

In [210]:
all_words = all_text(all_tokens(list_df.head()))

In [212]:
all_words = preprocess_tweet(all_words)

In [219]:
all_words = (TweetTokenizer(preserve_case=False, 
                            reduce_len=True, 
                            strip_handles=False)
              .tokenize(all_words))

In [223]:
tagged = nltk.pos_tag(all_words)

In [227]:
words = tokenize_df(list_df)

In [228]:
words

['america',
 'and',
 'the',
 'way',
 'its',
 'government',
 'treats',
 'its',
 'people',
 'must',
 'be',
 'reformed',
 'https://t.co/DupN3cE2dd',
 'https://t.co/ee5KZJMm5R',
 'portland',
 'police',
 'bureau',
 'is',
 'preparing',
 'for',
 'a',
 'variety',
 'of',
 'mass',
 'gathering',
 'events',
 'being',
 'planned',
 'for',
 'saturday',
 'september',
 '26',
 '2020',
 'at',
 'this',
 'time',
 'one',
 'group',
 'has',
 'been',
 'announced',
 'they',
 'will',
 'hold',
 'an',
 'event',
 'at',
 'delta',
 'park',
 'at',
 'noon',
 '#policelivesmatter',
 '#polizeigewalt',
 '#portland',
 '#portlandprotests',
 'https://t.co/3JSoa0UcOL',
 'why',
 'lies',
 '...',
 'why',
 'hypocrisy',
 '...',
 'why',
 'security',
 'forces',
 '...',
 'why',
 'racism',
 '...',
 'why',
 'cut',
 'budgets',
 '...',
 'to',
 'deceive',
 'people',
 'no',
 'trump',
 'no',
 'no',
 'no',
 '#army',
 '#covidー19',
 '#biden',
 '#trump',
 'https://t.co/v1WYXyidzJ',
 'difference',
 '#biden',
 'https://t.co/Vl5lB7q2l6',
 'he',
 'h

In [None]:
words = words

In [224]:
tagged

[('america', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('way', 'NN'),
 ('its', 'PRP$'),
 ('government', 'NN'),
 ('treats', 'VBZ'),
 ('its', 'PRP$'),
 ('people', 'NNS'),
 ('must', 'MD'),
 ('be', 'VB'),
 ('reformed', 'VBN'),
 ('<-url->', 'JJ'),
 ('<-url->', 'JJ'),
 ('poland', 'NN'),
 ('police', 'NN'),
 ('bureau', 'NN'),
 ('is', 'VBZ'),
 ('preparing', 'VBG'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('variety', 'NN'),
 ('of', 'IN'),
 ('mass', 'NN'),
 ('gathering', 'NN'),
 ('events', 'NNS'),
 ('being', 'VBG'),
 ('planned', 'VBN'),
 ('for', 'IN'),
 ('saturday', 'JJ'),
 ('september', 'NN'),
 ('at', 'IN'),
 ('this', 'DT'),
 ('time', 'NN'),
 ('one', 'CD'),
 ('group', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('announced', 'VBN'),
 ('they', 'PRP'),
 ('will', 'MD'),
 ('hold', 'VB'),
 ('an', 'DT'),
 ('event', 'NN'),
 ('at', 'IN'),
 ('delta', 'NN'),
 ('park', 'NN'),
 ('at', 'IN'),
 ('noon', 'JJ'),
 ('<-#->', 'JJ'),
 ('<-#->', 'JJ'),
 ('<-#->', 'JJ'),
 ('<-#->', 'JJ'),
 ('<-url->', 'JJ'),
 ('why', 'WRB'),
 ('

In [215]:
part_of_speech = dict(tagged)

In [208]:
def part_of_speech(iterable_of_lists):
    """
    """
    all_words = tokenize_list(iterable_of_lists)
    tokens = remove_stopwords(all_words)
    tagged = nltk.pos_tag(tokens)
    part_of_speech = dict(tagged)
    
    return pd.DataFrame(part_of_speech, 
                        index=part_of_speech.keys(),
                        columns=['count','part_of_speech']
                        ).astype({'count':'int64',
                                  'part_of_speech':'category'})
pos = part_of_speech(list_df.head(5))

ValueError: cannot convert float NaN to integer

In [None]:
def visualize(words,
              limit=100,
              color=(150,50,50)):
    """
    """
    cloud = WordCloud(background_color="white",
                  prefer_horizontal=0.9,
                  max_font_size=40,
                  relative_scaling=.5,
                  color_func=lambda *args,**kwargs:color)
    cloud.generate_from_frequencies(dict(word_freq[:limit]))
    
    fig, ax = plt.subplots()
    ax.imshow(cloud)
    ax.axis('off')
    
    return fig

In [None]:
def vectorize(df):
    """Convert a list of tokens to a matrix of token counts."""
    
    vectorizer = CountVectorizer()
    frequency_matrix = vectorizer.fit_transform(df)
    
    # Sum all the frequencies for each word
    total_count = np.sum(frequency_matrix, axis=0)
    
    # Squeeze to remove single-dimensional entries
    frequency = np.squeeze(np.asarray(sum_frequencies))
    
    # Make a dataframe of the words and their frequencies
    frequency_df = pd.DataFrame([frequency], columns=vectorizer.get_feature_names()).transpose()
    return frequency_df

In [None]:
def plot(word_frequency):
    """
    """
    labels = word_frequency[0][1:51].index
    title = 'Word Frequency'
    
    
    plt.figure(figsize=(10,5))
    plt.bar(np.arange(50), word_frequency[0][1:51], width = 0.8, color = sns.color_palette("bwr"), alpha=0.5, 
            edgecolor = "black", capsize=8, linewidth=1);
    plt.xticks(np.arange(50), labels, rotation=90, size=14);
    plt.xlabel("Word");
    plt.ylabel("Frequency", size=14);
    plt.title(title, size=18)
    plt.grid(False);
    plt.gca().spines["top"].set_visible(False);
    plt.gca().spines["right"].set_visible(False);
    plt.show() 

## Keywords

In [205]:
iran.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 560571 entries, 1271764746983952390 to 948848104362663936
Data columns (total 35 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   userid                    560571 non-null  string        
 1   user_display_name         560571 non-null  string        
 2   user_screen_name          560571 non-null  string        
 3   user_reported_location    417523 non-null  string        
 4   user_profile_description  530518 non-null  string        
 5   user_profile_url          338535 non-null  string        
 6   follower_count            560571 non-null  int64         
 7   following_count           560571 non-null  int64         
 8   account_creation_date     560571 non-null  datetime64[ns]
 9   account_language          560571 non-null  string        
 10  tweet_language            444758 non-null  string        
 11  tweet_text                560571 no

In [None]:
tweet_df = (iran
            .loc[:][['tweetid','userid','tweet_text']]

In [None]:
def split(data):
    X_train, X_test, y_train, y_test = train_test_split(data.tweet, table.sentiment, test_size=0.2, shuffle=True)
    return X_train, X_test, y_train, y_test

In [None]:
def tokenize(data, features):
    """
    """
    tokenization = TfidfVectorizer(max_features=features)
    tokenization.fit(dataset)
    return tokenization.transform(data).toarray()