## Notebook: analyze_text.ipynb

This notebook extracts features from user tweets and standardizes the data at the end.

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer, RegexpTokenizer

nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

df = pd.read_json('../datasets/MIB/mib_processed.json')
classifier_columns = ['identification', 'geo_enabled','default_profile','default_profile_image','followers_count','friends_count','favourites_count','listed_count','retweet_post_percent','reply_post_percent','avg_hashtags','avg_urls','avg_mentions','avg_retweets_cnt','avg_reply_cnt', 'unique_words_per_word', 'avg_neg_sentiment', 'avg_neu_sentiment', 'avg_pos_sentiment']
tweets_columns = ['identification', 'tweet']
tweets_df = pd.DataFrame(columns=tweets_columns)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spdev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\spdev\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# out of new_columns, only unique_words_per_word, avg_neg_sentiment, and avg_pos_sentiment are used in classifiers
new_columns = ['num_unique_words', 'total_words', 'unique_words_per_word',
               'avg_neg_sentiment', 'avg_neu_sentiment', 'avg_pos_sentiment']

# add new columns to the dataframe if not already present
def add_columns(df, columns):
    for col in columns:
        if col not in df.columns:
            df[col] = None

# standardize all numeric values in the dataframe
def standardize(df, final_columns):
    df = df[final_columns]
    
    # find all columns that are not boolean values
    numeric_cols = [col for col in df if (col != 'recent_tweet_text' and len(df[col].dropna().unique()) > 2)]
    temp_df = df[numeric_cols]
    df[numeric_cols] = (temp_df - temp_df.mean())/temp_df.std()
    return df.fillna(0)

add_columns(df, new_columns)

In [4]:
tokenizer = RegexpTokenizer(r'\w+')
sw = set(stopwords.words('english'))

# some tweets are in italian
sw.update(set(stopwords.words('italian')))

sentiment_analyzer = SentimentIntensityAnalyzer()

In [20]:
tweets_list = []
identification_list = []

for index, row in df.iterrows():    
    tweets = row['recent_tweet_text']
    total_words = 0
    words = set()
    neg_sentiment = neu_sentiment = pos_sentiment = 0
    
    for tweet in tweets:
        if tweet:
            # add tweet to tweets dataset
            tweets_list.append(tweet)
            identification_list.append(row['identification'])
            
            # extract average sentiment across user's tweets
            polarity_scores = sentiment_analyzer.polarity_scores(tweet)
            neg_sentiment += polarity_scores['neg']
            neu_sentiment += polarity_scores['neu']
            pos_sentiment += polarity_scores['pos']
            
            # get a set of all unique words and count of total words across user's tweets
            tkn = tokenizer.tokenize(tweet)
            total_words += len(tkn)
            words.update({word for word in tkn if word not in sw})
    
    df.at[index, 'num_unique_words'] = len(words)
    df.at[index, 'total_words'] = total_words
    df.at[index, 'avg_neg_sentiment'] = neg_sentiment/len(tweets)
    df.at[index, 'avg_neu_sentiment'] = neu_sentiment/len(tweets)
    df.at[index, 'avg_pos_sentiment'] = pos_sentiment/len(tweets)

In [21]:
df['unique_words_per_word'] = df['num_unique_words']/df['total_words']
tweets_df = pd.DataFrame.from_dict({'identification': identification_list, 'tweet': tweets_list})

In [24]:
standardized_df = standardize(df, classifier_columns)
standardized_df.to_csv('../datasets/MIB/mib_processed_text_standardized.csv', index=False)

tweets_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,identification,tweet
0,bot,everyone has a dream http://t.co/B95BIM3b
1,bot,Snowing
2,bot,2012wish me well
3,bot,Happy new year 2012
4,bot,Happy new year 2012
...,...,...
253771,human,WHERES JAZLYN
253772,human,@YoongisSnapback its here lol
253773,human,@YoongisSnapback no I didn't..
253774,human,!!!! http://t.co/EA9ZAuIxQu
