## Notebook: analyze_text.ipynb

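This notebook derives word-count features (`num_unique_words`, `total_words`, `unique_words_per_word`) from each user's recent tweets, then z-score standardizes the non-binary classifier columns at the end.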

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that the stop-word cell below reads.
nltk.download('stopwords')

df = pd.read_json('../datasets/MIB/mib_processed.json')

# Columns handed to standardize() in the final cell.
classifier_columns = [
    'identification',
    'geo_enabled',
    'default_profile',
    'default_profile_image',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
    'retweet_post_percent',
    'reply_post_percent',
    'avg_hashtags',
    'avg_urls',
    'avg_mentions',
    'avg_retweets_cnt',
    'avg_reply_cnt',
    'unique_words_per_word',
]

In [75]:
# Word-statistic columns that add_columns() creates on df when they are absent.
new_columns = [
    'num_unique_words',
    'total_words',
    'unique_words_per_word',
]

def add_columns(df, columns):
    """Add each name in ``columns`` to ``df`` as a ``None``-filled column.

    Names that already exist in ``df`` are left untouched. ``df`` is modified
    in place and nothing is returned.
    """
    for name in columns:
        if name in df.columns:
            # Never overwrite data that is already present.
            continue
        df[name] = None

def standardize(df, final_columns):
    """Return a z-score standardized copy of ``df`` restricted to ``final_columns``.

    A column is standardized when it is not ``'recent_tweet_text'`` and has
    more than two distinct non-null values; columns with two or fewer distinct
    values are treated as boolean flags and passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    final_columns : list
        Column labels to keep, in output order. A label missing from ``df``
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns, where each standardized column
        is ``(x - mean) / std`` (pandas' default sample standard deviation).
    """
    # Take an explicit copy: assigning into a bare column selection triggers
    # SettingWithCopyWarning and leaves ownership of the data ambiguous.
    selected = df[final_columns].copy()

    # find all columns that are not boolean values
    numeric_cols = [
        col for col in selected.columns
        if col != 'recent_tweet_text' and selected[col].dropna().nunique() > 2
    ]

    if numeric_cols:
        values = selected[numeric_cols]
        selected[numeric_cols] = (values - values.mean()) / values.std()
    return selected

# Create the word-statistic columns on df (in place) before they are filled in.
add_columns(df=df, columns=new_columns)

In [76]:
# Tokens are the matches of \w+, i.e. runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Stop words for both languages in the data: some tweets are in italian.
sw = set(stopwords.words('english')) | set(stopwords.words('italian'))

In [78]:
# Token lists of every non-empty tweet, in row order. Initialised here so the
# cell runs on a fresh kernel (the name was never defined anywhere in this
# notebook) and so that re-running the cell does not append duplicates.
tokens = []

# Per-row results are collected in plain lists and assigned as whole columns
# afterwards. This yields integer columns instead of the object-dtype columns
# produced by cell-by-cell df.at writes into None-filled columns.
unique_word_counts = []
total_word_counts = []

for tweets in df['recent_tweet_text']:
    # A missing tweet list (None / float NaN) is treated as "no tweets".
    if tweets is None or isinstance(tweets, float):
        tweets = []

    total_words = 0
    words = set()
    for tweet in tweets:
        if tweet:
            tkn = tokenizer.tokenize(tweet)
            # total_words counts every token, stop words included ...
            total_words += len(tkn)
            # ... while the unique-word set leaves stop words out.
            # NOTE(review): the stop-word test is case-sensitive, so a
            # capitalised stop word is still counted - confirm this is intended.
            words.update(word for word in tkn if word not in sw)
            tokens.append(tkn)

    unique_word_counts.append(len(words))
    total_word_counts.append(total_words)

df['num_unique_words'] = unique_word_counts
df['total_words'] = total_word_counts

# Rows with zero tokens get NaN for the ratio instead of a division by zero.
df['unique_words_per_word'] = (
    df['num_unique_words'] / df['total_words'].where(df['total_words'] > 0)
)

In [80]:
# Keep only the classifier columns and z-score the non-binary ones.
standardized_df = standardize(df=df, final_columns=classifier_columns)