In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (such as the project-local `_utils` imported below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Put the parent directory on the import path so project-local modules can be
# imported (presumably where `_utils` lives - confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
from ast import literal_eval
import copy
import os

import emoji
from IPython.display import display, HTML
import nltk
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import pandas as pd
from polyglot.text import Text

import _utils as utils

In [None]:
# Register tqdm's pandas integration; the `progress_apply` calls used during
# feature extraction below depend on this having been run.
tqdm.pandas()

In [None]:
# Name of the account group to process; it selects the input CSV and is used
# as the prefix of the exported feature file.
group = 'politicians'

In [None]:
# Directory that receives the exported feature table.
features_dir = '../processed_features/'

# Input CSV holding the tweets of the selected group.
sample_tweets_file = '../tweets/new_datasets/{group}.csv'.format(group=group)

In [None]:
# Load the tweets of the selected group. The pure-Python parser is requested
# explicitly - presumably to cope with rows the default C parser rejects
# (TODO confirm).
df = pd.read_csv(sample_tweets_file, engine='python')
# Disabled alternative for quick experiments: read with the default parser and
# work on a copy of the first 1000 rows only.
# raw_df = pd.read_csv(sample_tweets_file, low_memory=False)
# df = copy.deepcopy(raw_df.head(1000))

In [None]:
# Disabled debugging aid: uncomment to inspect the raw frame with wide display limits.
# (`None` lifts the column-width limit; requires pandas >= 1.0.)
# with pd.option_context('display.max_rows', 500, 'display.max_columns', 50, 'display.max_colwidth', None):
#     display(df)

In [None]:
# The feature extraction below calls len() on `tweet` and literal_eval() on the
# four list-like columns, so rows missing any of them are discarded up front.
required_columns = ['tweet', 'mentions', 'hashtags', 'urls', 'photos']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Project-local tweet cleaner; its `emotional_clean` method is applied to every
# tweet in the feature-extraction cell below.
cleaner = utils.TweetCleaner()

In [None]:
def _mean_polarity(tokens):
    """Return the mean polyglot word polarity of ``tokens``, or 0 when there is nothing to score."""
    if not tokens:
        return 0
    scored_words = Text(' '.join(tokens), hint_language_code='en').words
    polarities = [word.polarity for word in scored_words]
    # Guard against text that yields no words: np.mean([]) would emit a
    # RuntimeWarning and store NaN instead of the neutral 0 used above.
    return np.mean(polarities) if polarities else 0


def _literal_len(raw):
    """Return the length of a container stored as its Python literal in a CSV cell."""
    return len(literal_eval(raw))


# Every feature below is derived from the single column(s) it needs instead of
# a row-wise `apply(axis=1)` over the whole frame; progress bars are kept only
# for the per-tweet transforms that do real work.

# --- Intermediate token columns (dropped again before export) ---------------
# `emotional_clean` presumably returns a list of tokens with emojis kept as
# separate tokens - TODO confirm against utils.TweetCleaner.
df['temp_emotional_tweet'] = df['tweet'].progress_apply(cleaner.emotional_clean)
# NOTE(review): the membership tests assume `emoji.UNICODE_EMOJI` is keyed by
# the emoji characters themselves; confirm this holds for the installed
# `emoji` version.
df['temp_clean_tweet'] = df['temp_emotional_tweet'].progress_apply(
    lambda tokens: [token for token in tokens if token not in emoji.UNICODE_EMOJI])

# --- Plain metadata ----------------------------------------------------------
df['tp_author'] = df['username']
df['tp_date'] = df['date']

# --- Length features ---------------------------------------------------------
df['tp_tweet_len'] = df['tweet'].map(len)
df['tp_clean_tweet_len'] = df['temp_clean_tweet'].map(lambda tokens: len(' '.join(tokens)))

# --- Sentiment ---------------------------------------------------------------
df['tp_sentiment'] = df['temp_clean_tweet'].progress_apply(_mean_polarity)

# --- Counts of emojis / mentions / hashtags ---------------------------------
df['tp_emojis_num'] = df['temp_emotional_tweet'].map(
    lambda tokens: sum(1 for token in tokens if token in emoji.UNICODE_EMOJI))
df['tp_mentions_num'] = df['mentions'].map(_literal_len)
df['tp_hashtags_num'] = df['hashtags'].map(_literal_len)

# --- Attachment flags --------------------------------------------------------
df['tp_has_url'] = df['urls'].map(lambda raw: _literal_len(raw) > 0)
df['tp_has_image'] = df['photos'].map(lambda raw: _literal_len(raw) > 0)

df['tp_has_gif'] = df['gif_url'].notna()
df['tp_has_video'] = df['video_url'].notna()
df['tp_has_place'] = df['place'].notna()

# --- Engagement counters -----------------------------------------------------
df['tp_replies_count'] = df['replies_count']
df['tp_retweets_count'] = df['retweets_count']
df['tp_likes_count'] = df['likes_count']

# --- Tweet type flags --------------------------------------------------------
# Flagged as a retweet when the tweet's author differs from the scraped user.
df['tp_is_retweet'] = df['username'] != df['scraped_user']
# NOTE(review): bool() is True for NaN and for any non-empty string (including
# 'False', '0' or '[]'); confirm how `is_reply_to` / `is_quote_status` are
# encoded in the CSV before relying on these two flags.
df['tp_is_reply'] = df['is_reply_to'].map(bool)
df['tp_is_quote'] = df['is_quote_status'].map(bool)

df['tp_group'] = df['group']

In [None]:
# Everything that is not an engineered `tp_` feature (raw columns and the
# `temp_` intermediates) is scheduled for removal.
cols_to_rm = list(filter(lambda column: not column.startswith('tp_'), df.columns))

In [None]:
# New frame holding only the `tp_` feature columns; `df` itself is left untouched.
final_df = df.drop(columns=cols_to_rm)

In [None]:
# Show the engineered feature table with relaxed display limits.
# `None` lifts the column-width limit (requires pandas >= 1.0); the former
# value -1 has been deprecated since pandas 1.0 and is rejected by newer releases.
with pd.option_context('display.max_rows', 500,
                       'display.max_columns', 50,
                       'display.max_colwidth', None):
    display(final_df)

In [None]:
# Write the feature table to `<features_dir>/<group>_twitter_posts.csv`.
# The output directory is created first so that a missing folder does not make
# the export fail after the whole feature extraction has already run.
os.makedirs(features_dir, exist_ok=True)
filename = '{group}_twitter_posts.csv'.format(group=group)
output_path = os.path.join(features_dir, filename)
final_df.to_csv(output_path, sep=',', encoding='utf-8', index=False)