In [1]:
import datetime
import functools
import itertools
import os
from pathlib import Path
import re

import numpy as np
import pandas as pd

## Join campaign datasets

In [2]:
def combine_csvs(directory):
    """Join multiple CSVs into a single file."""

    df = pd.DataFrame()
    
    p = Path(directory)

    for file in p.iterdir():
        temp = pd.read_csv(file, 
                         index_col=0,
                         low_memory=False,)
        temp['file'] = str(file.stem)
        temp['campaign'] = (file.name.split('_')[-5]
                        + file.name.split('_')[-4])
        temp['release'] = file.name.split('_')[-4]
        temp['government'] = file.name.split('_')[-5]
        df = df.append(temp)

    return df

## Generate dataset objects

In [3]:
class UsersData(object):
    """Class for generating dataframe from csv dataset of users."""
    def __init__(self, data_path):
        self.df = combine_csvs(data_path)
        self.fields = {'user_display_name':'string',
                        'user_screen_name':'string',
                        'user_reported_location':'string',
                        'user_profile_description':'string',
                        'user_profile_url':'string',
                        'follower_count':'int64',
                        'following_count':'int64',
                        'account_creation_date':'datetime64',
                        'account_language':'string',
                        'campaign':'string',
                        'government':'string',
                        'file':'string'}
        self.df = self.df.astype(self.fields)
    
    def __str__(self):
        return 'Dataset: {}'.format(name)
    
    def to_torch(self):
        return #torch.data.DataLoader(self.df)
    
    def to_tf(self):
        return tf.data.Dataset.from_tensor_slices(dict(self.df))
        
    def to_np(self):
        return np.to_array(self.df)
        

In [4]:
users = UsersData('../data/users')
users.df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 374 entries, GotPdi7ND5U93CiLruCUjAPr0R5af8PkKEZQ9UJHNlE= to umi7TfGCQ73OtREt5v8BMjNBHbg96LnkEmH65RO8Ts=
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   user_display_name         374 non-null    string        
 1   user_screen_name          374 non-null    string        
 2   user_reported_location    162 non-null    string        
 3   user_profile_description  274 non-null    string        
 4   user_profile_url          44 non-null     string        
 5   follower_count            374 non-null    int64         
 6   following_count           374 non-null    int64         
 7   account_creation_date     374 non-null    datetime64[ns]
 8   account_language          374 non-null    string        
 9   file                      374 non-null    string        
 10  campaign                  374 non-null    string        
 11  releas

In [5]:
class TweetsData(object):
    """Class for generating dataframe from csv dataset of tweets."""
    
    def __init__(self, data_path):
        self.df = combine_csvs(data_path)
        self.fields = {'userid':'string',
                        'user_display_name':'string',
                        'user_screen_name':'string',
                        'user_reported_location':'string',
                        'user_profile_description':'string',
                        'user_profile_url':'string',
                        'account_creation_date':'datetime64',
                        'account_language':'string',
                        'tweet_language':'string',
                        'tweet_text':'string',
                        'tweet_time':'datetime64',
                        'tweet_client_name':'category',
                        'in_reply_to_userid':'string',
                        'retweet_userid':'string',
                        'latitude':'category',
                        'longitude':'category',
                        'campaign':'string',
                        'government':'string',
                        'hashtags':'string',
                        'urls':'string',
                        'user_mentions':'string',
                        'file':'string'}
        
        self.df = self.df.astype(self.fields)
        self.df['type'] = self.df['is_retweet'].apply(
                            lambda x: 'retweet' if x == True else 'original')
        self.df['has_quote'] = self.df['quoted_tweet_tweetid'].notna()
        
    def __str__(self):
        return 'Dataset: {}'.format(name)
    
    def to_torch(self):
        return #torch.data.DataLoader(self.df)
    
    def to_tf(self):
        return tf.data.Dataset.from_tensor_slices(dict(self.df))
        
    def to_np(self):
        return np.to_array(self.df)

In [20]:
tweets = TweetsData('../data/tweets')
tweets.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729129 entries, 1331706590525874184 to 1000259808442961920
Data columns (total 35 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   userid                    729129 non-null  string        
 1   user_display_name         729129 non-null  string        
 2   user_screen_name          729129 non-null  string        
 3   user_reported_location    545336 non-null  string        
 4   user_profile_description  684476 non-null  string        
 5   user_profile_url          448380 non-null  string        
 6   follower_count            729129 non-null  int64         
 7   following_count           729129 non-null  int64         
 8   account_creation_date     729129 non-null  datetime64[ns]
 9   account_language          729129 non-null  string        
 10  tweet_language            603007 non-null  string        
 11  tweet_text                729129 n

## Cleaning text

Turn hashtag, url, and user mentions columns into lists

In [18]:
df = tweets.df.loc[:][:]

In [8]:
def str_to_list(string):
    string = re.sub(r"[\[\]\']", "", string)
    return list(string.split(','))

In [9]:
def melt_list_column(df, col_name, id_var):
    mask = df[col_name].isin(['[]'])
    col = df[col_name][~mask]
    return (col
            # expand items into columns
            .str.replace(r"[\[\]\']", "", regex=True)
            .str.split(',', expand=True)
            # melt wide table into duplicated tweets
            .reset_index()
            .melt(id_vars=[id_var])
            .drop(columns=['variable'])
            .dropna()
           )

In [12]:
df['hashtags'] = df['hashtags'].map(str_to_list, na_action='ignore')

In [13]:
df['hashtags']

tweetid
1331706590525874184                        [haqqinaz,  haqqin,  haqqinz]
1100358276435398656    [Azerbaijan,  shareforkhojaly,  justiceforkhoj...
1100389340914569216                [shareforkhojaly,  justiceforkhojaly]
724982683118358528                                                    []
728142042765742080                                          [Azerbaijan]
                                             ...                        
1004340724949831681           [Syrianarmy,  Aleppo,  Terrorists,  Syria]
1001358869132644354                          [Dera,  terrorists,  Syria]
982141416179576835     [Refugees,  Euphrates,  Jnayna,  Syria,  news,...
979255677850935296     [US,  European,  Syria,  EasternGhouta,  Damas...
1000259808442961920                                              [Syria]
Name: hashtags, Length: 729129, dtype: object

In [21]:
df = tweets.df.loc[:][:]

hashtags = melt_list_column(df, 'hashtags', 'tweetid')
urls = melt_list_column(df, 'urls', 'tweetid')
mentions = melt_list_column(df, 'user_mentions', 'tweetid')

In [22]:
hashtags.head(10)

Unnamed: 0,tweetid,value
0,1331706590525874184,haqqinaz
1,1100358276435398656,Azerbaijan
2,1100389340914569216,shareforkhojaly
3,728142042765742080,Azerbaijan
4,650550450320601088,novosti_az
5,661914640331333632,novosti_az
6,535744142437933056,novosti_az
7,676719279950053376,novosti_az
8,563659939231457280,novosti_az
9,593663020993150977,novosti_az
