# Attributes and Select Authors

In [1]:
import glob
import os
import pickle

import pandas as pd

In [2]:
def grabFiles(filepath):
    """
        Given a directory path (or a path prefix),
        return a sorted list of the files matching '<filepath>*.json'.

        filepath: directory containing the files, with or without a trailing
                  separator; any other string is used as a plain prefix.
        returns:  list of matching paths in ascending order (empty when
                  nothing matches).
    """
    # A directory given without its trailing separator would otherwise turn
    # into the pattern 'dir*.json' and match siblings instead of its contents.
    if os.path.isdir(filepath) and not filepath.endswith(('/', os.sep)):
        filepath = os.path.join(filepath, '')
    # glob.glob returns matches in arbitrary order; sorting makes positional
    # access such as result[0] reproducible between runs and machines.
    return sorted(glob.glob(filepath + '*.json'))

In [3]:
# Directory holding the decompressed '*.json' files for this analysis.
p = '2021/01/25/19/decompress/'
filenames_arr = grabFiles(p)
# Stop here with an explicit message when nothing was found, instead of
# letting a later filenames_arr[0] fail with a bare IndexError.
if not filenames_arr:
    raise FileNotFoundError("No '*.json' files found under " + repr(p))

In [4]:
# Load only the first matched file; lines=True parses one JSON object per line.
df = pd.read_json(path_or_buf=filenames_arr[0], lines=True)

# Attribute Catalog

There are 37 attributes per record in the dataset. Some interesting ones are:
- contributors: a twitter account that tweets under another account name. (https://www.techinasia.com/twitter-contributor)
- created_at: time a tweet is created (UTC)
- lang: language of the text message
- reply_count: the number of replies that the message received
- text: the text in the tweet
- user: an object containing information about the user (for example, `screen_name`)

In [5]:
def numberAttributes(dataframe):
    """
        Count the attributes (columns) of a dataframe.

        dataframe: the pandas DataFrame to inspect.
        returns:   the number of columns as an int.
    """
    # shape is (rows, columns); only the column count is of interest here.
    _, column_count = dataframe.shape
    return column_count

def attributes(dataframe):
    """
        List the attributes (column labels) of a dataframe.

        dataframe: the pandas DataFrame to inspect.
        returns:   a plain Python list of the column labels, in column order.
    """
    column_labels = dataframe.columns
    return column_labels.tolist()

In [6]:
# Number of attributes in the raw frame.
print(numberAttributes(dataframe=df))

37


In [7]:
# Names of all attributes in the raw frame.
print(attributes(dataframe=df))

['contributors', 'coordinates', 'created_at', 'delete', 'display_text_range', 'entities', 'extended_entities', 'extended_tweet', 'favorite_count', 'favorited', 'filter_level', 'geo', 'id', 'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place', 'possibly_sensitive', 'quote_count', 'quoted_status', 'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink', 'reply_count', 'retweet_count', 'retweeted', 'retweeted_status', 'source', 'text', 'timestamp_ms', 'truncated', 'user']


In [8]:
# Attributes kept for the rest of the analysis.
SELECTED_ATTRIBUTES = ['contributors', 'created_at', 'lang', 'reply_count', 'text', 'user']
# Selecting with a list of labels replaces concatenating six separate Series;
# .copy() gives df_rev its own data so later edits cannot touch df.
df_rev = df[SELECTED_ATTRIBUTES].copy()
print(numberAttributes(df_rev))

6


# Users in the dataset that speak English

There are 945 distinct screen names that tweeted in English (`lang == 'en'`) in the dataset.

Reference:
https://stackoverflow.com/questions/29325458/dictionary-column-in-pandas-dataframe/29330853

In [9]:
def getScreenName(df):
    """
        Given a dataframe containing a 'user' column of user objects (dicts),
        return a Series named 'screen_name' with each row's screen_name.

        Rows whose 'user' value is not a dict (for example NaN), or whose
        dict has no 'screen_name' key, yield NaN. The result keeps df's index.
    """
    missing = float('nan')

    def _screen_name(user):
        # Read the single key that is needed; expanding every dict with
        # apply(pd.Series) builds one Series per row and is far slower.
        if isinstance(user, dict):
            return user.get('screen_name', missing)
        return missing

    # The explicit name becomes the column label when the Series is joined
    # back onto a dataframe.
    return df['user'].map(_screen_name).rename('screen_name')

def speakEnglish(df):
    """
        Given a dataframe containing a 'lang' column,
        return a copy with a new boolean column 'english_speaker' that is
        True where lang is 'en'. The input dataframe is not modified.
    """
    # assign() builds a new frame; the previous draft tried to return an
    # assignment statement, which is not valid Python.
    return df.assign(english_speaker=df['lang'] == 'en')
    
def speakEnglishScreenNames(df):
    """
        Given a dataframe containing 'screen_name' and 'lang' columns,
        return an array of the distinct screen names on rows whose lang
        is 'en'. Missing screen names are left out.
    """
    is_english = df['lang'] == 'en'
    english_names = df.loc[is_english, 'screen_name']
    return english_names.dropna().unique()

In [10]:
# Swap the nested 'user' objects for a flat 'screen_name' column. The guard
# keeps this cell re-runnable: once 'user' has been dropped, running it again
# would otherwise fail with a KeyError when the 'user' column is looked up.
if 'user' in df_rev.columns:
    df_rev = (
        df_rev
        .assign(screen_name=getScreenName(df_rev))
        .drop(columns=['user'])
    )

print(numberAttributes(df_rev))
print(df_rev.head(5))

6
   contributors          created_at lang  reply_count  \
0           NaN 2021-01-26 02:36:00   ja          0.0   
1           NaN 2021-01-26 02:36:00   it          0.0   
2           NaN 2021-01-26 02:36:00   ja          0.0   
3           NaN 2021-01-26 02:36:00   pt          0.0   
4           NaN 2021-01-26 02:36:00   pt          0.0   

                                                text      screen_name  
0  RT @miyabi39mama: Êò†ÁîªÊâπË©ï„ÅåÁÇé‰∏ä„Åó„Å¶„ÅÑ„Çã‰ª∂\nÁä∂Ê≥Å„ÅØ„ÄÅ„Å™„Çì„Å®„Å™„Åè„Çè„Åã„Çä„Åæ„Åó...  nighthawkf117aj  
1               RT @vanessabschrodr: in tro me ti da   eduardazarnott  
2                                  @prism_pf „Å´„ÇÉ„Éº„Éº„Éº„ÅÑüòª     gomi_naotaka  
3              Sarah parece uma gabi sem boca KKKKKK          nalutop  
4  mentira que o lucas se juntou com o n*go di q ...          leighsx  


In [11]:
# Distinct screen names of accounts whose tweets are tagged as English.
e = speakEnglishScreenNames(df=df_rev)

In [12]:
# How many distinct English-language screen names were found.
print(e.size)

945


In [13]:
# Persist the screen names for later notebooks. The context manager closes
# (and therefore flushes) the file even if pickling raises.
with open("english_speaking_authors.p", "wb") as author_file:
    pickle.dump(e, author_file)