### How unique are the messages?

In [1]:
import sys
print(sys.version)
print(spark.version)

3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:46:39) 
[GCC 10.4.0]
3.1.3


In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')
from itertools import compress 
from pyspark.sql.functions import *
from pyspark.sql.types import *
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore')
from pyspark.sql.functions import col

import re
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import CountVectorizer,  IDF, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row

In [3]:
pip install --upgrade regex

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# enable repl.eagearEval
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [6]:
# load the data into spark dataframe
tweets = spark.read.parquet('gs://msca-bdp-students-bucket/shared_data/sshende/processed_data')

23/03/07 08:19:13 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


For this analysis we are only considering the tweets and not retweet

In [7]:
# create a dataframe that contains userid, user screen name, tweet, follower count and verified status
twitterers = tweets.select('id', 'screen_name', 'description', 'text', 'followers_count', 'verified') \
                                .filter((col('description').isNotNull()) & (col('verified').isNotNull()) & (col('retweeted_status').isNull()) & (col('quoted_status').isNull()))
twitterers

                                                                                

id,screen_name,description,text,followers_count,verified
1511608226797785094,Mumbejja22,Mum's cutie🥰,@Philemonkats Tha...,73,False
1306562642832535555,mlfkoo,wheels on the bus...,i just saw the sy...,861,False
2839680719,2blexx,Don't take this l...,@FS_Yusuf_ @AfamD...,268,False
2313624048,BryntegSchool,"Ewenny Road, Brid...",Year 7 attending ...,2622,False
42950485,drridpath,Job is Professor ...,@CheckAnfro @owta...,3075,False
1227924608,PEJHN,The latest inform...,What a way to end...,814,False
1291635658432421889,fableSimmer,"he/him, BLM, lgbt...",@odaphiv I think ...,886,False
1349051156501131265,AuthourizedKJV,#Acts2:38 #Method...,"3 years, don't qu...",82,False
4884639118,djsamank,this guy @dremtgi...,#NowPlaying When ...,70664,False
42622744,Cambslive,"The home of news,...",The pre-school sa...,88205,True


In [8]:
twitterers.count()

                                                                                

16531309

In [8]:
# Define keywords for different types of accounts
govt_keywords = ['government', 'official', 'politics', 'public service', 'state', 'federal', 'city', 'municipal', 'agency', 'parliament', 'elected', 'representative', 'legislation', 
                 'regulation', 'policy', 'public office', 'civil servant', 'bureaucracy', 'minister', 'governor', 'mayor', 'council', 'embassy', 'consulate', 'foreign affairs',
                 'homeland security', 'defense', 'military', 'law enforcement', 'justice', 'court', 'judge', 'prosecutor', 'immigration', 'customs', 'passport']
govt_keywords_str = '|'.join(govt_keywords)

edu_keywords = ['school', 'college', 'university', 'education', 'academy', 'students', 'professor', 'campus', 'degree', 'scholarship', 'research', 'faculty', 'lecture', 'curriculum',
                'syllabus', 'graduation', 'alumni', 'student government']
edu_keywords_str = '|'.join(edu_keywords)

nonprofit_keywords = ['non-profit', 'nonprofit', 'charity', 'foundation', 'ngo', 'non-governmental organization', 'social enterprise', 'community development', 'volunteer',
                      'service organization', 'social impact', 'public benefit organization','charitable trust', 'not-for-profit']
nonprofit_keywords_str = '|'.join(nonprofit_keywords)

news_keywords = ['news', 'media', 'journalism', 'press', 'reporting', 'reporter', 'anchor', 'editor', 'newspaper', 'broadcasting', 'newsroom', 'current events']
news_keywords_str = '|'.join(news_keywords)

influencer_keywords = ['influencer', 'blogger', 'vlogger', 'public figure', 'social media', 'coach', 'content creator', 'brand ambassador', 'fashionista', 'lifestyle guru', 
                       'fitness expert', 'beauty blogger', 'travel influencer', 'foodie', 'mommy blogger']
influencer_keywords_str = '|'.join(influencer_keywords)

health_keywords = ['health', 'healthcare', 'hospital', 'medical', 'doctor', 'nurse', 'wellness', 'public health', 'health promotion', 'health education', 'health policy',
                      'health services', 'health research', 'healthcare provider', 'healthcare system', 'disease']
health_keywords_str = '|'.join(health_keywords)

# Identify account type based on presence of keywords in user name or description
twitterers_account_type = twitterers.withColumn('account_type',
                            when(twitterers['verified'] == False, 'Other')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(govt_keywords_str)) | 
                                (lower(twitterers['description']).rlike(govt_keywords_str)), 
                                'Government Entities')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(edu_keywords_str)) | 
                                (lower(twitterers['description']).rlike(edu_keywords_str)), 
                                'Education Organizations')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(nonprofit_keywords_str)) | 
                                (lower(twitterers['description']).rlike(nonprofit_keywords_str)), 
                                'Non-Profit Organizations')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(news_keywords_str)) | 
                                (lower(twitterers['description']).rlike(news_keywords_str)), 
                                'News Outlets')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(health_keywords_str)) | 
                                (lower(twitterers['description']).rlike(health_keywords_str)), 
                                'Health Organizations')
                           .when((twitterers['verified'] == True) &
                               (lower(twitterers['screen_name']).rlike(influencer_keywords_str)) | 
                                (lower(twitterers['description']).rlike(influencer_keywords_str)) &
                                (twitterers['followers_count'] > 5000), 
                                'Social Media Influencers')
                           .otherwise('Other'))

twitterers_account_type.limit(5)

                                                                                

id,screen_name,description,text,followers_count,verified,account_type
31135446,gordonluckett,Gordon Luckett is...,Vote for my class...,323,False,Other
758112005538484224,USLowcountry,University School...,Why USL? Our stud...,87,False,Other
1063178622,SUVtv,Sports Network - ...,🚨@JustinWBrantle...,9878,False,Other
78143825,lizerlfunk,I teach math. I a...,@moontwinie @_Mar...,185,False,Other
1456977308607258631,Shitposting224,Her/Hers - 🏳️‍🌈...,Man school is som...,2,False,Other


In [9]:
# create a dataframe with account_type and text
tweets_by_account_type = twitterers_account_type.select('account_type', 'text').filter(col('account_type').isNotNull() & col('text').isNotNull())
tweets_by_account_type.limit(5)

                                                                                

account_type,text
Other,@Philemonkats Tha...
Other,i just saw the sy...
Other,@FS_Yusuf_ @AfamD...
Other,Year 7 attending ...
Other,@CheckAnfro @owta...


In [10]:
print("Number of rows:", tweets_by_account_type.count())
print("Number of columns:", len(tweets_by_account_type.columns))



Number of rows: 16531309
Number of columns: 2


                                                                                

In [10]:
# save the tweets by account type dataframe as a parquet file
tweets_by_account_type.write.format("parquet").\
mode('overwrite').\
save('gs://msca-bdp-students-bucket/shared_data/sshende/tweets_by_account')

                                                                                

In [28]:
# read the data
tweets_by_account_type = spark.read.parquet('gs://msca-bdp-students-bucket/shared_data/sshende/tweets_by_account')

In [11]:
tweets_by_account_type.groupBy('account_type').count()

                                                                                

account_type,count
Government Entities,111359
Other,16151242
Non-Profit Organi...,5892
Health Organizations,4068
News Outlets,193952
Social Media Infl...,2417
Education Organiz...,62379
