## A Social Network Intervention for Improving Organ Donation Awareness in the US

### Objective: 
The study aims to track the digital markers of organ donation in the United States on Twitter

### Importing Required Packages

In [1]:
import argparse
from urllib.parse import urlparse
import urllib
import re
import pandas as pd
import numpy as np
from city_to_state import city_to_state_dict
from two_letter_states import us_state_abbrev
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
## Uncomment the lines below to install tweepy, us packages
import sys
#!{sys.executable} -m pip install tweepy
#!{sys.executable} -m pip install us
import tweepy
import us

### Access Twitter API in Python

In [2]:
def tw_oauth(authfile):
    """Build an authenticated tweepy API client from a credentials file.

    Parameters
    ----------
    authfile : str
        Path to a text file whose first four lines are passed, in order, to
        ``OAuthHandler(line1, line2)`` and ``set_access_token(line3, line4)``.

    Returns
    -------
    tweepy.API
        API client bound to the credentials read from ``authfile``.

    Raises
    ------
    ValueError
        If the file contains fewer than four lines.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(authfile, "r") as f:
        # strip() also removes "\r" (Windows line endings) and stray spaces,
        # which would otherwise end up inside the credentials.
        ak = [line.strip() for line in f]
    if len(ak) < 4:
        raise ValueError(
            "{} must contain 4 lines of credentials, found {}".format(authfile, len(ak))
        )
    auth1 = tweepy.auth.OAuthHandler(ak[0], ak[1])
    auth1.set_access_token(ak[2], ak[3])
    return tweepy.API(auth1)

# OAuth key file: a local text file read by tw_oauth, which uses its first
# four lines as credentials. Keep this file out of version control.
authfile = './auth.k.txt'
# Authenticated API client used by the search cell below.
api = tw_oauth(authfile)

### Twitter Data Extraction 

In [3]:
# Search configuration: query string, date window and cap on returned tweets.
search_words = "organ donation OR Transplant OR organ donor"
date_since = "2021-08-11"
date_until = "2021-12-31"
max_tweets = 1500

# Collect tweets: keyword arguments forwarded to api.search on every page.
search_options = dict(
    q=search_words,
    #lang="en",
    since=date_since,
    until=date_until,
    tweet_mode="extended",
)
# Lazy iterator; the API is only queried when the iterator is consumed.
tweets = tweepy.Cursor(api.search, **search_options).items(max_tweets)

def _tweet_to_row(tweet):
    """Flatten one tweet object into the list of values used as a DataFrame row.

    The order of the returned values must match the column list used when the
    DataFrame is built from ``data``.
    """
    author = tweet.author
    place = tweet.place          # may be None when the tweet is not geo-tagged
    entities = tweet.entities    # plain dict (it is indexed with ['hashtags'] below)
    hashtags = entities['hashtags']

    return [
        author.name, author.screen_name, author.location,
        int(author.geo_enabled),
        # getattr with a default is None-safe, like the original hasattr checks
        getattr(place, 'name', None),
        getattr(place, 'place_type', None),
        getattr(place, 'full_name', None),
        getattr(place, 'country_code', None),
        author.description,
        int(author.protected), int(author.verified), author.followers_count,
        author.friends_count, author.listed_count, author.favourites_count,
        author.statuses_count, author.created_at, tweet.created_at,
        # For retweets keep the full text of the original tweet
        tweet.retweeted_status.full_text if hasattr(tweet, 'retweeted_status') else tweet.full_text,
        tweet.source,
        tweet.retweet_count, tweet.favorite_count, tweet.lang,
        # Only the first hashtag is kept
        hashtags[0]['text'] if hashtags else None,
        # entities is a dict, so hasattr(entities, 'media') was always False and
        # these two columns were always None. Use key lookups and store 0/1
        # flags, consistent with the URL flag below.
        1 if entities.get('media') else 0,
        # NOTE(review): the poll entity key is presumably 'polls'; the original
        # spelling 'poll' is also checked. Confirm against the API response.
        1 if (entities.get('polls') or entities.get('poll')) else 0,
        1 if entities['urls'] else 0,
        int(author.default_profile), int(author.default_profile_image),
    ]


# One row per tweet returned by the search cursor
data = [_tweet_to_row(tweet) for tweet in tweets]



# Column names, in the same order as the values of each row in `data`.
tweet_columns = [
    # author and tweet location
    'User_Name', 'Screen_Name', 'User_Location', 'User_Geo_Enabled',
    'Tweet_Geo_Location', 'Tweet_Geo_LocationType', 'Tweet_Geo_LocationFull',
    'Tweet_Geo_CountryCode',
    # author profile
    'User_Description', 'User_Protected', 'User_Verified',
    'User_Followers_Count', 'User_Friends_Count', 'User_Listed_Count',
    'User_Favorites_Count', 'User_Status_Count', 'User_Since',
    # tweet content and engagement
    'Tweet_Createdon', 'Tweet_Text', 'Tweet_Source', 'Retweet_Count',
    'Tweet_Favorites_Count', 'Tweet_Language', 'Tweet_Hashtags',
    'Tweet_HasMedia', 'Tweet_HasPoll',
    # flags
    'User_URL', 'User_Default_Profile', 'User_Default_Image',
]

df = pd.DataFrame(data=data, columns=tweet_columns)

#df
#df[df['User_Default_Image'].notnull()]
#df[df['User_Default_Image']]

### Cleaning the User Location

US states are identified from the tweet's geo location or, when that is not available, from the author's profile location.

In [4]:
def get_state_abbr(x):
    """Resolve a free-text location string to a two-letter US state abbreviation.

    Resolution order:
      1. the string starts with one of the keys of ``us_state_abbrev``;
      2. the string starts with one of the keys of ``city_to_state_dict``
         (the mapped value is then looked up as a state);
      3. otherwise the string is split on whitespace/commas and every token is
         tried in turn, skipping the lowercase words 'in', 'la', 'me', 'oh'
         and 'or', which collide with state abbreviations.

    Parameters
    ----------
    x : str
        Location text.

    Returns
    -------
    str or None
        The state abbreviation, or None when nothing could be resolved.
    """
    text = x.lower()

    # re.escape keeps keys containing regex metacharacters (e.g. "St. Louis")
    # from being interpreted as patterns. re.match anchors at the string start.
    state_match = re.match(
        '({})'.format("|".join(re.escape(name.lower()) for name in us_state_abbrev)),
        text,
    )
    if state_match:
        tokens = [state_match.group(0)]
    else:
        city_match = re.match(
            '({})'.format("|".join(re.escape(city.lower()) for city in city_to_state_dict)),
            text,
        )
        if city_match:
            # Keys are looked up in title case; a miss yields NaN, which the
            # lookup loop below fails to resolve and therefore returns None.
            tokens = [city_to_state_dict.get(city_match.group(0).title(), np.nan)]
        else:
            # Case-sensitive on purpose: "IN" is kept, the word "in" is dropped.
            tokens = [j for j in re.split(r"\s|,", x) if j not in ['in', 'la', 'me', 'oh', 'or']]

    for i in tokens:
        candidate = str(i)
        if re.match(r'\w+', candidate):
            state = us.states.lookup(candidate)
            if state:
                return state.abbr
    return None

def Get_US_States(row):
    """Return the US state abbreviation for one tweet row, or None.

    The tweet's own geo location ('Tweet_Geo_LocationFull') takes precedence;
    the author's profile location ('User_Location') is used only when the tweet
    has no geo location. The first available field decides the result, even if
    it cannot be resolved to a state.

    The location is passed on with its original casing. Upper-casing it (as
    was done previously) defeats the lowercase stop-token filter inside
    ``get_state_abbr``, so words such as "in" or "or" were resolved to Indiana
    or Oregon.
    """
    for column in ('Tweet_Geo_LocationFull', 'User_Location'):
        location = row[column]
        # NaN is truthy, so check it explicitly as well as None/"".
        if location is not None and not pd.isna(location) and location:
            return get_state_abbr(str(location))
    return None
            
# Resolve a state abbreviation per row (None when no state is identified).
df['US_State'] = df.apply(Get_US_States, axis=1)

# Account age in whole calendar years, relative to the current year.
current_year = pd.to_datetime("today").year
account_creation_year = pd.to_datetime(df['User_Since']).dt.year
df['User_Since_Years'] = current_year - account_creation_year

### Saving the twitter output every iteration

Since the standard API only returns tweets from the past 7 days, we accumulate the results of every run in the 'Combined_result' dataframe and save them to 'Twitter_Output.xlsx'. Duplicate tweets are identified and removed.

In [5]:
# Snapshot of this run's raw pull (all countries), overwritten on every run
df.to_excel(r'./Twitter_Output_Temp.xlsx')
# Keep only tweets for which a US state could be identified
df_USA = df[df['US_State'].notnull()]

## Let us import the 'Twitter_Output.xlsx' file with all tweets that we saved so far
try:
    df_original = pd.read_excel(r'./Twitter_Output.xlsx')
except FileNotFoundError:
    # First run: there is no history yet, so start from this run's tweets only
    df_original = None

## Adding newer tweets to the 'Twitter_Output.xlsx'
frames = [frame for frame in (df_USA, df_original) if frame is not None]
Combined_result = pd.concat(frames)
# New tweets come first in the concat, so on a duplicate the fresh copy is kept
Combined_result = Combined_result.drop_duplicates(
    subset=['User_Name', 'Tweet_Createdon', 'Tweet_Text']
)
Combined_result = Combined_result.reset_index(drop=True)
Combined_result.to_excel(r'./Twitter_Output.xlsx', index = False)

### On to Machine Learning

Now we can apply a pre-trained Twitter account-type classification model (TF-IDF features + Logistic Regression classifier). It populates a new column, 'Individual': a value of 1 indicates that the tweet is from an individual, and a value of 0 indicates that it is from an organization.

**Assumption:** If the user description (bio) is empty, the account is assumed to belong to an individual.

In [6]:
import joblib

vectorizer = "vectorizer.pkl"
model = "Twitter_accounttype_classification_model.pkl"

# NOTE: joblib.load unpickles the files, which can execute arbitrary code.
# Only load model artefacts that come from a trusted source.
loaded_vectorizer = joblib.load(vectorizer)
loaded_model = joblib.load(model)

# Rows without a usable bio: missing values, or blank / whitespace-only text
description = Combined_result['User_Description']
missing_bio = description.isna() | description.astype(str).str.strip().eq('')

Combined_result['User_Description'] = description.fillna('Individual')
X_test = Combined_result['User_Description']
X_test_tfidf = loaded_vectorizer.transform(X_test)
y_predicted = loaded_model.predict(X_test_tfidf)
Combined_result['Individual'] = y_predicted

# Enforce the stated assumption explicitly instead of relying on what the
# model predicts for the placeholder text: an empty bio means an individual (1).
Combined_result.loc[missing_bio, 'Individual'] = 1

Combined_result.to_excel(r'./Twitter_Output.xlsx', index = False)


### Tweet Analysis

Now that we have identified the Twitter user account type, let us look at some trends across the US.

#### Organ Donation Tweets by US States

In [None]:
# Number of saved tweets per state, most active states first
cr_counts = (
    Combined_result
    .groupby(['US_State'])
    .size()
    .reset_index(name='# Organ Donation Related Tweets')
    .sort_values('# Organ Donation Related Tweets', ascending=False)
)

####### Map ########

#!{sys.executable} -m pip install plotly
import plotly.express as px

# States shaded by tweet count
fig = px.choropleth(
    cr_counts,
    locations='US_State',
    locationmode='USA-states',
    scope='usa',
    color='# Organ Donation Related Tweets',
    color_continuous_scale='blues',
    hover_name='US_State',
)

# Add abbreviated state labels on top of the map
fig.add_scattergeo(
    locations=cr_counts['US_State'],
    locationmode='USA-states',
    text=cr_counts['US_State'],
    mode='text',
)

# Add a centred map title
map_title = dict(
    text='Organ Donation Related Tweets by US State Since Aug 11, 2021',
    xanchor='center',
    yanchor='top',
    x=0.5,
)
fig.update_layout(title=map_title)

fig.show()

####### Bar chart ########

def addlabels(x, y):
    """Write each bar's value just above the bar on the current axes.

    x : sequence of bar categories; only its length is used, bar ``i`` is
        assumed to sit at x-position ``i``.
    y : pandas Series of bar heights, read positionally.
    """
    bar_count = len(x)
    for position in range(bar_count):
        height = y.iloc[position]
        # 1.5 data units of padding keeps the label clear of the bar top
        plt.text(position, height + 1.5, height, ha = 'center')
        
# cr_counts is sorted by tweet count (descending). Take the first 10 rows:
# the previous [0:9] slice only gave 9 states although the chart says "Top 10".
top_states = cr_counts.head(10)
x = top_states['US_State']
y = top_states['# Organ Donation Related Tweets']
fig, ax = plt.subplots(figsize =(12, 6))

# creating the bar plot
ax.bar(x, y, width = 0.4)

ax.set_xlabel("US States (Top 10)")
ax.set_ylabel("# Organ Donation Related Tweets")
ax.set_title("# of Organ Donation Related Tweets by US State (Top 10)")

# Add annotation to bars
addlabels(x, y)

# Add Text watermark
fig.text(0.85, 0.8, 'Since Aug 11, 2021', fontsize = 12,
         color ='black', ha ='right', va ='bottom',
         alpha = 1)
plt.show()

#### Organ Donation Tweets by US States and User Account Type

In [None]:
# 'Individual' is 1 for individuals and 0 for organizations, so summing the
# flag and its complement gives the number of tweets per account type.
# Vectorised column arithmetic replaces the former row-by-row apply(f, axis=1).
account_types = Combined_result.assign(
    Type_Individual=Combined_result['Individual'],
    Type_Organization=1 - Combined_result['Individual'],
)

cr_counts = (
    account_types
    .groupby('US_State')
    .agg({'Type_Individual': 'sum', 'Type_Organization': 'sum', 'User_Name': 'count'})
    .reset_index()
    # 'User_Name' now holds the number of tweets per state
    .sort_values('User_Name', ascending=False)
)

# States with the most tweets
x = cr_counts[0:9]['US_State']
y = cr_counts[0:9]['Type_Individual']
z = cr_counts[0:9]['Type_Organization']

X_axis = np.arange(len(x))

##### # Chart
fig, ax1 = plt.subplots(figsize=(8,6))
# Two bars per state, offset left/right of the tick position
ax1.bar(X_axis - 0.2, y, 0.4, label = 'Individual')
ax1.bar(X_axis + 0.2, z, 0.4, label = 'Organization')

ax1.set_xticks(X_axis)
ax1.set_xticklabels(x)
ax1.set_xlabel("US States")
ax1.set_ylabel("# of Tweets")
ax1.set_title("Organ Donation Related Tweets by Account Type")
ax1.legend()
plt.show()

##### % Chart
import matplotlib.ticker as mtick

# Share of each state's tweets that come from organizations. 'User_Name'
# holds the per-state tweet count produced by the preceding aggregation.
# Computed column-wise; no helper function is (re)defined here.
cr_counts = cr_counts.assign(
    **{'% Tweets by Organization': cr_counts['Type_Organization'] / cr_counts['User_Name']}
)
cr_counts = cr_counts.sort_values(['User_Name', '% Tweets by Organization'], ascending=False)
# drop=True: renumber the rows without adding a stray 'index' column
cr_counts = cr_counts.reset_index(drop=True)


x = cr_counts[0:9]['US_State']
y = cr_counts[0:9]['% Tweets by Organization']
# Recompute the bar positions from this chart's own data instead of relying
# on the value left over from the previous chart.
X_axis = np.arange(len(x))

fig, ax1 = plt.subplots(figsize=(8,6))
ax1.bar(X_axis, y, 0.5,
        label = '% Tweets by Organization',
        color = 'darkorange')
# Values are fractions in [0, 1]; render them as percentages
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1))
ax1.set_xticks(X_axis)
ax1.set_xticklabels(x)
ax1.set_xlabel("US States")
ax1.set_ylabel("% of Tweets by Organization")
ax1.set_title("Organ Donation Related Tweets by Organization")
ax1.legend()
plt.show()

### Twitter Sentiment Analysis 

In [None]:
#!{sys.executable} -m pip install textblob
#!{sys.executable} -m pip install nltk
#!{sys.executable} -m pip install wordcloud
#!{sys.executable} -m pip install langdetect
#nltk.download('vader_lexicon')
from textblob import TextBlob
import nltk
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

Let us clean the twitter text first

In [None]:
def standardize_text(df, text_field):
    """Normalise the text in ``df[text_field]`` in place and return ``df``.

    Applied in order: strip URLs, strip leftover "http", strip @mentions,
    replace every character outside the allowed set with a space, spell out
    any remaining "@" as "at", and finally lower-case the text.

    Note that the passed DataFrame itself is modified.
    """
    # (pattern, replacement) pairs; the order matters
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )
    for pattern, replacement in substitutions:
        df[text_field] = df[text_field].str.replace(pattern, replacement, regex=True)

    df[text_field] = df[text_field].str.lower()
    return df
# .copy() makes Twitter_Final an independent frame. Adding columns to a
# filtered slice of Combined_result raises SettingWithCopyWarning and may
# silently fail to write.
Twitter_Final = Combined_result[Combined_result['Tweet_Text'].notnull()].copy()
# Clean a copy of the text so the original tweet stays available for comparison
Twitter_Final['Tweet_Text_Clean'] = Twitter_Final['Tweet_Text']
Twitter_Final = standardize_text(Twitter_Final, 'Tweet_Text_Clean')
Twitter_Final[['Tweet_Text', 'Tweet_Text_Clean']]


Let us now use TextBlob to calculate the polarity and subjectivity of each tweet, and NLTK's VADER analyzer to calculate its negative, neutral, positive and compound sentiment scores.

In [None]:
# TextBlob: polarity and subjectivity of each cleaned tweet
Twitter_Final[['Polarity', 'Subjectivity']] = Twitter_Final['Tweet_Text_Clean'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

# VADER: negative / neutral / positive / compound scores.
# The analyzer is created once instead of once per tweet.
analyzer = SentimentIntensityAnalyzer()
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in Twitter_Final['Tweet_Text_Clean']],
    index=Twitter_Final.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
neg = vader_scores['neg'].to_numpy()
pos = vader_scores['pos'].to_numpy()

# Label by whichever of the negative/positive scores dominates; ties are Neutral
Twitter_Final['Sentiment'] = np.select(
    [neg > pos, pos > neg],
    ['Negative', 'Positive'],
    default='Neutral',
)
# The scores are stored for EVERY tweet. Previously these assignments were
# nested under the "Neutral" branch, leaving them empty for all other rows.
Twitter_Final['Negative'] = neg
Twitter_Final['Neutral'] = vader_scores['neu'].to_numpy()
Twitter_Final['Positive'] = pos
Twitter_Final['Compound'] = vader_scores['compound'].to_numpy()

Twitter_Final.to_excel(r'./Twitter_Output.xlsx', index = False)
        

#### Stop Words removal

Let us first look at the stop words of the English language from the nltk library. We can then create a custom stop word list specific to our project.

In [None]:
#!{sys.executable} -m pip install nltk
import nltk
# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): this presumably requires the 'stopwords' corpus to have been
# downloaded beforehand (nltk.download('stopwords')) - confirm on a fresh kernel.
stopwords = nltk.corpus.stopwords.words('english')
# Display the list for inspection
stopwords

Retaining some words that would be useful during the user account classification

In [None]:
# Pronouns that stay in the text (i.e. are NOT treated as stop words)
not_stop_words = [
    'i', 'me', 'my', 'myself', 'we', 'our',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'us',
]
# Custom stop-word list: the loaded list minus the retained pronouns
stopwords = [word for word in stopwords if not (word in not_stop_words)]

Defining the function to remove stopwords from the description tokens

In [None]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Parameters
    ----------
    text : iterable of str
        Already-tokenized text. Passing a plain string iterates over its
        characters, which is almost certainly not what is wanted.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stopwords`` list.

    Returns
    -------
    list of str
    """
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    excluded = set(stopwords if stop_words is None else stop_words)
    return [token for token in text if token not in excluded]

# Strip stop words from every row's token list.
# NOTE(review): df_user_labelled and its 'User_Description_Tokens' column are
# not created anywhere in the visible part of this notebook - confirm where
# they come from, otherwise this cell fails on a fresh kernel.
description_tokens = df_user_labelled['User_Description_Tokens']
df_user_labelled['User_Description_Tokens'] = description_tokens.apply(remove_stopwords)

df_user_labelled