In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import re
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TextBlob - Python library for processing textual data
from textblob import TextBlob

#GeoText to get country alpha-2 codes and identify cities and countries in text
!pip install GeoText
from geotext import GeoText

#libraries to extract country name from cities and textual data
!pip install geopandas
!pip install geopy
import geopandas
import geopy
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

#time library to space-out requests
import time

#pycountry library to get alpha-2 country and continent codes
!pip install pycountry-convert
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2

#Using folium maps to create visualization
!pip install folium
import folium
from folium.plugins import MarkerCluster

from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv('../input/pfizer-vaccine-tweets/vaccination_tweets.csv')
data.head()

In [None]:
data2 = data.copy()

In [None]:
data.shape

In [None]:
print('no. of unique users:', len(data['user_name'].unique()))

*There are users who have tweeted more than once.*

In [None]:
# Number of tweets per user name.
username_counts = data['user_name'].value_counts()
# Show only users with more than one tweet. The index and the values are labelled
# explicitly, so the column names do not depend on how value_counts() names its
# result (this differs between pandas versions).
username_counts[username_counts > 1].rename_axis('username').reset_index(name='counts')

In [None]:
usernamecounts_dict = dict(username_counts[username_counts>1])
usernamecounts_dict

In [None]:
data['user_location'].value_counts()

In [None]:
# Take an independent copy of the location-related columns, so that the in-place
# edits and column assignments made on data_loc later do not operate on a slice
# of `data` (which triggers SettingWithCopyWarning).
data_loc = data[['id', 'user_name', 'user_location']].copy()
data_loc.head()

In [None]:
data_loc.shape

In [None]:
# Types of all 'user_location' entries that are not strings.
loc_obj_types = [type(x) for x in data_loc['user_location'] if not isinstance(x, str)]
print('no. of data types other than string', len(set(loc_obj_types)))
print('no. of objects that are not string', len(loc_obj_types))
print('non-string "user_locations" are ', (len(loc_obj_types)/data_loc.shape[0])*100, '%')

In [None]:
# Drop rows without a user location. Rebinding (instead of inplace=True) yields a
# new frame, so the cell does not mutate a frame derived from `data`.
data_loc = data_loc.dropna(subset=['user_location'])
data_loc.shape

In [None]:
def _first_or_unknown(matches):
    """Return the first entry of a list of GeoText matches, or 'Unknown' if it is empty."""
    return matches[0] if matches else 'Unknown'

# Title-case string locations; anything that is not a string becomes 'Unknown'.
data_loc['user_location'] = data_loc['user_location'].progress_apply(lambda x: x.title() if isinstance(x, str) else 'Unknown')
# Parse every location once and reuse the parsed object for both derived columns.
geo_matches = data_loc['user_location'].progress_apply(lambda x: GeoText(str(x)))
data_loc['country_location'] = geo_matches.apply(lambda g: _first_or_unknown(g.countries))
data_loc['city_location'] = geo_matches.apply(lambda g: _first_or_unknown(g.cities))
data_loc.head()

In [None]:
# Keep only rows where at least a country or a city was recognised. The explicit
# copy makes the filtered frame independent, so adding columns to it later does
# not act on a slice of the unfiltered frame.
data_loc = data_loc[~((data_loc['country_location'] == 'Unknown') & (data_loc['city_location'] == 'Unknown'))].copy()

In [None]:
data_loc.shape

*Using RateLimiter to space out requests to the geocoding servers.*

In [None]:
# Nominatim geocoder client, identified by the user agent 'myGeocoder'.
locator = Nominatim(user_agent='myGeocoder')
# Rate-limited wrapper around locator.geocode (minimum delay of 1 second between calls).
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

In [None]:
data_loc['city_location'].value_counts()

In [None]:
data_loc['country_location'].value_counts()

In [None]:
data_loc.head()

In [None]:
def _country_from_city(city):
    """Geocode a city name and return the last component of the resolved address.

    Returns 'Unknown' when the geocoder yields no result (the rate-limited
    geocode wrapper can return None), instead of failing on `.address`.
    """
    place = geocode(city)
    if place is None:
        return 'Unknown'
    return place.address.split(', ')[-1]

# Rows whose country is still unknown but for which a city was recognised.
needs_lookup = (data_loc['country_location'] == 'Unknown') & (data_loc['city_location'] != 'Unknown')

# Geocode every distinct city once; each request is rate limited, so repeating
# the lookup for every row with the same city would only add waiting time.
city_to_country = {
    city: _country_from_city(city)
    for city in tqdm_notebook(data_loc.loc[needs_lookup, 'city_location'].unique())
}

# Keep the existing country where there is one, otherwise use the geocoded country.
data_loc['country_location2'] = data_loc['country_location'].where(
    ~needs_lookup, data_loc['city_location'].map(city_to_country)
)
data_loc.head()

In [None]:
# Replace the old country column with the geocoded one, keeping the original column name.
data_loc = (
    data_loc
    .drop(columns=['country_location'])
    .rename(columns={'country_location2': 'country_location'})
)
data_loc.head()

In [None]:
data_loc['country_location'].value_counts()

In [None]:
def translate_text(text):
    """Return an English version of a location name, on a best-effort basis.

    If ``text`` contains '/', only the part after the first '/' is used.
    The language is detected once with TextBlob:

    * English text is returned unchanged;
    * Greek text ('el') is mapped directly to 'Greece';
    * anything else is translated to English.

    If language detection or translation fails for any reason, the
    (possibly shortened) input text is returned unchanged.

    Parameters
    ----------
    text : str
        Location name to translate.

    Returns
    -------
    str
        The translated name, or the input text when no translation is made.
    """
    if '/' in text:
        text = text.split('/')[1].strip()
    blob = TextBlob(u'"' + text + '"')
    # Pause before contacting the language service to space out requests.
    time.sleep(1)
    try:
        # Detect once and reuse the result; every detection is a separate request.
        language = blob.detect_language()
    except Exception:
        # Detection is unavailable or failed: fall back to the untranslated text.
        return text
    if language == 'en':
        return text
    print(text)
    if language == 'el':
        return 'Greece'
    time.sleep(1)
    try:
        return str(blob.translate(to='en')).strip('"')
    except Exception:
        return text

In [None]:
locs = list(data_loc['country_location'])
u_locs = list(set(locs))
print(len(locs))
print(len(u_locs))

In [None]:
loc_trans = {text: translate_text(text) for text in u_locs}
print(loc_trans)

In [None]:
data_loc['country_location2'] = data_loc['country_location'].apply(lambda x: loc_trans[x])
data_loc.head()

In [None]:
data_loc['country_location2'].value_counts()

In [None]:
def get_continent_code(text):
    """Return the continent code for a country alpha-2 code.

    Delegates to ``country_alpha2_to_continent_code``. If that lookup raises,
    the placeholder ' Un' is returned instead.

    NOTE(review): the placeholder has a leading space, unlike the 'Un' returned
    by get_country_code. It is presumably a typo, but it is left unchanged in
    case other code compares against it. Please confirm.
    """
    try:
        return country_alpha2_to_continent_code(text)
    except Exception:
        # Best-effort lookup. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return ' Un'

In [None]:
def get_country_code(text):
    """Return the alpha-2 code for a country name.

    Delegates to ``country_name_to_country_alpha2``. If that lookup raises,
    the placeholder 'Un' is returned instead.
    """
    try:
        return country_name_to_country_alpha2(text)
    except Exception:
        # Best-effort lookup. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'Un'

In [None]:
data_loc['country_code'] = data_loc['country_location2'].progress_apply(lambda x: get_country_code(x))
data_loc.head()

In [None]:
data_loc[data_loc['country_code'] == 'Un']['country_location2'].value_counts()

In [None]:
difcount_codes = {'The United Arab Emirates' : 'AE', 'The Netherlands': 'NL', 'Asia': 'IN', 'Saudi': 'SA', 'Chili': 'CL', 'Luzon': 'PH', 'Sri Lanka Sri Lanka': 'LK', 'Free Kashmir': 'IN', 'Swiss': 'CH'}

In [None]:
list(difcount_codes.keys())

In [None]:
# Fill in codes for names the automatic lookup could not resolve. Names missing
# from difcount_codes keep the 'Un' placeholder instead of raising KeyError.
data_loc['country_code'] = data_loc.progress_apply(
    lambda x: difcount_codes.get(x['country_location2'], 'Un') if (x['country_code'] == 'Un') else x['country_code'],
    axis = 1,
)
data_loc.head()

In [None]:
data_loc['country_code'].value_counts()

In [None]:
data_loc['continent_code'] = data_loc['country_code'].progress_apply(lambda x: get_continent_code(x))
data_loc.head()

In [None]:
u_countrycodes = list(set(list(data_loc['country_code'])))
print(len(u_countrycodes))
print(u_countrycodes)

In [None]:
geolocator = Nominatim(user_agent='myGeocoder')
# Rate-limited wrapper (minimum delay of 1 second between calls), so that looking
# up many countries in a row does not send requests back to back.
_geolocate_query = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def geolocate(country):
    """Look up coordinates for a country name or code with Nominatim.

    Parameters
    ----------
    country : str
        Query passed to the geocoder.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's result, or
        ``(np.nan, np.nan)`` when there is no result or the lookup fails.
    """
    try:
        loc = _geolocate_query(country)
    except Exception:
        # Best-effort lookup: report a missing value instead of stopping the run.
        return (np.nan, np.nan)
    if loc is None:
        # The geocoder found nothing for this query.
        return (np.nan, np.nan)
    return (loc.latitude, loc.longitude)

In [None]:
country_coords = {cc: geolocate(cc) for cc in u_countrycodes}
print(country_coords)

In [None]:
# Country codes whose lookup failed are retried with the full country name.
fallback_names = {'IN': 'India', 'IL': 'Israel', 'ET': 'Ethiopia'}
for code, coords in country_coords.items():
    # Test for NaN explicitly: NaN never compares equal to itself, so a tuple
    # comparison against (np.nan, np.nan) only works through object identity.
    if any(np.isnan(value) for value in coords):
        print(code)
        if code in fallback_names:
            # Only the value of an existing key is replaced, which is safe while iterating.
            country_coords[code] = geolocate(fallback_names[code])
print(country_coords)

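*Geopy's geocode function could not return the correct coordinates for regions when queried by their two-letter country codes. Regions such as the Cayman Islands ('KY') and Albania ('AL'), among others, were misidentified as U.S. states (these codes coincide with the postal abbreviations for Kentucky and Alabama).*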

*I recently came across a gist containing exactly the kind of country data needed here, so the coordinates below are taken from that publicly available CSV file.*

In [None]:
country_data = pd.read_csv('https://gist.githubusercontent.com/cpl/3dc2d19137588d9ae202d67233715478/raw/3d801e76e1ec3e6bf93dd7a87b7f2ce8afb0d5de/countries_codes_and_coordinates.csv')
country_data.head()

In [None]:
def _strip_quotes(value):
    """Remove double quotes and surrounding spaces from a raw field value."""
    return value.replace('"', "").strip(' ')

# Country names only need converting to str.
country_data['Country'] = country_data['Country'].progress_apply(str)

# The remaining columns are unquoted first and then converted to their target type.
for column, caster in [
    ('Alpha-2 code', str),
    ('Alpha-3 code', str),
    ('Numeric code', int),
    ('Latitude (average)', float),
    ('Longitude (average)', float),
]:
    country_data[column] = country_data[column].progress_apply(
        lambda x, caster=caster: caster(_strip_quotes(x))
    )

In [None]:
# Expose the alpha-2 code under the name 'country_code' (appended as the last
# column), drop the original column, and set the dtypes of the remaining columns.
country_data['country_code'] = country_data['Alpha-2 code'].astype(str)
country_data = (
    country_data
    .drop(columns=['Alpha-2 code'])
    .astype({
        'Country': str,
        'Alpha-3 code': str,
        'Numeric code': int,
        'Latitude (average)': float,
        'Longitude (average)': float,
    })
)
country_data.head()

In [None]:
n_tweets = data_loc.shape[0]
print('tweets with locations: ', n_tweets)

In [None]:
wrld_map = data_loc.groupby(['country_code']).size().to_frame(name = 'count').reset_index()
wrld_map.head()

In [None]:
wrld_map['percentage'] = wrld_map['count'].progress_apply(lambda x: (x/n_tweets))
wrld_map.drop(['count'], axis = 1, inplace = True)
wrld_map.head()

In [None]:
wrld_map = pd.merge(wrld_map, country_data, on='country_code')
wrld_map.head()

In [None]:
wrld_map['continent_code'] = wrld_map['country_code'].progress_apply(lambda x: get_continent_code(x))
wrld_map.head()

In [None]:
#empty map
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)

In [None]:
#for each country, create a circle marker showing its share of users
marker_columns = ['Latitude (average)', 'Longitude (average)', 'country_code', 'percentage']
for lat, lon, code, share in wrld_map[marker_columns].itertuples(index=False, name=None):
    # 'percentage' holds a fraction (count / n_tweets), so format it as a
    # percentage to match the "% of Users" label.
    popup_text = "Country : {}<br>% of Users : {:.2%}<br>".format(code, share)
    folium.CircleMarker(location=[float(lat), float(lon)], radius=5, popup=popup_text, fill=True).add_to(marker_cluster)
#show the map
world_map

In [None]:
data['user_name'].isna().sum()

***There are no records with empty user names.***

In [None]:
data['user_location'].isna().sum()

# *There are 1250 missing user locations*

# **Dropping the user_location feature, as its missing values make up more than 34% of the data**

# *We will drop user_descriptions as well, as they don't seem to contribute much.*

In [None]:
data['user_created'].isna().sum()

# *All the user creation dates seem to be present.*

In [None]:
data['user_followers'].isna().sum()

In [None]:
import plotly.express as px
fig = px.histogram(data, x="user_followers", range_x  = (0,500000))
fig.show()

# *User Followers seem fine.*

In [None]:
# Cleaning the tweets

def cleanUpTweet(txt):
    """Strip URLs, @mentions, '#' signs and the retweet prefix from a tweet.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed tokens is kept.
    """
    # Remove urls: everything up to the next whitespace, so URLs containing
    # characters such as '-', '_', '?', '=' or '%' are removed completely.
    txt = re.sub(r'https?://\S+', '', txt)
    # Remove mentions
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove the '#' sign but keep the hashtag word
    txt = re.sub(r'#', '', txt)
    # Remove the retweet prefix: "RT @user: " has become "RT : " once the mention is gone
    txt = re.sub(r'RT : ', '', txt)
    return txt

In [None]:
data['text'] = data['text'].apply(cleanUpTweet)

*Determining Subjectivity and Polarity of text using TextBlob*

In [None]:
def getTextSubjectivity(txt):
    """Return the subjectivity component of TextBlob's sentiment for ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity

def getTextPolarity(txt):
    """Return the polarity component of TextBlob's sentiment for ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity

In [None]:
data['Subjectivity'] = data['text'].apply(getTextSubjectivity)
data['Polarity'] = data['text'].apply(getTextPolarity)

In [None]:
data.head()

In [None]:
# negative, neutral, positive analysis
def getTextAnalysis(a):
    """Label a polarity score by its sign.

    Returns "Negative" for values below zero, "Neutral" for exactly zero,
    and "Positive" for everything else.
    """
    if a < 0:
        return "Negative"
    return "Neutral" if a == 0 else "Positive"

In [None]:
data['Sentiment'] = data['Polarity'].apply(getTextAnalysis)

In [None]:
data.head()

In [None]:
positive_tweets = data[data['Sentiment'] == 'Positive']

print(str(positive_tweets.shape[0]/(data.shape[0])*100) + " % of positive tweets")

In [None]:
# Count the tweets per sentiment once and use the same result for the bar
# labels and the bar heights, so the two always line up.
sentiment_counts = data.groupby('Sentiment').size()
labels = sentiment_counts.index.values
values = sentiment_counts.values

plt.bar(labels, values)
plt.title('Tweets per Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# One scatter call per sentiment class (instead of one per tweet): this is much
# faster and gives each class a single legend entry.
sentiment_colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'blue'}
for sentiment, color in sentiment_colors.items():
    subset = data[data['Sentiment'] == sentiment]
    plt.scatter(subset['Polarity'], subset['Subjectivity'], color=color, label=sentiment)

plt.title('Vaccine Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.legend()
plt.show()

*Initial Analysis using TextBlob shows more Positive tweets than Negative or Neutral*

In [None]:
data['Sentiment'].value_counts()

**Calculating Influence of Tweets by User Profile Size**

In [None]:
def tweet_influence(row):
    """Compute an influence score for one tweet.

    The score is ``(user_followers + retweets) / 2**is_retweet + user_friends``:
    the followers-plus-retweets term is halved when ``is_retweet`` is true.

    Parameters
    ----------
    row : mapping
        Must provide 'user_followers', 'retweets', 'is_retweet' and 'user_friends'.

    Returns
    -------
    number
        The influence score.
    """
    audience = row['user_followers'] + row['retweets']
    # 2**0 == 1 for original tweets, 2**1 == 2 for retweets
    divisor = 2 ** int(row['is_retweet'])
    return audience / divisor + row['user_friends']