Mount the Google Drive that contains:
- `twitter.properties`: holds the Twitter API credentials (and the Google Maps API key used later for geocoding)

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): configparser ships with the Python 3 standard library, so this
# install is probably unnecessary — confirm before removing.
!pip install ConfigParser



`configparser` is a Python standard-library module for reading INI-style configuration files, which consist of `[section]` headers followed by `key = value` (or `key: value`) entries.

In [None]:
import configparser

# Location of the properties file on the mounted Google Drive.
CONFIG_PATH = '/content/drive/My Drive/Colab Notebooks/twitter.properties'

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed. An empty list means no credentials were loaded, so fail here
# with a clear message instead of a NoSectionError in a later cell.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read config file: {} (is Google Drive mounted?)".format(CONFIG_PATH)
    )

# Show which sections were loaded as a quick sanity check.
print(config.sections())

['twitter']


In [None]:
import os
import tweepy as tw

In [None]:
# Read the four Twitter credentials from the [twitter] section of the config file.
# User access tokens
accesstoken = config.get('twitter','accesstoken')
accesstokensecret = config.get('twitter','accesstokensecret')
# Consumer API keys
apikey = config.get('twitter','apikey')
apisecretkey = config.get('twitter','apisecretkey')

# Build the OAuth handler from the consumer keys, attach the user tokens, then
# create the API client used by the search below.
auth = tw.OAuthHandler(apikey, apisecretkey)
auth.set_access_token(accesstoken, accesstokensecret)
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy sleep until the
# rate-limit window resets instead of raising — confirm against the tweepy docs.
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
# Query string passed as `q` to the Twitter search. Keywords and operators can be
# combined in this one string; see
# https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/guides/standard-operators
search_words = '#coronavirus'
# Passed as the `since` argument of the search; yyyy-mm-dd format.
date_since = '2021-01-01'

In [None]:
# Search via api.search for tweets matching search_words with lang='en' and
# since=date_since, capped at 100 items.
# NOTE(review): api.search is the tweepy 3.x name; tweepy 4 renamed it to
# search_tweets — confirm the installed version.
# NOTE(review): the result looks like a one-shot iterator, so re-running the
# DataFrame cell without re-running this one would yield no rows — confirm.
tweets = tw.Cursor(
    api.search,
    q= search_words,
    lang = 'en',
    since = date_since
).items(100)

In [None]:
import pandas as pd

# Uncomment to widen columns so the full tweet text is displayed:
#pd.set_option('max_colwidth', 800)


def _tweet_row(status):
    """Return [geo, text, author screen name, author profile location] for one tweet."""
    return [status.geo, status.text, status.user.screen_name, status.user.location]


# One row per tweet returned by the search.
tweet_details = list(map(_tweet_row, tweets))
tweet_df = pd.DataFrame(tweet_details, columns=['geo', 'text', 'user', 'location'])
tweet_df.head(10)

Unnamed: 0,geo,text,user,location
0,,"RT @UR_Ninja: Molly Wentzel, Steve Lahr, Larvi...",esolomon2,"Naples, FL"
1,,Nothing is too difficult for God. Let's join w...,themerry_monk,England.
2,,RT @ndmaindia: #COVID19 | While travelling in ...,dr_avtar_singh,"Ramnagar, Jammu And Kashmir"
3,,Airbnb CEO says travel never going back to the...,MusafirNamah,"New York, US & Delhi, India"
4,,RT @WHO: Preliminary investigations conducted ...,chipcamel,Varrock
5,,https://t.co/fUaADePSNt\n\nThis political sati...,ForNaturism,
6,,RT @WHO: Preliminary investigations conducted ...,PramodAgni1,"New Delhi, India"
7,,"RT @UR_Ninja: Molly Wentzel, Steve Lahr, Larvi...",SchmoopyWoopy,"""Old"" Arizona, USA"
8,,Editorial: Prevent infections and avoid confus...,The_Japan_News,"Tokyo, Japan"
9,,RT @scotgov: 📺 Watch live: First Minister Nico...,lauravennard1,"Edinburgh, Scotland"


In [None]:
# How often each free-text profile location occurs (blank strings included).
tweet_df['location'].value_counts()

                               26
New York, US & Delhi, India     3
canada                          3
Perth, Western Australia        2
Australia                       2
                               ..
Taipei City, Taiwan             1
Varrock                         1
Ampang                          1
Martian colony                  1
England.                        1
Name: location, Length: 67, dtype: int64

Data cleaning: tweet text

In [None]:
import re
def clean_tweets(text):
    """Strip Twitter-specific noise from a tweet's text.

    Removes, in order: the leading retweet marker ("RT @user:"), any remaining
    @mentions, and http/https URLs. Line breaks and runs of whitespace are then
    collapsed to a single space and the result is trimmed.

    Args:
        text: raw tweet text (str).

    Returns:
        The cleaned text (str); "" if nothing but noise was present.
    """
    # Raw strings keep "\w" / "\S" as regex escapes rather than (invalid)
    # Python string escapes.
    text = re.sub(r"RT @\w*:", "", text)
    text = re.sub(r"@\w*", "", text)
    # Consume the URL up to the next whitespace so characters such as "-", "_",
    # "?" or "=" do not leave fragments of the link behind.
    text = re.sub(r"https?://\S*", "", text)
    # Replace line breaks with a space instead of deleting them; deleting would
    # fuse the last word of one line with the first word of the next.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean every tweet's text in place, then preview the result.
cleaned_text = tweet_df['text'].apply(clean_tweets)
tweet_df['text'] = cleaned_text
tweet_df.head()

Unnamed: 0,geo,text,user,location
0,,"Molly Wentzel, Steve Lahr, Larvita McFarquhar...",esolomon2,"Naples, FL"
1,,Nothing is too difficult for God. Let's join w...,themerry_monk,England.
2,,#COVID19 | While travelling in #publictranspo...,dr_avtar_singh,"Ramnagar, Jammu And Kashmir"
3,,Airbnb CEO says travel never going back to the...,MusafirNamah,"New York, US & Delhi, India"
4,,Preliminary investigations conducted by the C...,chipcamel,Varrock


In [None]:
# Save the cleaned tweets to tweets.csv in the current working directory.
tweet_df.to_csv('tweets.csv')

## Data Transformation
Adding new features

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; `nlp` is used below to extract entities.
# NOTE(review): assumes the en_core_web_sm model is already installed in this
# environment — confirm on a fresh runtime.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
def format_entities(text):
    """Return formatted named entities found in ``text`` by the spaCy pipeline.

    Entities whose text starts with '#' (hashtags) are skipped entirely. The
    previous conditional expression inserted an empty string for each of them,
    leaving stray '' items in the list.

    Args:
        text: cleaned tweet text (str).

    Returns:
        A list of strings of the form '\\tText: <entity text>, Entity: <label>'.
    """
    return [
        '\tText: {}, Entity: {}'.format(ent.text, ent.label_)
        for ent in nlp(text).ents
        if not ent.text.startswith('#')
    ]


tweet_df['entities'] = tweet_df['text'].apply(format_entities)

In [None]:
# Preview the frame, which now includes the 'entities' column.
tweet_df.head()

Unnamed: 0,geo,text,user,location,entities
0,,"Molly Wentzel, Steve Lahr, Larvita McFarquhar...",esolomon2,"Naples, FL","[\tText: Molly Wentzel, Entity: PERSON, \tText..."
1,,Nothing is too difficult for God. Let's join w...,themerry_monk,England.,[]
2,,#COVID19 | While travelling in #publictranspo...,dr_avtar_singh,"Ramnagar, Jammu And Kashmir","[\tText: StaySafe, Entity: MONEY, ]"
3,,Airbnb CEO says travel never going back to the...,MusafirNamah,"New York, US & Delhi, India",[]
4,,Preliminary investigations conducted by the C...,chipcamel,Varrock,"[\tText: Chinese, Entity: NORP]"


In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the 'vader_lexicon' resource.
# NOTE(review): presumably required by SentimentIntensityAnalyzer — confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




True

VADER is a sentiment analyzer bundled with NLTK. For each text it returns negative (`neg`), neutral (`neu`), positive (`pos`) and `compound` polarity scores.

In [None]:
# Create one analyzer instance, reused for every tweet below.
sid = SentimentIntensityAnalyzer()

In [None]:
# Score every cleaned tweet; each cell holds the dict returned by polarity_scores.
polarity_by_tweet = tweet_df['text'].apply(sid.polarity_scores)
tweet_df['sentiment'] = polarity_by_tweet

tweet_df.head()

Unnamed: 0,geo,text,user,location,entities,sentiment
0,,"Molly Wentzel, Steve Lahr, Larvita McFarquhar...",esolomon2,"Naples, FL","[\tText: Molly Wentzel, Entity: PERSON, \tText...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,,Nothing is too difficult for God. Let's join w...,themerry_monk,England.,[],"{'neg': 0.0, 'neu': 0.597, 'pos': 0.403, 'comp..."
2,,#COVID19 | While travelling in #publictranspo...,dr_avtar_singh,"Ramnagar, Jammu And Kashmir","[\tText: StaySafe, Entity: MONEY, ]","{'neg': 0.0, 'neu': 0.874, 'pos': 0.126, 'comp..."
3,,Airbnb CEO says travel never going back to the...,MusafirNamah,"New York, US & Delhi, India",[],"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,,Preliminary investigations conducted by the C...,chipcamel,Varrock,"[\tText: Chinese, Entity: NORP]","{'neg': 0.111, 'neu': 0.758, 'pos': 0.131, 'co..."


In [None]:
# NOTE(review): version is unpinned; the recorded run installed googlemaps 4.4.2.
# Consider pinning it for reproducibility.
!pip install googlemaps

Collecting googlemaps
  Downloading https://files.pythonhosted.org/packages/00/fa/508909813a3f0ff969d341695ee0b90cb0e954b4b536f17f15cc19b5c304/googlemaps-4.4.2.tar.gz
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.4.2-cp36-none-any.whl size=37858 sha256=1825a59afea45f7342bebc8df9b5101973a6c44a1568107fb961d0d0dfe94e0c
  Stored in directory: /root/.cache/pip/wheels/f4/21/41/0c84572e21d52bb322f6c299f38ac7cd8ad6d4d6ce23dc3631
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.4.2


In [None]:
import googlemaps

In [None]:
# The Google Maps API key is stored under 'googleapikey' in the same [twitter]
# section as the Twitter credentials.
gmaps = googlemaps.Client(key=config.get('twitter', 'googleapikey'))

# Geocode the first tweet's profile location as a worked example.
geocode_result = gmaps.geocode(tweet_df['location'][0])

# Print the raw response, then the formatted address of the first match.
# NOTE(review): geocode_result[0] raises IndexError if the geocoder returns no
# match — confirm behaviour when the first row's location is blank.
print(geocode_result)
print(geocode_result[0]['formatted_address'])

# Last comma-separated piece of the formatted address.
print(geocode_result[0]['formatted_address'].split(",")[-1].strip())

[{'address_components': [{'long_name': 'Naples', 'short_name': 'Naples', 'types': ['locality', 'political']}, {'long_name': 'Collier County', 'short_name': 'Collier County', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'Florida', 'short_name': 'FL', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}], 'formatted_address': 'Naples, FL, USA', 'geometry': {'bounds': {'northeast': {'lat': 26.211242, 'lng': -81.766661}, 'southwest': {'lat': 26.078572, 'lng': -81.8224189}}, 'location': {'lat': 26.1420358, 'lng': -81.7948103}, 'location_type': 'APPROXIMATE', 'viewport': {'northeast': {'lat': 26.211242, 'lng': -81.766661}, 'southwest': {'lat': 26.078572, 'lng': -81.8224189}}}, 'place_id': 'ChIJrdfCc5vh2ogRqpos1xhTP2c', 'types': ['locality', 'political']}]
Naples, FL, USA
USA


In [None]:
def get_country(input):
  """Return the country that the Google geocoder resolves a location string to.

  The country is read from the first match's address component whose types
  include 'country'. The last comma-separated piece of ``formatted_address``
  is not reliable: some addresses end in a postal code or district.

  Args:
    input: free-text location, e.g. a Twitter profile location. The parameter
      name shadows the builtin but is kept so existing keyword callers still
      work.

  Returns:
    The ``long_name`` of the country component (e.g. "United States"). If the
    match has no country component, the last comma-separated piece of
    ``formatted_address`` is returned as a fallback. Returns "Error" when the
    lookup raises or yields no match.
  """
  try:
    best_match = gmaps.geocode(input)[0]
    for component in best_match.get('address_components', []):
      if 'country' in component.get('types', []):
        return component['long_name']
    # No explicit country component: fall back to the original heuristic.
    return best_match['formatted_address'].split(",")[-1].strip()
  except Exception:
    # Best-effort lookup: API/transport errors and empty results map to "Error".
    # Unlike a bare `except:`, this lets KeyboardInterrupt through, so a long
    # run of requests can still be cancelled.
    return "Error"

# Geocode each distinct, non-blank location once. Many tweets share the same
# profile location and every lookup is a billed network request, so repeating
# the call per row wastes quota. Blank locations get "" without a lookup.
country_by_location = {
    location: get_country(location)
    for location in tweet_df['location'].unique()
    if location.strip()
}
tweet_df['country'] = tweet_df['location'].map(lambda x: country_by_location.get(x, ""))
tweet_df.head()

Unnamed: 0,geo,text,user,location,entities,sentiment,country
0,,"Molly Wentzel, Steve Lahr, Larvita McFarquhar...",esolomon2,"Naples, FL","[\tText: Molly Wentzel, Entity: PERSON, \tText...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",USA
1,,Nothing is too difficult for God. Let's join w...,themerry_monk,England.,[],"{'neg': 0.0, 'neu': 0.597, 'pos': 0.403, 'comp...",UK
2,,#COVID19 | While travelling in #publictranspo...,dr_avtar_singh,"Ramnagar, Jammu And Kashmir","[\tText: StaySafe, Entity: MONEY, ]","{'neg': 0.0, 'neu': 0.874, 'pos': 0.126, 'comp...",Ramnagar 182122
3,,Airbnb CEO says travel never going back to the...,MusafirNamah,"New York, US & Delhi, India",[],"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",Error
4,,Preliminary investigations conducted by the C...,chipcamel,Varrock,"[\tText: Chinese, Entity: NORP]","{'neg': 0.111, 'neu': 0.758, 'pos': 0.131, 'co...",Error


In [None]:
# Tally tweets per resolved country; "" marks blank locations and "Error" marks
# failed lookups.
tweet_df['country'].value_counts()

USA                                             34
                                                26
Error                                           11
Canada                                           6
India                                            4
Australia                                        4
UK                                               4
Pakistan                                         2
Ramnagar 182122                                  1
Taiwan 406                                       1
Thailand                                         1
44050 Fawn Creek Township/Battalgazi/Malatya     1
Malaysia                                         1
Taiwan                                           1
Japan                                            1
Bahrain                                          1
Indonesia                                        1
Name: country, dtype: int64

In [None]:
# Write the enriched frame (entities, sentiment and country columns added) to
# tweets.csv, the same file name used by the earlier export.
tweet_df.to_csv('tweets.csv')