## 1. Install the Twitter library for Python (tweepy)

In [None]:
# The crawler below relies on tweepy's StreamListener class, which only exists
# in the 3.x series (it was removed in tweepy 4.0), so pin the major version.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --user "tweepy<4"

## 2. Install the JSON library (simplejson)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --user simplejson

## 3. Install sentiment analysis library

In [None]:
# Sentiment libraries used further down: TextBlob and VADER (swap in whatever
# library you prefer). vaderSentiment is imported by a later cell, so it has to
# be installed here as well.
%pip install --user textblob
%pip install --user vaderSentiment
!python -m textblob.download_corpora lite

# Streaming tweets and performing some data analysis


### Setting up and running a streaming crawler

In [1]:
import tweepy
import simplejson as json
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
from textblob import TextBlob

In [None]:
# Twitter API credentials.
# Never paste keys into the notebook: they are read from environment variables
# and, when a variable is not set, asked for interactively with hidden input.
import os
from getpass import getpass


def _credential(name):
    """Return the credential stored in environment variable `name`.

    Falls back to a hidden interactive prompt when the variable is unset or empty.
    """
    return os.environ.get(name) or getpass(name + ": ")


consumer_key = _credential("TWITTER_CONSUMER_KEY")
consumer_secret = _credential("TWITTER_CONSUMER_SECRET")
access_token = _credential("TWITTER_ACCESS_TOKEN")
access_secret = _credential("TWITTER_ACCESS_SECRET")

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)


class MyListener(StreamListener):
    """Stream listener that appends tweets carrying place information to a file.

    Every incoming message is decoded once; messages whose ``place`` attribute
    is set are appended verbatim (one JSON document per line) to
    ``output_path``. The stream is stopped once ``max_tweets`` tweets have been
    stored.

    Parameters
    ----------
    api : optional
        Forwarded unchanged to ``StreamListener.__init__``.
    max_tweets : int, default 200
        Number of stored tweets after which ``on_data`` asks the stream to stop.
        Keep it small while testing the program, then increase it.
    output_path : str, default 'crawled_tweets.json'
        File the raw JSON of every kept tweet is appended to.
    """

    def __init__(self, api=None, max_tweets=200, output_path='crawled_tweets.json'):
        # Initialise the real parent class (the previous call skipped
        # StreamListener.__init__ and silently dropped `api`).
        super().__init__(api)
        self.num_tweets = 0
        self.max_tweets = max_tweets
        self.output_path = output_path

    def on_data(self, data):
        """Store `data` if it describes a tweet with place information.

        Returns True to keep the stream open and False once ``max_tweets``
        tweets have been stored. A message that cannot be processed is
        reported and skipped without closing the stream.
        """
        try:
            tweet = json.loads(data)
            # Keep only tweets with information about the place/country.
            # To select one specific country, additionally test
            # tweet['place']['country'] == target.
            if tweet.get('place'):
                # newline='' writes the payload untouched; utf-8 matches the
                # encoding used when the file is read back for the CSV export.
                with open(self.output_path, 'a', newline='', encoding='utf-8') as f:
                    # Guarantee exactly one JSON document per line.
                    f.write(data if data.endswith('\n') else data + '\n')
                self.num_tweets += 1
        except Exception as e:
            # Exception (not BaseException) so that interrupting the kernel
            # still stops the crawler.
            print("Error on_data: %s" % str(e))
            return True
        return self.num_tweets < self.max_tweets

    def on_error(self, status):
        """Report the error status passed by the stream and stop streaming."""
        # `status` is the status code itself; it has no `.place` attribute.
        print('Error :', status)
        return False
    
    
# Connect the listener to the streaming API.
listener = MyListener()
twitter_stream = Stream(auth, listener)

# Keywords to follow -- add your own keywords and other filters here
# (for example track=['Trump']). The call returns once the stream is closed.
keywords = ["lgbt", "LGBT", "LGBTQ", "lgbtq", "lgbtq+", "LGBTQ+"]
twitter_stream.filter(track=keywords)

print('_______ End _______')

### Store the JSON data in a CSV file for analysis

In [16]:
import simplejson as json

# Input: one JSON document per line. Output: one comma-separated row per tweet.
JSON_PATH = 'data/LGBTQ_no_country.json'
CSV_PATH = 'data/LGBTQ_no_country.csv'

# Titles of the columns (features) stored in the CSV file.
# To add the country, extend the header and the row with data['place']['country'].
CSV_HEADER = 'id, created_at, followers, friends, favorite_count, verified, description, text\n'


def _flatten(value):
    """Return free text as a single CSV cell: no line breaks and no commas.

    Empty or missing values become the empty string.
    """
    if not value:
        return ''
    return str(value).replace('\r', '').replace('\n', '').replace(',', '')


def tweet_to_csv_line(data):
    """Build the CSV row (newline-terminated) for one decoded tweet dict."""
    user = data['user']
    fields = [
        data['id'],
        data['created_at'],
        user['followers_count'],
        user['friends_count'],
        data['favorite_count'],
        user['verified'],
        _flatten(user.get('description')),
        _flatten(data['text']),
    ]
    return ','.join(str(field) for field in fields) + '\n'


skipped = 0
# Open the input first so that a missing JSON file does not truncate an
# existing CSV file.
with open(JSON_PATH, 'r', encoding='utf-8') as json_file, \
        open(CSV_PATH, 'w', encoding='utf-8') as csv_file:
    csv_file.write(CSV_HEADER)
    for raw_line in json_file:
        if not raw_line.strip():
            continue  # blank line between records
        data = json.loads(raw_line)
        if 'text' not in data or 'user' not in data:
            skipped += 1  # not a tweet (the stream can also deliver other messages)
            continue
        csv_file.write(tweet_to_csv_line(data))

if skipped:
    print('Skipped {} record(s) without tweet text/user'.format(skipped))

### Load the previous CSV into pandas

In [17]:
import pandas as pd

# Load the CSV file produced above, using the tweet id (first column) as index.
# The separator is a regular expression that also swallows the blanks around
# each comma (the header row is written with ", "); a regex separator needs
# the python parsing engine.
tweets = pd.read_csv(
    'data/LGBTQ_no_country.csv',
    index_col=0,
    encoding='utf-8',
    sep=r'\s*,\s*',
    engine='python',
)
tweets.head(20)

Unnamed: 0_level_0,created_at,followers,friends,favorite_count,verified,description,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1218938315472195584,Sun Jan 19 16:48:35 +0000 2020,957.0,1561.0,0.0,False,Lesbian | Insignificant Neoconservative Ideolo...,Socialism benefits no one
1218938318135746562,Sun Jan 19 16:48:36 +0000 2020,14.0,131.0,0.0,False,🐶,@atahasnain53 Sir would you be willing to comm...
1218938324058103808,Sun Jan 19 16:48:37 +0000 2020,190.0,146.0,0.0,False,Just some bisexual idiot on the Internet. Slut...,@satiricole @AllianceLGB A transphobe was kick...
1218938329124810753,Sun Jan 19 16:48:39 +0000 2020,368.0,1253.0,0.0,False,,RT @OwenJones84: The 'LGB Alliance' is a hate ...
1218938330370519040,Sun Jan 19 16:48:39 +0000 2020,3829.0,4841.0,0.0,False,#Spoonie #NoStigma #NoH8 #LoveWins #RESIST #W...,RT @Jersey_Craig: The US Hasn't Only Stopped D...
1218938346665336834,Sun Jan 19 16:48:43 +0000 2020,88.0,246.0,0.0,False,,RT @LozzaFox: Nothing more #stunningandbrave t...
1218938349265850369,Sun Jan 19 16:48:43 +0000 2020,718.0,1405.0,0.0,False,,RT @BrandonStraka: #WalkAway is teaming up wit...
1218938351337660421,Sun Jan 19 16:48:44 +0000 2020,296.0,403.0,0.0,False,Come Back Soon Woojin 🐻💙,RT @kpophappenings_: when that kpop boy played...
1218938356140331009,Sun Jan 19 16:48:45 +0000 2020,156.0,119.0,0.0,False,23. Professional nerd. Amateur musician. Quest...,idk who he is but we stan hard 😤😤😤😤😤😤😤😤😤😤😤😤😤😤
1218938357918715906,Sun Jan 19 16:48:45 +0000 2020,403.0,349.0,0.0,False,Je débarque dans ta vie en claquettes secondai...,@CharlotteThuil3 Mes mains sont lgbtq


### Analysing the polarity of the tweets

In [None]:
# Average TextBlob polarity (between -1 and 1) and subjectivity over all tweets.
# Rows without text (NaN after loading the CSV) are left out, because TextBlob
# only accepts strings.
texts = tweets['text'].dropna().astype(str)
sentiments = [TextBlob(sentence).sentiment for sentence in texts]

count = len(sentiments)
print("Number of tweets analysed: ", count)

if count:
    avg_polar = sum(s.polarity for s in sentiments) / count
    avg_subj = sum(s.subjectivity for s in sentiments) / count
    print("Average polarity: ", avg_polar)
    print("Average subjectivity: ", avg_subj)
else:
    # Nothing to average: avoid a ZeroDivisionError on an empty data set.
    avg_polar = avg_subj = float('nan')
    print("No tweets with text to analyse.")

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# A single analyser instance, shared by every call of the helper below.
analyser = SentimentIntensityAnalyzer()


def sentiment_analyzer_scores(sentence):
    """Score `sentence` with the shared VADER analyser, echo it and return it.

    The sentence is printed left-aligned, padded with '-' to a width of 40
    characters, followed by the score. The value returned is exactly what
    ``analyser.polarity_scores`` produced.
    """
    result = analyser.polarity_scores(sentence)
    padded_sentence = format(sentence, '-<40')
    print(padded_sentence + ' ' + str(result))
    return result

In [None]:
# Detect the language of every tweet and compute its VADER scores, then write
# the results into new columns of the existing data frame.
#
# The values are collected in lists and assigned by position, so the cell also
# works when the index (tweet id) contains duplicates; missing text is treated
# as an empty string.
texts = tweets['text'].fillna('').astype(str)

languages = []
scores = []
undetected = 0
for tweet in texts:
    # Language detection is best effort: TextBlob.detect_language() calls an
    # online service, rejects very short texts and has been deprecated since
    # TextBlob 0.16 (it is absent from later releases). A failure must not
    # abort the sentiment scoring, so record None and report the total once.
    try:
        languages.append(TextBlob(tweet).detect_language())
    except Exception:
        languages.append(None)
        undetected += 1

    scores.append(sentiment_analyzer_scores(tweet))

tweets['language'] = languages
for key in ('neg', 'neu', 'pos', 'compound'):
    tweets['polarity_' + key] = [score[key] for score in scores]

if undetected:
    print('Language could not be detected for {} of {} tweets'.format(undetected, len(texts)))

In [None]:
# Preview the first rows, including the language and polarity_* columns added above.
tweets.head()

### Generating a wordcloud

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --user wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Concatenate the text of all tweets. Rows without text (NaN) are dropped,
# because str.join only accepts strings.
text = ','.join(tweets['text'].dropna().astype(str))

# Stopwords: the library defaults plus URL fragments, the retweet marker and
# the search keywords themselves (they would otherwise dominate the cloud).
stopwords = set(STOPWORDS)
stopwords.update(["http", "https", "co", "lgbt", "LGBT", "LGBTQ", "lgbtq", "lgbtq+", "LGBTQ+", "RT"])

# Create the wordcloud object
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Display the generated image with matplotlib's explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 15))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

### Your own analysis

In [None]:
# Look at averages per country/region
# Wordcloud -> keywords that are very polarised
# Check if description contains journalist!

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install plotly
%pip install pycountry-convert
# The translation cell below imports `googletrans`; the unrelated `translate`
# package does not provide that module.
%pip install googletrans

In [None]:
import pycountry_convert as pc 
from googletrans import Translator

translate_urls = ["translate.google.com", "translate.google.co.kr",
                      "translate.google.at", "translate.google.de",
                      "translate.google.ru", "translate.google.ch",
                      "translate.google.fr", "translate.google.es"]

# This cell needs the country of every tweet. The CSV export above does not
# write that column unless the commented-out `country` lines are enabled.
if 'country' not in tweets.columns:
    raise KeyError("tweets has no 'country' column - export data['place']['country'] to the CSV first")

# Translate every distinct country name only once (one request per name instead
# of one per tweet) and map the result back onto the rows. Rows without a
# country keep a missing value in 'trans_country'.
translator = Translator(service_urls=translate_urls)
country_translations = {
    orig_country: translator.translate(orig_country).text
    for orig_country in tweets['country'].dropna().unique()
}
tweets['trans_country'] = tweets['country'].map(country_translations)

In [None]:
# Add the ISO 3166-1 alpha-3 code of every tweet's country in 'iso_alpha_3'.
# Names that cannot be resolved get an empty string.

# Prefer the translated names when the translation cell has been run.
# NOTE(review): presumably pycountry_convert only resolves English names - confirm.
source_column = 'trans_country' if 'trans_country' in tweets.columns else 'country'


def _country_to_alpha3(name):
    """Return the alpha-3 code for `name`, or '' when it cannot be resolved."""
    try:
        return pc.country_name_to_country_alpha3(name, cn_name_format="default")
    except KeyError as e:
        print('KeyError - reason {}'.format(str(e)))
    except IndexError as e:
        print('I got an IndexError - reason {}'.format(str(e)))
    except TypeError as e:
        print('I got an TypeError - reason {}'.format(str(e)))
    return ''


# Resolve each distinct name once, so a problem is reported once per name and
# not once per tweet. Values are assigned by position, which also works when
# the index (tweet id) contains duplicates.
alpha3_by_name = {}
iso_codes = []
for single_country in tweets[source_column]:
    # All non-string values (e.g. NaN for a missing country) share one cache entry.
    cache_key = single_country if isinstance(single_country, str) else None
    if cache_key not in alpha3_by_name:
        alpha3_by_name[cache_key] = _country_to_alpha3(single_country)
    iso_codes.append(alpha3_by_name[cache_key])

tweets['iso_alpha_3'] = iso_codes

In [None]:
# Preview the first rows, including the iso_alpha_3 column added above.
tweets.head()

In [None]:
import plotly.express as px

# World map of the mean VADER compound polarity per country.
# A choropleth colours one region per row, so the tweets are first aggregated
# to a single row per ISO code; tweets without a resolved code are left out.
has_iso_code = tweets['iso_alpha_3'].fillna('') != ''
country_polarity = (
    tweets[has_iso_code]
    .groupby('iso_alpha_3', as_index=False)
    .agg({'country': 'first', 'polarity_compound': 'mean'})
)

fig = px.choropleth(country_polarity,
                    locations="iso_alpha_3",
                    color="polarity_compound", # mean compound polarity of the country's tweets
                    hover_name="country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()