In [7]:
# pip install google-api-python-client
from googleapiclient import discovery
import json
import pandas as pd 
import time
import re

In [8]:
API_KEY = 'AIzaSyC1WqnJqZ7-qZH-zAwuG24Y1ZpVRXxttLE'

client = discovery.build(
    "commentanalyzer",
    version = "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

In [None]:
# rate the toxicity and profanity for a list of tweets --> returns dataframe of output values
def toxicity_score(list_of_tweets):
  k = 0 # number of analyzed tweets
  minimum_tweet_length = 10  
  length = len(list_of_tweets)
  # initialise output lists
  output = {'Toxicity':[], 'Severe_Toxicity':[], 'Identity_Attack':[], 'Insult':[], 'Profanity':[], 'Threat':[]}

  for i in range(length):
      # skip any tweets shorter than minimum_tweet_length
      if len(list_of_tweets[i]) < minimum_tweet_length:
           output['Toxicity'].append("-")
           output['Severe_Toxicity'].append("-")
           output['Identity_Attack'].append("-")
           output['Insult'].append("-")
           output['Profanity'].append("-")
           output['Threat'].append("-")
      else: ## ANALYZE:
          analyze_request = {'comment': {'text': list_of_tweets[i]},
                             'languages': ['en'],
                             'requestedAttributes': {'TOXICITY': {},
                                                     'SEVERE_TOXICITY': {},
                                                     'IDENTITY_ATTACK': {},
                                                     'INSULT': {},
                                                     'PROFANITY': {},
                                                     'THREAT': {}}}
          response = client.comments().analyze(body=analyze_request).execute()
          # print(json.dumps(response, indent=4))

          output['Toxicity'].append(response['attributeScores']['TOXICITY']['summaryScore']['value'])
          output['Severe_Toxicity'].append(response['attributeScores']['SEVERE_TOXICITY']['summaryScore']['value'])
          output['Identity_Attack'].append(response['attributeScores']['IDENTITY_ATTACK']['summaryScore']['value'])
          output['Insult'].append(response['attributeScores']['INSULT']['summaryScore']['value'])
          output['Profanity'].append(response['attributeScores']['PROFANITY']['summaryScore']['value'])
          output['Threat'].append(response['attributeScores']['THREAT']['summaryScore']['value'])

          time.sleep(1) # don't exceed 1 request per second 
          k = k + 1

      print(f'1. number of tweets looked at: {i}')
      print(f'2. number of tweets analyzed: {k} \n----------------------------')
  return pd.DataFrame(output)

In [None]:
##### Creating the Dataset of Elon Musk Tweets #####

##### Combining our two datasets #####

# dataframe of all musk tweets till 12.09.2022
tweet_df = pd.read_csv('musk_dataset/musk_big_dataset.csv')

# newer Musk Tweets from 13.09.2022 - 27.10.2022
new_tweet_df = pd.read_csv('musk_dataset/cleandata.csv')

# initialise lists
output = {'Date Created':[], 'Number of Likes':[], 'Source of Tweet':[], 'Tweets':[]}

for i in range(532):
    output['Date Created'].append(new_tweet_df['Date'][i])
    output['Number of Likes'].append(new_tweet_df['Likes'][i])
    output['Source of Tweet'].append("-")
    output['Tweets'].append(new_tweet_df['Tweets'][i])

new_data = pd.DataFrame(output)

data = pd.concat([new_data, tweet_df], 
                  keys = ['new_data', 'old_data'],
                  ignore_index = True)


##### Simplify our Dataset #####
# simplify the Date Created column in data
for i in range(data.shape[0]):
    data['Date Created'][i] = re.sub(r"\+00:00", "", data['Date Created'][i])

# delete the column 'Source of Tweet' 
del data['Source of Tweet']

In [12]:
##### Data Cleanup #####
def cleantwt (twt):
  emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

  twt = re.sub('RT', '', twt)               # remove 'RT' from tweets
  twt = re.sub('#[A-Za-z0-9]+', '', twt)    # remove the '#' from the tweets
  twt = re.sub('\\n', '', twt)              # remove the '\n' character
  twt = re.sub('https?:\/\/\S+', '', twt)   # remove the hyperlinks
  twt = re.sub('@[\S]*', '', twt)           # remove @mentions
  twt = re.sub('^[\s]+|[\s]+$', '', twt)    # remove leading and trailing whitespaces
  twt = re.sub(emoj, '', twt)               # remove emojis
  return twt

## Cleanup the data
Cleaned_Tweets = []
for i in range(data.shape[0]):
    cleaned_tweet = cleantwt(data['Tweets'][i])
    Cleaned_Tweets.append(cleaned_tweet)

# -> Dataframe with added column "Cleaned_Tweets"
data["Cleaned Tweets"] = Cleaned_Tweets

print(data['Cleaned Tweets'])

0                                                   thanks
1                                               Absolutely
2                                 Dear Twitter Advertisers
3           Meeting a lot of cool people at Twitter today!
4                  Entering Twitter HQ – let that sink in!
                               ...                        
17964                    That was a total non sequitur btw
17965    Great Voltaire quote, arguably better than Twa...
17966    I made the volume on the Model S  go to 11.  N...
17967    Went to Iceland on Sat to ride bumper cars on ...
17968    Please ignore prior tweets, as that was someon...
Name: Cleaned Tweets, Length: 17969, dtype: object


In [19]:
##### Rating the Toxicity of Elon Musk Tweets #####  
tweet_list = list(data.head(100)['Cleaned Tweets'])

### rate the toxicity ### 
scores = toxicity_score(tweet_list)
print(scores)

# write the scores into the dataframe
data = pd.concat([data.head(100), scores], axis=1, join='inner')
print(data)
data.to_csv('Elon_Musk_Twitter_Toxicity.csv', float_format="%.10f")

1. number of tweets looked at: 0
2. number of tweets analyzed: 0 
----------------------------
1. number of tweets looked at: 1
2. number of tweets analyzed: 1 
----------------------------
1. number of tweets looked at: 2
2. number of tweets analyzed: 2 
----------------------------
1. number of tweets looked at: 3
2. number of tweets analyzed: 3 
----------------------------
1. number of tweets looked at: 4
2. number of tweets analyzed: 4 
----------------------------
1. number of tweets looked at: 5
2. number of tweets analyzed: 5 
----------------------------
1. number of tweets looked at: 6
2. number of tweets analyzed: 6 
----------------------------
1. number of tweets looked at: 7
2. number of tweets analyzed: 7 
----------------------------
1. number of tweets looked at: 8
2. number of tweets analyzed: 8 
----------------------------
1. number of tweets looked at: 9
2. number of tweets analyzed: 9 
----------------------------
1. number of tweets looked at: 10
2. number of twe

In [22]:
### Visualization of the most Toxic / Profane Tweets ###
data2 = pd.read_csv('Elon_Musk_Twitter_Toxicity.csv')
print(data2['Cleaned Tweets'])


max_tox = 1
max_prof = 1
for i in range(100):
    if (data2['Toxicity'][i] == "-"):
        i = i + 1
    else: 
        if (data2['Toxicity'][i] > data2['Toxicity'][max_tox]):
              max_tox = i
        if (data2['Profanity'][i] > data2['Profanity'][max_prof]):
              max_prof = i

print(f'Maximum Toxicity had Tweet Nr.  {max_tox}: {data2["Cleaned Tweets"][max_tox]} \t score: {data2["Toxicity"][max_tox]}')
print(f'Maximum Profanity had Tweet Nr. {max_prof}: {data2["Cleaned Tweets"][max_prof]} \t score: {data2["Profanity"][max_prof]}')

0                                                thanks
1                                            Absolutely
2                              Dear Twitter Advertisers
3        Meeting a lot of cool people at Twitter today!
4               Entering Twitter HQ – let that sink in!
                            ...                        
95    Will require truly exceptional execution, but ...
96     I will not let you down, no matter what it takes
97                                              Awesome
98    We even did a Starlink video call on one airpl...
99                                   Vox Populi Vox Dei
Name: Cleaned Tweets, Length: 100, dtype: object
Maximum Toxicity had Tweet Nr.  36: While it’s true that Kasparov is almost as good at playing chess as my iPhone, he is otherwise an idiot 	 score: 0.8115627
Maximum Profanity had Tweet Nr. 64: un ass ailable logic! 	 score: 0.5150164
