In [14]:
from dotenv import dotenv_values
import os
import requests
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

In [6]:
config = dotenv_values("C:\\Users\\justi\\OneDrive\\Documents\\MSc Data Analytics\\Semester 1 - CA 2\\.env")

In [9]:
# Set the search query and the number of tweets to retrieve
query = "potatoes"
count = 100

# Set the API endpoint and the Bearer Token for authenticating the request
endpoint = "https://api.twitter.com/2/tweets/search/recent"
bearer_token = config["BEARER_TOKEN"]

# Set the headers for the request
headers = {
    "Authorization": f"Bearer {bearer_token}",
    "User-Agent": "v2RecentSearchPython",
}

# Set the parameters for the request
params = {
    "query": query,
    "max_results": count,
    "expansions": "author_id,referenced_tweets.id",
    "user.fields": "username",
    "tweet.fields": "public_metrics"
}

# Make the request to the API endpoint
response = requests.get(endpoint, headers=headers, params=params)

tweet_dicts = []

# Check the status code of the response
if response.status_code == 200:
    # If the request was successful, parse the JSON response
    data = response.json()

    # Print the tweets
    for tweet in data["data"]:
        tweet_dict = {
            "id": tweet["id"],
            "like_count": tweet["public_metrics"]["like_count"],
            "quote_count": tweet["public_metrics"]["quote_count"],
            "retweet_count": tweet["public_metrics"]["retweet_count"],
            "text": tweet["text"]
        }
        tweet_dicts.append(tweet_dict)
else:
    # If the request was not successful, print the error message
    print(f"Error: {response.status_code}")

In [12]:
tweets_df = pd.DataFrame(tweet_dicts)

In [13]:
tweets_df.head()

Unnamed: 0,id,like_count,quote_count,retweet_count,text
0,1608843281768808448,0,0,0,Today's lunch specials at the Catawba Deli inc...
1,1608843248776413185,0,0,3135,RT @Cobratate: I got beans greens potatoes tom...
2,1608843235346243589,0,0,29,RT @30vocamusic: becoming potatoes - neru http...
3,1608843229591633920,0,0,0,"Now greens, beans, potatoes, and everything el..."
4,1608843130538962945,0,0,503,RT @catturd2: Been cooking this all day - my f...


In [15]:
# I've save the dataframe as a csv file so I can reproduce my analysis with the same tweets
#tweets_df.to_csv('tweets_df.csv')

In [None]:
# Before doing the sentiment analysis, I need to process the text to make a clearer signal

In [16]:
# First of all I'll make all words lower case

tweets_df['text'] = tweets_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
tweets_df['text'].head()

0    today's lunch specials at the catawba deli inc...
1    rt @cobratate: i got beans greens potatoes tom...
2    rt @30vocamusic: becoming potatoes - neru http...
3    now greens, beans, potatoes, and everything el...
4    rt @catturd2: been cooking this all day - my f...
Name: text, dtype: object

In [17]:
# Next I'll remove twitter handles seen as these have nothing to do with the sentiment of the tweet

tweets_df['text'] = tweets_df['text'].str.replace('@\w+', '')
tweets_df.head()

  tweets_df['text'] = tweets_df['text'].str.replace('@\w+', '')


Unnamed: 0,id,like_count,quote_count,retweet_count,text
0,1608843281768808448,0,0,0,today's lunch specials at the catawba deli inc...
1,1608843248776413185,0,0,3135,rt : i got beans greens potatoes tomatoes lamb...
2,1608843235346243589,0,0,29,rt : becoming potatoes - neru https://t.co/3gy...
3,1608843229591633920,0,0,0,"now greens, beans, potatoes, and everything el..."
4,1608843130538962945,0,0,503,rt : been cooking this all day - my famous cat...


In [18]:
# Next I'll remove punctuation

tweets_df['text'] = tweets_df['text'].str.replace('[^\w\s]', '')
tweets_df['text'].head()

  tweets_df['text'] = tweets_df['text'].str.replace('[^\w\s]', '')


In [20]:
# Next I'll remove stop words, ie commonly occurring words

stop = stopwords.words('english')

tweets_df['text'] = tweets_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
tweets_df['text'].head()

In [22]:
# I'll next look at frequently occurring words, some of which might not be relevant for sentiment analysis
# I'll remove rt, ie retweet because it's not relevant for sentiment

freq = pd.Series(' '.join(tweets_df['text']).split()).value_counts()[:10]
print(freq)

tweets_df['text'] = tweets_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq.index[1]))

tweets_df['text'].head()

potatoes    95
rt          43
name        22
like        21
tomatoes    20
sweet       20
day         20
chicken     17
think       13
dogs        12
dtype: int64


0    todays lunch specials catawba deli include 2 c...
1    got beans greens potatoes tomatoes lambs rams ...
2            becoming potatoes neru httpstco3gyi1s8lt5
3    greens beans potatoes everything else saybut s...
4    cooking day famous cattle trail stew basically...
Name: text, dtype: object

In [32]:
# Next I'll consider infrequent words
# I only have a 100 tweets so I won't remove infrequent words just because they're infrequent
# However, I will remove the words which start with https since these will have nothing to do with sentiment

freq = pd.Series(' '.join(tweets_df['text']).split()).value_counts()[-10:]
print(freq)

words_remove = list(freq.index[freq.index.str.startswith('https')])

tweets_df['text'] = tweets_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in words_remove))

tweets_df['text'].head()

match                 1
perfect               1
say                   1
mind                  1
boggles               1
httpstco7ok9poa3qs    1
httpstcouuafagzcyg    1
httpstcolfbvjgquin    1
love                  1
individually          1
dtype: int64


0    todays lunch specials catawba deli include 2 c...
1    got beans greens potatoes tomatoes lambs rams ...
2            becoming potatoes neru httpstco3gyi1s8lt5
3    greens beans potatoes everything else saybut s...
4    cooking day famous cattle trail stew basically...
Name: text, dtype: object

In [34]:
# Next I'll correct spelling mistakes

tweets_df['text'] = tweets_df['text'].apply(lambda x: str(TextBlob(x).correct()))

tweets_df['text'].head()

0    today lunch special catawba delhi include 2 ch...
1    got beans green potatoes potatoes lambs rams h...
2            becoming potatoes peru httpstco3gyi1s8lt5
3    green beans potatoes everything else salut soo...
4    cooking day famous cattle trail stew basically...
Name: text, dtype: object

In [35]:
# Finally I'll lematize the words

tweets_df['text'] = tweets_df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

tweets_df['text'].head()

0    today lunch special catawba delhi include 2 ch...
1    got bean green potato potato lamb ram hog dog ...
2              becoming potato peru httpstco3gyi1s8lt5
3    green bean potato everything else salut soon m...
4    cooking day famous cattle trail stew basically...
Name: text, dtype: object

In [36]:
# Now I'll produce the sentiment analysis

tweets_df['sentiment'] = tweets_df['text'].apply(lambda x: TextBlob(x).sentiment[0])

tweets_df[['text', 'sentiment']].head()

Unnamed: 0,text,sentiment
0,today lunch special catawba delhi include 2 ch...,-0.060714
1,got bean green potato potato lamb ram hog dog ...,-0.4
2,becoming potato peru httpstco3gyi1s8lt5,0.45
3,green bean potato everything else salut soon m...,-0.25625
4,cooking day famous cattle trail stew basically...,0.5


In [37]:
tweets_df['sentiment'].mean()

0.08580643939393935