In [74]:
import snscrape.modules.twitter as sntwitter
from sklearn.model_selection import train_test_split
import pandas as pd
import re

# Machine Learning Project Phase 1
***
### Submitted by: 
Saad Sher Alam (24100161)

### Readme: 
- Scraping Libarary: [snscrape](https://github.com/JustAnotherArchivist/snscrape)

## Part 1

In [4]:
# Scraping tweets 

# list of tweets
tweet_data = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:SpursOfficial').get_items()):
    if i>999:
        break
    tweet_data.append([tweet.rawContent])

In [7]:
from csv import writer 

# Adding column name 'tweets' to csv 
with open('SpursOfficial_part1.csv', 'a') as file: 
    wr = writer(file)
    wr.writerows([['tweets']]) 
    file.close()

# Writing all tweets to csv 
with open('SpursOfficial_part1.csv', 'a') as file: 
    wr = writer(file)
    wr.writerows(tweet_data)
    file.close() 


In [96]:
# Checking csv
df = pd.read_csv("./SpursOfficial_part1.csv")
df.head(10)

Unnamed: 0,tweets
0,📆 The U18s have been drawn at home against QPR...
1,🦸‍♂️ https://t.co/uKl0v69QnP
2,The Club has successfully delivered its first ...
3,Our Black Friday sale has begun... 🙌\n\nGet up...
4,"It's here 🤩\n\nBroxbourne and Beyond, the stor..."
5,Wishing a very happy birthday to Michael Dawso...
6,Club Ambassador @LedleyKing visited local hosp...
7,Pierre is ready 👊 https://t.co/4EFnJY7IS1
8,🦁 🦁\n\nWorld Cup mode 🔛 https://t.co/n7mUjjoZf5
9,📸 @Caspar_Lee x @SpursStadium https://t.co/TZc...


### Part 2

<b>Cleaning:</b> <br>
- Convert tweets to lowercase 
- Remove punctuation 
- Remove numbers 
- Remove single characters
- Remove \n
- Remove stop words (from Assignment 2)
- Remove URLs

In [97]:
# Getting stop words 
stop_words = {} 
f = open('./stop_words.txt', 'r')
for x in f:
    stop_words[(x.split("\n")[0])] = 1

In [99]:
def remove_stop_words(review):
    cleaned_review = []
    words = review.split(" ")
    for word in words: 
        if word not in stop_words.keys(): 
            cleaned_review.append(word)
    return cleaned_review

def clean_tweets(dataset): 
    cleaned_tweets = [] 
    tweets = dataset.values
    for tweet in tweets: 
        tweet = tweet[0]
        # Converting to lowercase 
        clean = tweet.lower() 

        # Remove punctuation, numbers, and single characters 
        clean = re.sub(r'[^\w\s\@\#]', '', clean)
        clean = re.sub(r'\d', '', clean)
        clean = re.sub(r'\b[A-Z]\b', '', clean)

        # Removing /n 
        clean = clean.replace("\n", " ")

        # Removing stop words
        word_list = remove_stop_words(clean) 
        clean = " ".join(word_list)

        # Remove URLs (from assignment 0)
        clean = re.sub(r"https\S+", "", clean)
        
        cleaned_tweets.append([clean])
    return cleaned_tweets

cleaned_tweets = clean_tweets(df)

In [100]:
# Adding column name 'tweets' to csv 
with open('SpursOfficial_part2.csv', 'a') as file: 
    wr = writer(file)
    wr.writerows([['tweets']]) 
    file.close()

# Writing all cleaned tweets to csv 
with open('SpursOfficial_part2.csv', 'a') as file: 
    wr = writer(file)
    wr.writerows(cleaned_tweets)
    file.close() 


In [102]:
# Loading cleaned df
cleaned_df = pd.read_csv('SpursOfficial_part2.csv')
cleaned_df.head(10)

Unnamed: 0,tweets
0,us drawn home qpr first fa youth cup tie seas...
1,club successfully delivered first ever west en...
2,black friday sale begun get online spurs shop
3,broxbourne beyond story @spurswomen availabl...
4,wishing happy birthday michael dawson
5,club ambassador @ledleyking visited local hosp...
6,pierre ready
7,world cup mode
8,@caspar_lee x @spursstadium
9,latest premier league broadcast selection pro...


### Part 3


In [103]:
# Splitting Data 
train, test = train_test_split(cleaned_df, test_size=0.2, random_state=42)
train.head()

Unnamed: 0,tweets
1763,together
1117,fought hard visitors take points
1896,proud happy done past year part two catchup ...
670,working hard frankfurt
1681,north london derby


In [108]:
def construct_vocabulary(dataframe):
    vocabulary = [] 
    all_tweets = dataframe.values
    for tweet in all_tweets: 
        tweet = str(tweet[0])
        tweet_words = tweet.split()
        for tweet_word in tweet_words: 
            if tweet_word not in vocabulary: 
                vocabulary.append(tweet_word)
    return vocabulary

def get_bag_of_words(tweet, vocabulary): 
    bag_of_words = {} 
    for word in vocabulary: 
        bag_of_words[word] = 0 
    
    tweet_words = tweet.split() 
    for word in tweet_words: 
        if word in bag_of_words.keys(): 
            bag_of_words[word] += 1 

    # Laplace smoothing  
    # Add 1 to count of training vocabulary 
    # alpha = 1
    for key in bag_of_words.keys(): 
        bag_of_words[key] += 1

    return bag_of_words


In [109]:
vocab  = construct_vocabulary(train)
print("Ambient Dimensionality:",  len(vocab))
print(vocab)

Ambient Dimensionality: 1753
['together', 'fought', 'hard', 'visitors', 'take', 'points', 'proud', 'happy', 'done', 'past', 'year', 'part', 'two', 'catchup', 'fabio', 'paratici', 'working', 'frankfurt', 'north', 'london', 'derby', 'harry', 'kane', 'misses', 'spot', '#totsge', 'williams', 'makes', '#totmar', 'minutes', 'play', 'still', 'battling', 'opener', 'keep', 'fighting', 'lads', '#sgetot', 'three', 'onto', 'next', 'one', 'come', 'spurs', 'forced', 'change', '#toteve', 'underway', 'heres', 'lineup', 'todays', 'u', 'match', '@fulhamfc', 'ko', 'uk', 'good', 'luck', 'doubt', 'wishing', 'best', 'recovery', 'sonny', 'th', 'birthday', '@iamjermaindefoe', 'appearances', 'goals', 'big', 'secured', 'road', 'highlights', 'brighton', 'team', 'news', 'running', 'floats', 'beautiful', 'ball', 'box', 'guides', 'keeper', '#bhatot', '#ucl', 'matchday', 'olympique', 'de', 'marseille', '@spursstadium', 'uefa', 'champions', 'league', 'pm', '#', 'pulls', 'back', 'black', 'friday', 'sale', 'begun', 'ge

In [110]:
# Bag of words for training data 
train_tweets = train.values
train_bag_of_words = [] 
for tweet in train_tweets: 
    tweet = str(tweet[0])
    bag_of_words = get_bag_of_words(tweet, vocab)
    train_bag_of_words.append(bag_of_words)

for i in range(10): 
    print(train_tweets[i])
    print(list(train_bag_of_words[i].values()))


['together  ']
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [111]:
# Bag of words for test data 
test_tweets = test.values
test_bag_of_words = [] 
for tweet in test_tweets: 
    tweet = str(tweet[0])
    bag_of_words = get_bag_of_words(tweet, vocab)
    test_bag_of_words.append(bag_of_words)

for i in range(10): 
    print(test_tweets[i])
    print(list(test_bag_of_words[i].values()))


['well never get tired watching   #totliv ']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 