In [1]:
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv("Dataset/Corona_NLP_train.csv", encoding='ISO-8859-1')
print(df.head())


   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  


In [3]:
df.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [4]:
df[["OriginalTweet", "Sentiment"]]

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral


In [5]:
df.drop(['UserName', 'ScreenName','Location', 'TweetAt'], axis=1, inplace=True)

In [6]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral


In [7]:
df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [8]:
import re
import string

def clean_tweet(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove URLs (http, https, www)
    text = re.sub(r"http\S+|www\S+", "", text)
    
    # 3. Remove mentions (@username)
    text = re.sub(r"@\w+", "", text)
    
    # 4. Remove hashtags (keep the word if you want)
    text = re.sub(r"#\w+", "", text)
    
    # 5. Remove emojis and other non-alphanumeric symbols
    text = re.sub(r"[^\w\s" + string.punctuation + "]", "", text)
    
    # 6. Remove unnecessary punctuation (keep only . , ! ?)
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    
    # 7. Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    return text


In [9]:
df["OriginalTweet"] = df["OriginalTweet"].apply(clean_tweet)
df["OriginalTweet"] = df["OriginalTweet"].apply(lambda x: x.split())
label_mapping = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}
df["Sentiment"] = df["Sentiment"].map(label_mapping)

df


Unnamed: 0,OriginalTweet,Sentiment
0,"[and, and]",2
1,"[advice, talk, to, your, neighbours, family, t...",3
2,"[coronavirus, australia, woolworths, to, give,...",3
3,"[my, food, stock, is, not, the, only, one, whi...",3
4,"[me, ready, to, go, at, supermarket, during, t...",0
...,...,...
41152,"[airline, pilots, offering, to, stock, superma...",2
41153,"[response, to, complaint, not, provided, citin...",0
41154,"[you, know, its, getting, tough, when, is, rat...",3
41155,"[is, it, wrong, that, the, smell, of, hand, sa...",2


In [10]:
all_words = [word for tokens in df["OriginalTweet"] for word in tokens]
vocab = set(all_words)
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 36864


In [11]:
from collections import Counter
import random

# all_words = [word for tokens in df["Origi"] for word in tokens]
word_freq = Counter(all_words)

# minimum frequency value
min_freq = min(word_freq.values())

# words having this frequency
least_freq_words = [word for word, freq in word_freq.items() if freq == min_freq]
random.shuffle(least_freq_words)

print("Minimum Frequency:", min_freq)
print("Words with Minimum Frequency:", least_freq_words)


Minimum Frequency: 1


In [12]:
def replace_least_freq_words(word_list, least_freq_words, max_replacements=500):
    replacements = 0
    result = []
    for word in word_list:
        if word in least_freq_words and replacements < max_replacements:
            result.append('UNK')
            replacements += 1
        else:
            result.append(word)
    return result

# Assuming least_freq_words is defined somewhere earlier, e.g.:
# least_freq_words = set([...])

# Now apply the function to the DataFrame column, passing least_freq_words as argument:
df['OriginalTweet'] = df['OriginalTweet'].apply(lambda x: replace_least_freq_words(x, least_freq_words))



In [13]:
print("Words with Minimum Frequency:", least_freq_words)



In [14]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,"[and, and]",2
1,"[advice, talk, to, your, neighbours, family, t...",3
2,"[coronavirus, australia, woolworths, to, give,...",3
3,"[my, food, stock, is, not, the, only, one, whi...",3
4,"[me, ready, to, go, at, supermarket, during, t...",0
...,...,...
41152,"[airline, pilots, offering, to, stock, superma...",2
41153,"[response, to, complaint, not, provided, citin...",0
41154,"[you, know, its, getting, tough, when, is, rat...",3
41155,"[is, it, wrong, that, the, smell, of, hand, sa...",2


In [15]:
all_w = [word for tokens in df["OriginalTweet"] for word in tokens]
vo = set(all_w)
vocab_sz = len(vo)
print("Vocabulary Size:", vocab_sz)

from collections import Counter

# all_words = [word for tokens in df["Origi"] for word in tokens]
word_f = Counter(all_words)

# minimum frequency value
min_f = min(word_freq.values())

# words having this frequency
least = [word for word, freq in word_freq.items() if freq == min_freq]

print("Minimum Frequency:", min_f)
print("Words with Minimum Frequency:", least)


Vocabulary Size: 17939
Minimum Frequency: 1


In [16]:
print(len(least))

18926


In [17]:
all_words = [word for tokens in df["OriginalTweet"] for word in tokens]
vocab = set(all_words)
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 17939
