In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Read the tweet CSV into `df` and print its first five rows as plain text.
# NOTE(review): tweets shown further down contain sequences such as "itÂs", so
# ISO-8859-1 may not be the file's true encoding — confirm against the data source.
df = pd.read_csv("Dataset/Corona_NLP_train.csv", encoding='ISO-8859-1')
print(df.head())


   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [5]:
# Select just the tweet text and sentiment columns for display; the result is
# not assigned, so `df` itself is left unchanged by this cell.
df[["OriginalTweet", "Sentiment"]]

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral


In [6]:
# Discard the four metadata columns in place, leaving the remaining columns untouched.
df.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'], inplace=True)

In [7]:
# Display the frame as it stands after the column drop.
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral


In [8]:
# Number of missing values in each remaining column.
df.isna().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [9]:
import re
import string

def clean_tweet(text):
    """Normalise a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. lowercase the text;
      2. delete URLs (anything starting with ``http`` or ``www`` up to the next
         whitespace);
      3. delete ``@mentions`` entirely;
      4. delete ``#hashtags`` entirely (the tagged word is removed too);
      5. delete every character that is not an ASCII letter or whitespace, which
         removes digits, all punctuation, emojis and accented letters;
      6. collapse runs of whitespace to one space and trim both ends.

    Parameters:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing value, or ``None``) is treated as
            having no text.

    Returns:
        str: The cleaned tweet; ``""`` when nothing is left or the input was
        not a string.
    """
    # Missing values and other non-strings have no `.lower()`; return empty
    # text instead of raising in the middle of a DataFrame `.apply`.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # URLs, mentions and hashtags must go before the letters-only filter,
    # otherwise their letters would survive as ordinary words.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # A single letters-and-whitespace filter covers emojis, symbols, digits and
    # punctuation alike, so no separate emoji/punctuation pass is needed.
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text


In [10]:
# Clean each tweet, then split the cleaned text on whitespace into a token list.
df["OriginalTweet"] = df["OriginalTweet"].apply(clean_tweet).apply(str.split)

# Ordinal encoding of the five sentiment classes, from most negative (0)
# to most positive (4).
label_mapping = {
    label: code
    for code, label in enumerate(
        ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
    )
}
df["Sentiment"] = df["Sentiment"].map(label_mapping)

df


Unnamed: 0,OriginalTweet,Sentiment
0,"[and, and]",2
1,"[advice, talk, to, your, neighbours, family, t...",3
2,"[coronavirus, australia, woolworths, to, give,...",3
3,"[my, food, stock, is, not, the, only, one, whi...",3
4,"[me, ready, to, go, at, supermarket, during, t...",0
...,...,...
41152,"[airline, pilots, offering, to, stock, superma...",2
41153,"[response, to, complaint, not, provided, citin...",0
41154,"[you, know, its, getting, tough, when, is, rat...",3
41155,"[is, it, wrong, that, the, smell, of, hand, sa...",2


In [11]:
def get_vocabulary_size(df, column_name):
    """
    Count the distinct tokens found in a column of tokenized text.

    Parameters:
        df (pd.DataFrame): Frame holding the tokenized text.
        column_name (str): Column whose cells are iterables of tokens
            (e.g. lists of words).

    Returns:
        int: How many different tokens occur anywhere in the column.
    """
    distinct_words = set()
    # Fold every row's tokens into one running set; duplicates collapse automatically.
    for token_list in df[column_name]:
        for word in token_list:
            distinct_words.add(word)
    return len(distinct_words)


In [12]:
# Vocabulary size of the tokenized tweet column at this point in the notebook.
sz = get_vocabulary_size(df, 'OriginalTweet')
sz

36864

In [13]:
from collections import Counter

def count_single_occurrence_tokens(df, column_name):
    """Return how many distinct tokens appear exactly once in the whole column.

    Parameters:
        df (pd.DataFrame): Frame holding the tokenized text.
        column_name (str): Column whose cells are iterables of tokens.

    Returns:
        int: Number of tokens whose total frequency across all rows is 1.
    """
    frequencies = Counter()
    # Tally tokens row by row instead of materialising one flat list first.
    for token_list in df[column_name]:
        for token in token_list:
            frequencies[token] += 1

    singletons = [token for token in frequencies if frequencies[token] == 1]
    return len(singletons)

# Example usage: count the tokens that appear exactly once across the whole
# tokenized tweet column and print the total.
num_single_occurrence = count_single_occurrence_tokens(df, "OriginalTweet")
print("Number of tokens occurring exactly once:", num_single_occurrence)



Number of tokens occurring exactly once: 18926


In [14]:
import random
from collections import Counter

def replace_random_single_occurrence_tokens_with_unk(df, column_name, num_to_replace=500, seed=None):
    """Replace a random subset of single-occurrence tokens with ``'UNK'``.

    Tokens that occur exactly once in the whole column are collected, up to
    ``num_to_replace`` of them are drawn at random, and every drawn token is
    replaced by the literal string ``'UNK'``.

    Parameters:
        df (pd.DataFrame): Frame holding the tokenized text. Its
            ``column_name`` column is overwritten in place.
        column_name (str): Column whose cells are lists of tokens.
        num_to_replace (int): Maximum number of distinct single-occurrence
            tokens to replace. If fewer such tokens exist, all of them are
            replaced instead of raising.
        seed (int | None): When given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible. When
            ``None`` the module-level ``random`` generator is used.

    Returns:
        tuple: ``(df, new_vocab_size)`` — the same frame object that was
        passed in, and the number of distinct tokens after replacement.

    Side effects:
        Prints the number of single-occurrence tokens and the new vocabulary
        size.
    """
    # Frequency of every token across all rows.
    token_counts = Counter(token for tokens in df[column_name] for token in tokens)

    single_occurrence_tokens = [token for token, count in token_counts.items() if count == 1]

    print(f"Total tokens occurring once: {len(single_occurrence_tokens)}")

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at what is actually available.
    sample_size = min(num_to_replace, len(single_occurrence_tokens))
    sampler = random if seed is None else random.Random(seed)

    # A set gives constant-time membership tests; the check below runs once
    # for every token in the column.
    tokens_to_replace = set(sampler.sample(single_occurrence_tokens, sample_size))

    def replace_tokens(tokens):
        return ['UNK' if token in tokens_to_replace else token for token in tokens]

    df[column_name] = df[column_name].apply(replace_tokens)

    # Recompute the vocabulary so the caller can see the effect of the replacement.
    new_vocab = {token for tokens in df[column_name] for token in tokens}
    new_vocab_size = len(new_vocab)

    print(f"Vocabulary size after replacement: {new_vocab_size}")
    return df, new_vocab_size

# Usage: replace up to 500 randomly chosen single-occurrence tokens with 'UNK',
# rebinding `df` to the returned frame and keeping the resulting vocabulary size.
df, new_vocab_size = replace_random_single_occurrence_tokens_with_unk(df, "OriginalTweet", 500)


Total tokens occurring once: 18926
Vocabulary size after replacement: 36365


In [15]:
# Display the frame as it stands after the UNK replacement call.
df

Unnamed: 0,OriginalTweet,Sentiment
0,"[and, and]",2
1,"[advice, talk, to, your, neighbours, family, t...",3
2,"[coronavirus, australia, woolworths, to, give,...",3
3,"[my, food, stock, is, not, the, only, one, whi...",3
4,"[me, ready, to, go, at, supermarket, during, t...",0
...,...,...
41152,"[airline, pilots, offering, to, stock, superma...",2
41153,"[response, to, complaint, not, provided, citin...",0
41154,"[you, know, its, getting, tough, when, is, rat...",3
41155,"[is, it, wrong, that, the, smell, of, hand, sa...",2


In [16]:
import math
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^-v) to every element of ``x``.

    Parameters:
        x: A one-dimensional sequence of real numbers (list, tuple or 1-D
            NumPy array).

    Returns:
        list: One float per input element, each in the range [0, 1].
    """
    ans = []
    for value in x:
        if value >= 0:
            # exp(-value) is at most 1 here, so it cannot overflow.
            ans.append(1 / (1 + math.exp(-value)))
        else:
            # For negative inputs exp(-value) can overflow; the equivalent
            # form e^v / (1 + e^v) only ever exponentiates a negative number.
            exp_value = math.exp(value)
            ans.append(exp_value / (1 + exp_value))
    return ans

In [17]:
def softmax(x):
    """Convert a sequence of scores into probabilities that sum to 1.

    Each output is ``exp(x[i]) / sum_j exp(x[j])``.

    Parameters:
        x: A one-dimensional sequence of real numbers (list, tuple or 1-D
            NumPy array).

    Returns:
        list: One float per input element; an empty list for empty input.
    """
    scores = list(x)
    if not scores:
        return []

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so math.exp cannot overflow.
    peak = max(scores)
    exps = [math.exp(score - peak) for score in scores]
    total = sum(exps)
    return [value / total for value in exps]
    

In [18]:
def forward_prop(x): # here x is a list
    """Run one forward pass through a randomly initialised 3-layer network.

    The layer sizes are ``len(x) -> 3 -> 2 -> 5``: two sigmoid hidden layers
    followed by a 5-way softmax output.

    Parameters:
        x: A one-dimensional sequence of numbers (the input features).

    Returns:
        list: Five softmax outputs, as returned by ``softmax``.

    Raises:
        ValueError: If ``x`` is not one-dimensional.

    NOTE(review): weights and biases are drawn afresh on every call, so the
    output is not reproducible unless NumPy's global seed is fixed, and the
    large integer magnitudes will likely saturate the sigmoids — confirm this
    is intended as a placeholder before training is added.
    """
    features = np.asarray(x, dtype=float)
    if features.ndim != 1:
        raise ValueError("forward_prop expects a one-dimensional input vector")
    sz = len(features)

    # Integer weights in [0, 100]. NumPy's upper bound is exclusive, hence 101.
    mat1 = np.random.randint(0, 101, size=(sz, 3))
    mat2 = np.random.randint(0, 101, size=(3, 2))
    mat3 = np.random.randint(0, 101, size=(2, 5))

    # Integer biases in [1, 99], one per unit of the layer they feed.
    b1 = np.random.randint(1, 100, size=3)
    b2 = np.random.randint(1, 100, size=2)
    b3 = np.random.randint(1, 100, size=5)

    # Each layer computes activation(W^T · input + b).
    a1 = sigmoid(np.dot(mat1.T, features) + b1)
    a2 = sigmoid(np.dot(mat2.T, a1) + b2)
    a3 = softmax(np.dot(mat3.T, a2) + b3)
    return a3



