In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...


True

# Initialize stopwords and lemmatizer

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the dataset

In [8]:
df = pd.read_csv(r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv')   

# Function to preprocess text

In [9]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, URLs, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin the words into a single string
    return ' '.join(words)

# Apply preprocessing to the 'content' column

In [10]:
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Display the first few rows

In [11]:
print(df.head())

     tweet_id   sentiment                                            content  \
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...   
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...   
2  1956967696     sadness                Funeral ceremony...gloomy friday...   
3  1956967789  enthusiasm               wants to hang out with friends SOON!   
4  1956968416     neutral  @dannycastillo We want to trade with someone w...   

                                     cleaned_content  
0  tiffanylue know listenin bad habit earlier sta...  
1             layin n bed headache ughhhhwaitin call  
2                      funeral ceremonygloomy friday  
3                              want hang friend soon  
4  dannycastillo want trade someone houston ticke...  


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer

In [14]:
vectorizer = CountVectorizer()

# Apply BoW to the cleaned_content column

In [15]:
X_bow = vectorizer.fit_transform(df['cleaned_content'])

# Display the vocabulary size

In [16]:
print(f"Vocabulary Size: {len(vectorizer.vocabulary_)}")


Vocabulary Size: 47437


# Convert sparse matrix to dense DataFrame (optional, for inspection)


In [17]:
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())


# Display the first few rows of the BoW representation


In [18]:
print(bow_df.head())

   aa  aaa  aaaa  aaaaa  aaaaaa  aaaaaaaa  aaaaaaaaaaa  aaaaaaaaaahhhhhhhh  \
0   0    0     0      0       0         0            0                   0   
1   0    0     0      0       0         0            0                   0   
2   0    0     0      0       0         0            0                   0   
3   0    0     0      0       0         0            0                   0   
4   0    0     0      0       0         0            0                   0   

   aaaaaaaaaamazing  aaaaaaaafternoon  ...  zyrtec  zzerbe  zzwhitejd  zzybug  \
0                 0                 0  ...       0       0          0       0   
1                 0                 0  ...       0       0          0       0   
2                 0                 0  ...       0       0          0       0   
3                 0                 0  ...       0       0          0       0   
4                 0                 0  ...       0       0          0       0   

   zzz  zzzz  zzzzy  zzzzz  zzzzzzzgoodnight