In [42]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources

In [43]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Initialize stopwords and lemmatizer

In [44]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the dataset

In [45]:
df = pd.read_csv(r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv')   

# Function to preprocess text

In [46]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, URLs, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin the words into a single string
    return ' '.join(words)

# Apply preprocessing to the 'content' column

In [47]:
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Display the first few rows

In [48]:
print(df.head())

     tweet_id   sentiment                                            content  \
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...   
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...   
2  1956967696     sadness                Funeral ceremony...gloomy friday...   
3  1956967789  enthusiasm               wants to hang out with friends SOON!   
4  1956968416     neutral  @dannycastillo We want to trade with someone w...   

                                     cleaned_content  
0  tiffanylue know listenin bad habit earlier sta...  
1             layin n bed headache ughhhhwaitin call  
2                      funeral ceremonygloomy friday  
3                              want hang friend soon  
4  dannycastillo want trade someone houston ticke...  


In [49]:
from sklearn.feature_extraction.text import CountVectorizer


# Initialize CountVectorizer with n-gram range (e.g., bigrams and trigrams)


In [50]:
ngram_vectorizer = CountVectorizer(ngram_range=(2, 3), max_features=10000)

# Apply n-grams to the cleaned_content column


In [51]:
X_ngrams = ngram_vectorizer.fit_transform(df['cleaned_content'])


# Display the number of n-grams


In [52]:
print(f"Number of N-grams: {len(ngram_vectorizer.vocabulary_)}")


Number of N-grams: 10000


# Convert sparse matrix to dense DataFrame (optional, for inspection)


In [53]:
ngrams_df = pd.DataFrame(X_ngrams.toarray(), columns=ngram_vectorizer.get_feature_names_out())


# Display the first few rows of the n-gram representation


In [56]:
print(ngrams_df.head())


   able buy  able get  able go  able make  able see  able talk  able watch  \
0         0         0        0          0         0          0           0   
1         0         0        0          0         0          0           0   
2         0         0        0          0         0          0           0   
3         0         0        0          0         0          0           0   
4         0         0        0          0         0          0           0   

   absolutely amazing  absolutely love  absolutely nothing  ...  \
0                   0                0                   0  ...   
1                   0                0                   0  ...   
2                   0                0                   0  ...   
3                   0                0                   0  ...   
4                   0                0                   0  ...   

   youtube working  youve got  youve made  youve seen  yr ago  yr old  \
0                0          0           0           0  