# Text Preprocessing

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
df = pd.read_csv(r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv')   

In [5]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, URLs, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin the words into a single string
    return ' '.join(words)

In [6]:
df['cleaned_content'] = df['content'].apply(preprocess_text)

In [7]:
print(df.head())

     tweet_id   sentiment                                            content  \
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...   
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...   
2  1956967696     sadness                Funeral ceremony...gloomy friday...   
3  1956967789  enthusiasm               wants to hang out with friends SOON!   
4  1956968416     neutral  @dannycastillo We want to trade with someone w...   

                                     cleaned_content  
0  tiffanylue know listenin bad habit earlier sta...  
1             layin n bed headache ughhhhwaitin call  
2                      funeral ceremonygloomy friday  
3                              want hang friend soon  
4  dannycastillo want trade someone houston ticke...  


# Bow

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Apply Bag of Words (BoW)


In [9]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_content'])

# Step 2: Train-Test Split


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)


# Step 3: Train a Model


In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 4: Make Predictions

In [12]:
y_pred = model.predict(X_test)


# Step 5: Evaluate the Model


In [13]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.318

Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.32      0.33      0.33      1028
        hate       0.45      0.02      0.04       268
        love       0.49      0.32      0.39       762
     neutral       0.34      0.37      0.36      1740
      relief       0.00      0.00      0.00       352
     sadness       0.32      0.14      0.19      1046
    surprise       0.19      0.01      0.01       425
       worry       0.28      0.70      0.40      1666

    accuracy                           0.32      8000
   macro avg       0.18      0.14      0.13      8000
weighted avg       0.29      0.32      0.27      8000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# TF - IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [15]:
# Assuming `cleaned_content` is your preprocessed text and `sentiment` is the target column
X = df['cleaned_content']
y = df['sentiment']

# Apply TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [17]:
model = MultinomialNB
model.fit(X_train, y_train)


In [18]:
y_pred = model.predict(X_test)


In [19]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.34725
Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.33      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.12      0.01      0.03       338
   happiness       0.34      0.35      0.34      1028
        hate       0.51      0.16      0.24       268
        love       0.51      0.38      0.44       762
     neutral       0.34      0.57      0.42      1740
      relief       0.35      0.02      0.04       352
     sadness       0.34      0.25      0.29      1046
    surprise       0.31      0.05      0.09       425
       worry       0.33      0.48      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.27      0.18      0.18      8000
weighted avg       0.34      0.35      0.31      8000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
