# Text Preprocessing

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
df = pd.read_csv(r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv')   

In [5]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, URLs, and numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Rejoin the words into a single string
    return ' '.join(words)

In [6]:
df['cleaned_content'] = df['content'].apply(preprocess_text)

In [None]:
print(df.head())

# Bow

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Apply Bag of Words (BoW)


In [9]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_content'])

# Step 2: Train-Test Split


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)


# Step 3: Train a Model


In [None]:
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 4: Make Predictions

In [12]:
y_pred = model.predict(X_test)


# Step 5: Evaluate the Model


In [None]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# TF - IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [24]:
# Assuming `cleaned_content` is your preprocessed text and `sentiment` is the target column
X = df['cleaned_content']
y = df['sentiment']

# Apply TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=0)


In [None]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [27]:
y_pred = model.predict(X_test)


In [None]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


# Word2vec

# CBOW

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec


# Step 1: Load Dataset


In [12]:
file_path = r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Display dataset
print("Dataset preview:")
print(df.head())


Dataset preview:
     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...


# Step 2: Preprocessing

In [13]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\musta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [15]:
def preprocess_text(text):
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords and apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to the content column

In [16]:
# Apply preprocessing to the content column
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Prepare sentences for Word2Vec

In [17]:
sentences = df['cleaned_content'].tolist()

# Display a sample of tokenized sentences

In [18]:
print("\nSample tokenized sentences:")
print(sentences[:5])


Sample tokenized sentences:
[['tiffanylue', 'know', 'listenin', 'bad', 'habit', 'earlier', 'started', 'freakin', 'part'], ['layin', 'n', 'bed', 'headache', 'ughhhhwaitin', 'call'], ['funeral', 'ceremonygloomy', 'friday'], ['want', 'hang', 'friend', 'soon'], ['dannycastillo', 'want', 'trade', 'someone', 'houston', 'ticket', 'one']]


# Step 3: Train Word2Vec - CBOW

In [19]:
print("\nTraining CBOW Word2Vec model...")
cbow_model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=2, sg=0, epochs=10)



Training CBOW Word2Vec model...


# Save CBOW model


In [20]:
cbow_model.save('word2vec_cbow.model')
print("CBOW model saved as 'word2vec_cbow.model'.")


CBOW model saved as 'word2vec_cbow.model'.


# Step 4: Train Word2Vec - Skip-Gram

In [21]:
print("\nTraining Skip-Gram Word2Vec model...")
skipgram_model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=2, sg=1, epochs=10)




Training Skip-Gram Word2Vec model...


# Save Skip-Gram model


In [22]:
skipgram_model.save('word2vec_skipgram.model')
print("Skip-Gram model saved as 'word2vec_skipgram.model'.")


Skip-Gram model saved as 'word2vec_skipgram.model'.


# Step 5: Inspect Trained Models


# Load saved models


In [23]:
loaded_cbow = Word2Vec.load('word2vec_cbow.model')
loaded_skipgram = Word2Vec.load('word2vec_skipgram.model')


# Check similar words


In [26]:
print("\nWords similar to 'sleep' using CBOW:")
print(loaded_cbow.wv.most_similar('sleep', topn=10))



Words similar to 'happy' using CBOW:
[('bed', 0.8746517896652222), ('work', 0.8586323857307434), ('asleep', 0.8485585451126099), ('wake', 0.8461429476737976), ('takn', 0.8290516138076782), ('stay', 0.8240368962287903), ('pm', 0.819888710975647), ('bex', 0.8160650134086609), ('alone', 0.8144768476486206), ('tmrw', 0.8131993412971497)]


In [27]:
print("\nWords similar to 'sleep' using Skip-Gram:")
print(loaded_skipgram.wv.most_similar('sleep', topn=10))



Words similar to 'happy' using Skip-Gram:
[('wide', 0.7662756443023682), ('woke', 0.7578635215759277), ('alarm', 0.7574796080589294), ('migraine', 0.7349479794502258), ('morn', 0.7315568327903748), ('awake', 0.7260857820510864), ('bed', 0.7243674993515015), ('ache', 0.7242431044578552), ('sleeping', 0.7222613096237183), ('nighty', 0.7218272089958191)]


# Inspect word vector


In [29]:
print("\nVector for 'sleep' using CBOW:")
print(loaded_cbow.wv['sleep'])




Vector for 'sleep' using CBOW:
[ 1.20704107e-01  7.64020443e-01 -6.73619688e-01 -4.14599568e-01
  2.29132157e-02 -9.09570932e-01  6.18406773e-01  1.16969275e+00
 -6.28199458e-01  3.27487588e-01  5.96928596e-02 -2.62963295e-01
 -7.19414711e-01  2.93196261e-01  3.86416644e-01 -5.98978400e-01
  5.41918874e-01 -6.65719688e-01 -2.04737946e-01 -6.32874250e-01
  5.37055194e-01  9.27321017e-01  2.42641672e-01 -1.10293292e-02
  5.21759212e-01 -4.43845958e-01 -2.17715241e-02  7.10286021e-01
 -7.15177476e-01 -8.32669258e-01 -2.45985270e-01  2.76036173e-01
 -8.55580196e-02  5.21364927e-01 -6.93235457e-01  1.58437997e-01
  5.47635257e-01  8.65095407e-02 -1.83078691e-01  4.29074131e-02
 -4.02789235e-01 -1.02163649e+00 -8.46612155e-02 -1.33918971e-01
  4.52159137e-01  3.41907650e-01 -1.03643787e+00 -1.64258465e-01
  7.41261423e-01  3.73490788e-02 -1.97735086e-01 -2.69606560e-01
 -1.39355823e-01  3.33170772e-01  2.66192015e-02  1.43694758e-01
  5.34660071e-02 -8.36372852e-01  6.47294700e-01 -1.340699

In [30]:
print("\nVector for 'sleep' using Skip-Gram:")
print(loaded_skipgram.wv['sleep'])



Vector for 'sleep' using Skip-Gram:
[ 1.07165389e-01  1.25611112e-01 -6.80771843e-02 -1.87418744e-01
  1.10103078e-01 -2.57751346e-01 -1.62744597e-02  3.67997169e-01
 -9.56075564e-02  1.65430471e-01 -2.22483762e-02 -1.86914802e-02
 -3.31699520e-01 -1.36216208e-02  1.95318148e-01 -1.77527875e-01
 -1.96722552e-01 -4.05768782e-01  2.69870888e-02 -3.84183645e-01
  1.58398032e-01  3.74577016e-01 -7.81415254e-02  5.91156892e-02
  3.02761674e-01 -2.55196154e-01  9.82362330e-02  1.19395040e-01
 -2.59588480e-01 -2.26849511e-01 -2.70252734e-01  2.47389480e-01
 -2.07168199e-02  8.60508308e-02 -9.89407003e-02  1.36240646e-01
  2.64751792e-01 -1.18487366e-02 -1.73611548e-02 -2.72767413e-02
 -8.75813887e-02 -4.02028412e-01  5.53671196e-02  9.66536850e-02
  2.62171060e-01  1.78386956e-01 -5.96064664e-02  2.54169762e-01
  4.10596699e-01 -1.81573015e-02 -1.36357009e-01 -1.08233258e-01
  9.16432738e-02  1.46060139e-01 -2.13898122e-01  2.14502588e-01
 -1.38023540e-01 -6.82974979e-02  1.60141096e-01 -1.5