# Text Preprocessing

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK data used by the stopword list and the WordNet lemmatizer.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [3]:
# English stopwords held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

In [4]:
# Load tweet_emotions.csv into a DataFrame.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the notebook anywhere else.
df = pd.read_csv(r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv')

In [5]:
def preprocess_text(text):
    """Normalize one raw text into a space-joined string of lemmatized words.

    The text is lowercased, URLs are stripped, every character other than
    ``a-z`` and whitespace is dropped, words found in ``stop_words`` are
    removed and the remaining words are lemmatized.

    Args:
        text: Raw text. Non-string values (for example the NaN that pandas
            uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.
    """
    # A missing cell arrives as a float NaN, which has no .lower(); treat any
    # non-string value as empty instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # URLs must be removed before the letters-only filter, which would
    # otherwise strip their punctuation and leave the letters glued together.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    # Whitespace tokenization, stopword filtering and lemmatization in one pass.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [6]:
# Clean every row of the content column; each result is one space-joined string.
df['cleaned_content'] = df['content'].apply(preprocess_text)

In [None]:
# Preview the first rows, now including the cleaned_content column.
print(df.head())

# Bag of Words (BoW)

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Apply Bag of Words (BoW)


In [9]:
# Learn the vocabulary from the cleaned text and build the sparse
# document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_content'])

# Step 2: Train-Test Split


In [10]:
# Split the cleaned text first and fit the vectorizer on the training portion
# only, so the vocabulary is not learned from the held-out rows (avoids
# train/test leakage). The test text is then encoded with that same vocabulary.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_content'], df['sentiment'], test_size=0.2, random_state=42
)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


# Step 3: Train a Model


In [None]:
# Fit a multinomial Naive Bayes classifier on the training features and labels.
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 4: Make Predictions

In [12]:
# Predict a sentiment label for every held-out row.
y_pred = model.predict(X_test)


# Step 5: Evaluate the Model


In [None]:
# Overall accuracy plus the per-class report from classification_report,
# both computed on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [24]:
# `cleaned_content` holds the preprocessed text (one string per row) and
# `sentiment` is the target label column.
X = df['cleaned_content']
y = df['sentiment']

# Build TF-IDF features; max_features=5000 is the vocabulary-size cap passed
# to TfidfVectorizer (adjust as needed).
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)


In [25]:
# Split the raw text first, then refit TF-IDF on the training portion only so
# that the vocabulary and IDF weights are not influenced by the held-out rows
# (avoids train/test leakage). The test text is encoded with the fitted model.
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)


In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# NOTE(review): this section imports LogisticRegression but trains
# MultinomialNB (imported in the BoW section) — confirm which was intended.
model = MultinomialNB()
model.fit(X_train, y_train)


In [27]:
# Predict a sentiment label for every held-out row.
y_pred = model.predict(X_test)


In [None]:
# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class metrics from classification_report on the same predictions.
print("Classification Report:")
print(classification_report(y_test, y_pred))


# Word2Vec

# CBOW

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec


# Step 1: Load Dataset


In [None]:
# NOTE(review): absolute, machine-specific path — change it before running
# on another machine.
file_path = r'C:\Users\musta\OneDrive\Desktop\tweet_emotions.csv\tweet_emotions.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Show the first rows to confirm the file loaded as expected.
print("Dataset preview:")
print(df.head())


# Step 2: Preprocessing

In [None]:
# Download the NLTK data used by the stopword list, the WordNet lemmatizer
# and word_tokenize.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)


In [14]:
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
# English stopwords held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [15]:
def preprocess_text(text):
    """Turn one raw text into a list of lemmatized tokens without stopwords.

    The text is lowercased, every character other than ASCII letters and
    whitespace is dropped, the result is split with ``word_tokenize``, and
    each token not found in ``stop_words`` is lemmatized.

    Args:
        text: Raw text. Non-string values (for example the NaN that pandas
            uses for a missing cell) are treated as empty text.

    Returns:
        A list of token strings; ``[]`` when no token survives.
    """
    # A missing cell arrives as a float NaN, which has no .lower(); return an
    # empty token list instead of raising AttributeError.
    if not isinstance(text, str):
        return []
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]

# Apply preprocessing to the content column

In [16]:
# Tokenize every row of the content column; each cell of cleaned_content
# becomes a list of tokens.
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Prepare sentences for Word2Vec

In [17]:
# Word2Vec is given a plain list of token lists, one inner list per row.
sentences = df['cleaned_content'].tolist()

# Display a sample of tokenized sentences

In [None]:
# Show the first five token lists as a sanity check on the preprocessing.
print("\nSample tokenized sentences:")
print(sentences[:5])

# Step 3: Train Word2Vec - CBOW

In [None]:
# Train the CBOW variant (sg=0): 300-dimensional vectors, context window of 5,
# min_count=2 as the word-frequency cutoff, 10 passes over the corpus.
print("\nTraining CBOW Word2Vec model...")
cbow_model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=2, sg=0, epochs=10)


# Save CBOW model


In [None]:
# Persist the trained CBOW model; the path is relative to the working directory.
cbow_model.save('word2vec_cbow.model')
print("CBOW model saved as 'word2vec_cbow.model'.")


# Step 4: Train Word2Vec - Skip-Gram

In [None]:
# Train the Skip-Gram variant (sg=1) with the same hyperparameters as the
# CBOW model so the two can be compared directly.
print("\nTraining Skip-Gram Word2Vec model...")
skipgram_model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=2, sg=1, epochs=10)



# Save Skip-Gram model


In [None]:
# Persist the trained Skip-Gram model; the path is relative to the working
# directory.
skipgram_model.save('word2vec_skipgram.model')
print("Skip-Gram model saved as 'word2vec_skipgram.model'.")


# Step 5: Inspect Trained Models


# Load saved models


In [23]:
# Reload both models from the files written by the save cells.
loaded_cbow = Word2Vec.load('word2vec_cbow.model')
loaded_skipgram = Word2Vec.load('word2vec_skipgram.model')


# Check similar words


In [None]:
# Ten nearest neighbours of 'sleep' in the CBOW embedding space.
# NOTE(review): presumably raises KeyError if 'sleep' is not in the trained
# vocabulary — confirm against the gensim KeyedVectors documentation.
print("\nWords similar to 'sleep' using CBOW:")
print(loaded_cbow.wv.most_similar('sleep', topn=10))


In [None]:
# Same query against the Skip-Gram model, for comparison with CBOW.
print("\nWords similar to 'sleep' using Skip-Gram:")
print(loaded_skipgram.wv.most_similar('sleep', topn=10))


# Inspect word vector


In [None]:
# Print the raw embedding vector the CBOW model learned for 'sleep'.
print("\nVector for 'sleep' using CBOW:")
print(loaded_cbow.wv['sleep'])



In [None]:
# Print the raw embedding vector the Skip-Gram model learned for 'sleep'.
print("\nVector for 'sleep' using Skip-Gram:")
print(loaded_skipgram.wv['sleep'])
