In [6]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# NLP resources shared by the preprocessing step: a WordNet lemmatizer and
# the English stopword list (held as a set for fast membership tests)
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

# Read the cleaned reviews from 'cleaned_reviews.csv'
df = pd.read_csv('cleaned_reviews.csv')

# Function to preprocess the text
def preprocess_text(text, stop_words, lemmatizer):
    """Lowercase, tokenize, remove stopwords from and lemmatize ``text``.

    Args:
        text: The text to process. Any non-string value (for example None,
            or the NaN that pandas yields for an empty CSV cell) is treated
            as missing.
        stop_words: Container of tokens to drop; tokens are compared after
            lowercasing.
        lemmatizer: Object exposing a ``lemmatize(word)`` method.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or an
        empty string when ``text`` is missing.
    """
    # Guard against missing values: calling .lower() on a float NaN would
    # raise AttributeError and abort the whole column-wise apply.
    if not isinstance(text, str):
        return ''

    tokens = nltk.word_tokenize(text.lower())
    # Filter and lemmatize in a single pass over the tokens.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Build 'ProcessedText' by running preprocess_text over every 'CleanedText'
# entry; the stopword set and lemmatizer are forwarded as extra positional args
df['ProcessedText'] = df['CleanedText'].apply(
    preprocess_text, args=(stop_words, lemmatizer)
)

# Map a 'Liked' score to a sentiment label (1 = positive, anything else = negative)
def categorize_sentiment(score):
    """Return 'positive' when ``score`` equals 1, otherwise 'negative'."""
    return 'positive' if score == 1 else 'negative'

# Derive the textual 'Sentiment' label from each 'Liked' value
df['Sentiment'] = df['Liked'].map(categorize_sentiment)

# Turn the sentiment labels into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Sentiment'])

# Convert the processed text into TF-IDF features (vocabulary capped at 5000 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['ProcessedText'])

# Write the DataFrame, now including the 'ProcessedText' and 'Sentiment'
# columns, back to 'cleaned_reviews.csv' (overwrites the input file)
df.to_csv('cleaned_reviews.csv', index=False)

print(df.head())


                                              Review  Liked  \
0                           Wow... Loved this place.      1   
1                                 Crust is not good.      0   
2          Not tasty and the texture was just nasty.      0   
3  Stopped by during the late May bank holiday of...      1   
4  The selection on the menu was great and so wer...      1   

                                         CleanedText  \
0                               Wow Loved this place   
1                                  Crust is not good   
2           Not tasty and the texture was just nasty   
3  Stopped by during the late May bank holiday of...   
4  The selection on the menu was great and so wer...   

                                       ProcessedText Sentiment  
0                                    wow loved place  positive  
1                                         crust good  negative  
2                                tasty texture nasty  negative  
3  stopped late may bank