In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

In [2]:
data = pd.read_csv("train_nykaa_review_sentiment.csv")

In [3]:
reviews = data['content'][:1500]

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
#Function for text preprocessing
def preprocess_text(text):

    text = re.sub(r'\d+','', text) # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = text.lower() # Convert text to lowercase
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')]) # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization
    return text

#Apply preprocessing to each review
preprocessed_reviews = [preprocess_text(review) for review in reviews]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

#Fit-transform the preprocessed reviews
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_reviews)

# Convert TF-IDF matrix to DataFrame for visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray())


print(tfidf_df)

      0     1     2     3     4     5     6     7     8     9     ...  1638  \
0      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
1495   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1496   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1497   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1498   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1499   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

          1639  1640  1641  1642  1643  1644  1645 

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Encode sentiment labels
sentiment_mapping = {'POSITIVE': 1, 'NEGATIVE': 0, 'NEUTRAL': 2}
data['sentiment_encoded'] = data['sentiment_labels'].map(sentiment_mapping)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, data['sentiment_encoded'][:1500], test_size=0.15, random_state=42)

# Initialize and train logistic regression model
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

# Predict sentiment labels for test data
y_pred = logistic_regression.predict(X_test)

In [10]:
#Ensure that the data splitting process is correct
print(len(X_train))
print(len(X_test))

print(y_test)
print(y_pred)

1275
225
1116    1
1368    2
422     2
413     1
451     0
       ..
1231    1
917     1
743     1
570     1
218     2
Name: sentiment_encoded, Length: 225, dtype: int64
[1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


In [11]:
#Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7511111111111111
