In [5]:
import nltk
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

!pip install --upgrade pip
!pip install datasets


from datasets import load_dataset

# Load IMDb dataset
dataset = load_dataset("imdb")

# Convert to DataFrame
import pandas as pd

df = pd.DataFrame(dataset['train'])
df['sentiment'] = df['label']  # Rename column for clarity
print(df.head())  # Show sample data



                                                text  label  sentiment
0  I rented I AM CURIOUS-YELLOW from my video sto...      0          0
1  "I Am Curious: Yellow" is a risible and preten...      0          0
2  If only to avoid making this type of film in t...      0          0
3  This film was probably inspired by Godard's Ma...      0          0
4  Oh, brother...after hearing about this ridicul...      0          0


In [6]:
from datasets import load_dataset

# Load IMDb dataset
dataset = load_dataset("imdb")

# Check the dataset structure
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [7]:
import pandas as pd

# Convert dataset to Pandas DataFrame
train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])

# Check the first few rows
print(train_data.head())
print(test_data.head())


                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...      0
4  First off let me say, If you haven't enjoyed a...      0


In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK datasets
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samridhigarg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samridhigarg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Load stopwords once (optimization)
stop_words = set(stopwords.words('english'))

# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply function in a more optimized way
train_data['clean_text'] = train_data['text'].astype(str).apply(preprocess_text)
test_data['clean_text'] = test_data['text'].astype(str).apply(preprocess_text)


In [10]:
print(type(train_data))
print(type(test_data))
print(train_data.head())
print(test_data.head())

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
                                                text  label  \
0  I rented I AM CURIOUS-YELLOW from my video sto...      0   
1  "I Am Curious: Yellow" is a risible and preten...      0   
2  If only to avoid making this type of film in t...      0   
3  This film was probably inspired by Godard's Ma...      0   
4  Oh, brother...after hearing about this ridicul...      0   

                                          clean_text  
0  rented curious yellow video store controversy ...  
1  curious yellow risible pretentious steaming pi...  
2  avoid making type film future film interesting...  
3  film probably inspired godard masculin féminin...  
4  oh brother hearing ridiculous film umpteen yea...  
                                                text  label  \
0  I love sci-fi and am willing to put up with a ...      0   
1  Worth the entertainment value of a rental, esp...      0   
2  its a totally average f

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Adjust based on data
X_train = vectorizer.fit_transform(train_data['clean_text'])
X_test = vectorizer.transform(test_data['clean_text'])


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, train_data['label'], test_size=0.2, random_state=42)


In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [14]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Accuracy: 0.8846
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      2515
           1       0.87      0.90      0.89      2485

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [15]:
test_preds = model.predict(X_test)
test_data['predicted_label'] = test_preds


In [16]:
def predict_sentiment(text):
    cleaned_text = preprocess_text(text)  # Preprocess the text
    vectorized_text = vectorizer.transform([cleaned_text])  # Convert to TF-IDF
    prediction = model.predict(vectorized_text)[0]  # Predict sentiment
    return "Positive" if prediction == 1 else "Negative"


In [17]:
print(predict_sentiment("I love this movie, it's fantastic!"))  # Positive
print(predict_sentiment("This was a terrible experience."))  # Negative

Positive
Negative


In [18]:
while True:
    user_text = input("Enter a review (or type 'exit' to stop): ")
    if user_text.lower() == 'exit':
        print("Exiting sentiment analysis...")
        break
    print("Predicted Sentiment:", predict_sentiment(user_text))


Enter a review (or type 'exit' to stop):  happy day


Predicted Sentiment: Positive


Enter a review (or type 'exit' to stop):  exit


Exiting sentiment analysis...


In [40]:
import os
print(os.getcwd())  # This prints the current working directory


/Users/samridhigarg


In [51]:
os.chdir('/Users/samridhigarg/Desktop/projects')  # Set working directory
print(os.getcwd())  # Confirm the change


/Users/samridhigarg/Desktop/projects
