<a href="https://colab.research.google.com/github/shrinithisivaraman/shrinithi-portfolio/blob/main/Movie_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [None]:
# Step 3: Create a Simple Dataset
# Each entry pairs a short review with its sentiment label, so a text and its
# label always sit side by side.
labelled_reviews = [
    ("I love this movie", 'positive'),
    ("Horrible acting", 'negative'),
    ("What a great film", 'positive'),
    ("Worst movie ever", 'negative'),
    ("Really enjoyed it", 'positive'),
    ("It was terrible", 'negative'),
    ("Fantastic performance", 'positive'),
    ("Not good at all", 'negative'),
]

# Unzip the pairs into the two parallel columns of the table.
data = {
    'review': [text for text, _ in labelled_reviews],
    'label': [sentiment for _, sentiment in labelled_reviews],
}

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Last expression of the cell: renders the whole (8-row) table as its output
df


Unnamed: 0,review,label
0,I love this movie,positive
1,Horrible acting,negative
2,What a great film,positive
3,Worst movie ever,negative
4,Really enjoyed it,positive
5,It was terrible,negative
6,Fantastic performance,positive
7,Not good at all,negative


In [None]:
# Step 4: Convert Text to Numeric Format

# Labels (positive / negative) come straight from the DataFrame column.
y = df['label']

# Bag-of-words features: fit_transform() learns the vocabulary from the
# reviews and, in the same call, counts how often each vocabulary word occurs
# in each review.
# NOTE(review): the vocabulary is learned from every review, including the
# ones a later cell holds out for testing.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])

# Optional: Check what the data looks like
print("Shape of X (rows = reviews, columns = unique words):", X.shape)
print("\nVocabulary (word to column index):")
print(vectorizer.vocabulary_)
# X is sparse; toarray() densifies it so the first review's counts are readable
print("\nFirst row as numbers:", X.toarray()[0])
print("\nLabels:", y.tolist())


Shape of X (rows = reviews, columns = unique words): (8, 21)

Vocabulary (word to column index):
{'love': 11, 'this': 17, 'movie': 12, 'horrible': 9, 'acting': 0, 'what': 19, 'great': 8, 'film': 6, 'worst': 20, 'ever': 4, 'really': 15, 'enjoyed': 3, 'it': 10, 'was': 18, 'terrible': 16, 'fantastic': 5, 'performance': 14, 'not': 13, 'good': 7, 'at': 2, 'all': 1}

First row as numbers: [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0]

Labels: ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative']


In [None]:
# Step 5: Split the Data for Training and Testing
# (train_test_split is already imported in the first cell.)

# With so few reviews, a purely random draw can put a single class in the test
# set and leave the training set unbalanced. stratify=y keeps the
# positive/negative ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 20% of data will be used for testing
    random_state=42,    # ensures the split is the same every time
    stratify=y          # same label proportions in train and test
)

# Sanity checks: no review is lost, and every label appears in both splits
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]
assert set(y_train) == set(y_test) == set(y), "each split should contain every label"

# Optional: Check the split
print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
print("Training labels:", list(y_train))
print("Test labels:", list(y_test))


Training set size: 6
Test set size: 2
Training labels: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive']
Test labels: ['negative', 'negative']


In [None]:
# Step 6: Train the Model
# (MultinomialNB is already imported in the first cell.)

# 1. Create the model: a multinomial Naive Bayes classifier with default settings
model = MultinomialNB()

# 2. Train the model using the training data
#    Kept as the cell's last expression, so whatever fit() returns is shown
#    as the cell output.
model.fit(X_train, y_train)


In [None]:
# Step 7: Evaluate the Model
# Predict a label for every held-out review, then report the fraction of
# predictions that match the true labels.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))


Accuracy: 0.0


In [None]:
# Step 8: Test your own review
your_review = ["I like it"]  # <-- put your review here inside a list
your_review_vector = vectorizer.transform(your_review)

# transform() silently drops every word that is not in the vectorizer's
# vocabulary, so list those words to show what the prediction could NOT use.
tokenize = vectorizer.build_analyzer()  # same preprocessing + tokenizing as the vectorizer
for text in your_review:
    ignored_words = [word for word in tokenize(text) if word not in vectorizer.vocabulary_]
    if ignored_words:
        print("Ignored (not in vocabulary):", ignored_words, "in", repr(text))

prediction = model.predict(your_review_vector)
print("Prediction:", prediction)


Prediction: ['positive']


In [None]:
# Step 9: Test another review
your_review = ["the movie was bad"]  # <-- put your review here inside a list
your_review_vector = vectorizer.transform(your_review)

# transform() silently drops every word that is not in the vectorizer's
# vocabulary, so list those words to show what the prediction could NOT use.
tokenize = vectorizer.build_analyzer()  # same preprocessing + tokenizing as the vectorizer
for text in your_review:
    ignored_words = [word for word in tokenize(text) if word not in vectorizer.vocabulary_]
    if ignored_words:
        print("Ignored (not in vocabulary):", ignored_words, "in", repr(text))

prediction = model.predict(your_review_vector)
print("Prediction:", prediction)


Prediction: ['positive']
