<a href="https://colab.research.google.com/github/singhvertika119/Sentiment-Analysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle tensorflow scikit-learn pandas matplotlib



In [None]:
from google.colab import files
files.upload()

Saving training.1600000.processed.noemoticon.csv.zip to training.1600000.processed.noemoticon.csv.zip


In [None]:
import pandas as pd

# Load dataset (6 columns exist in original file)
df = pd.read_csv("training.1600000.processed.noemoticon.csv.zip",
                 encoding="latin-1",
                 header=None)

# Keep only sentiment (col 0) and tweet text (col 5)
df = df[[0, 5]]
df.columns = ["sentiment", "tweet"]

# Map labels: 0=negative, 2=neutral, 4=positive → 0,1,2
df["sentiment"] = df["sentiment"].map({0: 0, 2: 1, 4: 2})

df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
import re

def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+", "", tweet)   # remove URLs
    tweet = re.sub(r"@\w+", "", tweet)      # remove mentions
    tweet = re.sub(r"#\w+", "", tweet)      # remove hashtags
    tweet = re.sub(r"[^\w\s]", "", tweet)   # remove punctuation
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

df["clean_tweet"] = df["tweet"].apply(clean_tweet)
df.head()

Unnamed: 0,sentiment,tweet,clean_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats a bummer you shoulda got david carr...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 50000   # vocab size
max_len = 50        # max tweet length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_tweet"])

X = tokenizer.texts_to_sequences(df["clean_tweet"])
X = pad_sequences(X, maxlen=max_len)

y = df["sentiment"].values

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = tf.keras.utils.to_categorical(y_train, 3)
y_test = tf.keras.utils.to_categorical(y_test, 3)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(3, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()



In [None]:
history = model.fit(X_train, y_train,
                    validation_split=0.1,
                    epochs=3,
                    batch_size=1024)

Epoch 1/3
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 108ms/step - accuracy: 0.7320 - loss: 0.5286 - val_accuracy: 0.8138 - val_loss: 0.4072
Epoch 2/3
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 107ms/step - accuracy: 0.8215 - loss: 0.3983 - val_accuracy: 0.8206 - val_loss: 0.3950
Epoch 3/3
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 107ms/step - accuracy: 0.8363 - loss: 0.3691 - val_accuracy: 0.8221 - val_loss: 0.3953


In [9]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 31ms/step - accuracy: 0.8208 - loss: 0.3947
Test Accuracy: 0.82


In [10]:
import numpy as np

def predict_sentiment(tweet):
    seq = tokenizer.texts_to_sequences([clean_tweet(tweet)])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    labels = ["Negative", "Neutral", "Positive"]
    return labels[np.argmax(pred)], pred

print(predict_sentiment("I love this new phone, it's amazing!"))
print(predict_sentiment("This is the worst day ever."))
print(predict_sentiment("The weather is fine, nothing special."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416ms/step
('Positive', array([[5.0440906e-03, 1.5016947e-16, 9.9495596e-01]], dtype=float32))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
('Negative', array([[9.8825091e-01, 7.2266815e-20, 1.1749051e-02]], dtype=float32))
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
('Positive', array([[4.1470784e-01, 2.2449085e-05, 5.8526969e-01]], dtype=float32))
