<a href="https://colab.research.google.com/github/s-im-ran/Projects/blob/main/Sentiment_Analysis_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment analysis using Logistic Regression

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np


In [None]:
# Data needed by process(): the English stop-word list and the Punkt models
# that word_tokenize uses for sentence splitting.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' package; fetch both so word_tokenize works on either.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _porter_stemmer():
    """Return one shared PorterStemmer instance."""
    return PorterStemmer()


def process(tweet):
    """Normalise a tweet into a space-separated string of stemmed tokens.

    Steps, in order: tokenize with ``word_tokenize``, lowercase every token,
    drop English stop words, Porter-stem what is left, and join with spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing is left).
    """
    # Both resources are cached: this function is called once per tweet, and
    # re-reading the stop-word corpus each time is pure overhead.
    stop_words = _english_stop_words()
    stemmer = _porter_stemmer()

    lowered = (token.lower() for token in word_tokenize(tweet))
    stemmed_tokens = [
        stemmer.stem(token) for token in lowered if token not in stop_words
    ]
    return ' '.join(stemmed_tokens)

In [None]:
def build_freqs(tweets, sy):
    """Count occurrences of each (processed word, label) pair.

    Every tweet is run through ``process`` and split on whitespace; each
    resulting word is tallied under the label paired with that tweet.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    sy : iterable
        Labels, paired positionally with ``tweets`` (pairing stops at the
        shorter of the two, as with ``zip``).

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of times it was seen.
    """
    counts = {}
    for label, tweet in zip(sy, tweets):
        for word in process(tweet).split():
            key = (word, label)
            counts[key] = counts.get(key, 0) + 1
    return counts


In [None]:
# nltk is already imported in the first cell; it is imported again here so
# this cell also runs on its own.
import nltk
# Download the twitter_samples corpus, which provides the positive/negative
# tweet files read in the next cell.
nltk.download("twitter_samples")
from nltk.corpus import twitter_samples

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [None]:

# Tweet texts from the twitter_samples corpus, one collection per sentiment class.
pos = twitter_samples.strings('positive_tweets.json')
neg = twitter_samples.strings('negative_tweets.json')

In [None]:
# The first 4000 tweets of each class are used for training; the rest are held out.
train_pos, test_pos = pos[:4000], pos[4000:]
train_neg, test_neg = neg[:4000], neg[4000:]


In [None]:
# Positive tweets first, then negative ones; train_y / test_y follow the same order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [None]:
# Label 1 for every positive tweet, 0 for every negative one, in the same
# positive-then-negative order as train_x / test_x.
train_y = [1 for _ in train_pos] + [0 for _ in train_neg]
test_y = [1 for _ in test_pos] + [0 for _ in test_neg]

In [None]:
# Build the (word, label) -> count table from the training tweets only.
freqs = build_freqs(train_x, train_y)
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 18658


In [None]:
# Number of distinct (word, label) pairs seen in the training tweets.
print(len(freqs))

18658


In [None]:
# Show one training tweet before and after process().
print(f'A positive tweet: \n {train_x[22]}')
print(f'Processed version of the tweet: \n {process(train_x[22])}')

A positive tweet: 
 @gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))
Processed version of the tweet: 
 @ gculloty87 yeah suppos lol ! chat bit x : ) )


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Bag-of-words features. The vocabulary is learned from the raw training tweets
# (train_x), not from process() output.
vectorizer = CountVectorizer()
train_x_vectorized = vectorizer.fit_transform(train_x)
# transform (not fit_transform): the test tweets are mapped onto the vocabulary
# learned from the training tweets.
test_x_vectorized = vectorizer.transform(test_x)

In [None]:
# Fit a logistic-regression classifier with default settings on the training counts.
model = LogisticRegression()
model.fit(X=train_x_vectorized, y=train_y)

In [None]:
# Predicted labels for the held-out tweets.
pred_y = model.predict(X=test_x_vectorized)

In [None]:
# Compute the three metrics on the held-out set first...
accuracy = accuracy_score(test_y, pred_y)
class_report = classification_report(test_y, pred_y)
conf_matrix = confusion_matrix(test_y, pred_y)

# ...then report them: accuracy, per-class report, confusion matrix.
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.76
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.79      0.77      1000
           1       0.78      0.73      0.75      1000

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000

Confusion Matrix:
 [[792 208]
 [272 728]]


In [None]:
# Check the sentiment of a random tweet from the testing set.
import random

random_tweet_index = random.randint(0, len(test_x) - 1)
random_tweet = test_x[random_tweet_index]
print(random_tweet)

# The vectorizer and model were fitted on raw tweets, so the raw text is passed
# here too. Running process() first would stem and filter the tokens, leaving
# words that are missing from the learned vocabulary and distorting the
# prediction.
random_tweet_vectorized = vectorizer.transform([random_tweet])
sentiment_prediction = model.predict(random_tweet_vectorized)

# predict() returns one label per input row; read the single entry explicitly.
if sentiment_prediction[0] == 1:
    print('positive sentiment')
else:
    print('negative sentiment')

Excited for the weekend :-)
negative sentiment


In [None]:
# Check the sentiment of a hand-written sentence.
inp1 = 'I love to be close by my parents.'  # alternative example; assign it to `inp` to try it
inp = 'i love school'

# The vectorizer and model were fitted on raw tweets, so the raw text is passed
# here too; running process() first would produce stemmed tokens that do not
# match the learned vocabulary.
input_vectorized = vectorizer.transform([inp])
sentiment_prediction = model.predict(input_vectorized)

# predict() returns one label per input row; the training labels are 1
# (positive) and 0 (negative).
if sentiment_prediction[0] == 1:
    print('Positive sentiment')
else:
    print('Negative sentiment')

Positive sentiment
