In [1]:
# !pip install pandas numpy scikit-learn nltk

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load Sample Dataset
# For demo: using a small open Twitter dataset
data = pd.read_csv("https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv")
data = data[['tweet', 'label']]
data.columns = ['text', 'label']

# 2. Preprocess Text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#','', text)  # for Removing @mentions and hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # for Removing punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

data['clean_text'] = data['text'].apply(clean_text)

# 3. Convert Text to TF-IDF Features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['clean_text'])
y = data['label']

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 6. Evaluate the Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 7. Test the Model
def predict_sentiment(sentence):
    sentence_clean = clean_text(sentence)
    vector = vectorizer.transform([sentence_clean])
    prediction = model.predict(vector)[0]
    sentiment = 'Positive' if prediction == 0 else 'Negative'
    return sentiment

# Testing example
test_text = "I love this product! It's fantastic."
print(f"Input: {test_text}")
print("Predicted Sentiment:", predict_sentiment(test_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.9496324104489285
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.91      0.32      0.48       456

    accuracy                           0.95      6393
   macro avg       0.93      0.66      0.73      6393
weighted avg       0.95      0.95      0.94      6393

Input: I love this product! It's fantastic.
Predicted Sentiment: Positive
