<a href="https://colab.research.google.com/github/sparshdubey95/MLmodels/blob/main/Sentiment_Analysis_on_1_6_Million_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Column names for the Sentiment140 CSV. They are passed via `names=`, so
# pandas treats the first row of the file as data rather than as a header.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Load the dataset (decoded as latin-1)
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='latin-1', names=columns)

# Series.map turns every label that is missing from the mapping into NaN.
# Check the raw labels first so an unexpected value fails here with a clear
# message instead of surfacing later as NaN labels during model fitting.
unexpected_labels = set(df['target'].unique()) - {0, 4}
if unexpected_labels:
    raise ValueError(
        f"Unexpected target labels (expected only 0 and 4): {list(unexpected_labels)}"
    )

# Map target labels from 0 and 4 to 0 and 1 for binary classification
df['target'] = df['target'].map({0: 0, 4: 1})

# Verify the data
print("Dataset shape:", df.shape)
print("Target distribution:\n", df['target'].value_counts())

In [None]:
def clean_text(text):
    """Normalize a tweet into lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Decode HTML entities (``&amp;``, ``&quot;``, ...) so they do not
         survive as junk tokens such as "amp" or "quot" once punctuation
         is stripped.
      2. Lowercase the text.
      3. Drop URLs (anything starting with ``http`` or ``www.``).
      4. Drop @mentions and #hashtags, including the word attached to them.
      5. Drop every character that is not ``a-z`` or whitespace.
      6. Collapse runs of whitespace and trim both ends.

    Args:
        text: The raw tweet. Any non-string value (e.g. ``None`` or a NaN
            from a missing cell) is treated as an empty tweet.

    Returns:
        The cleaned string; ``""`` when nothing is left or the input was
        not a string.
    """
    import html  # local import keeps this cell self-contained

    if not isinstance(text, str):
        return ""

    text = html.unescape(text).lower()            # Decode entities, then lowercase
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs (with or without scheme)
    text = re.sub(r'@\w+', '', text)              # Remove mentions
    text = re.sub(r'#\w+', '', text)              # Remove hashtags (marker and word)
    text = re.sub(r'[^a-z\s]', '', text)          # Keep only letters and whitespace
    return re.sub(r'\s+', ' ', text).strip()      # Collapse gaps left by the removals

# Quick check of clean_text on the first tweet (the same function is later
# handed to the vectorizer as its preprocessor)
sample_text = df['text'].iloc[0]
print("Sample cleaning:")
print(f"Original: {sample_text}")
print(f"Cleaned: {clean_text(sample_text)}")

In [None]:
# Features come from the 'text' column, labels from the 'target' column
X, y = df['text'], df['target']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# TF-IDF vectorizer: each tweet is passed through clean_text before tokenizing,
# the vocabulary is capped at 5000 terms, and scikit-learn's built-in English
# stop-word list is removed.
# NOTE(review): the built-in 'english' list is believed to include negations
# such as "not" and "no" - confirm that dropping them is acceptable for
# sentiment classification.
vectorizer = TfidfVectorizer(preprocessor=clean_text, max_features=5000, stop_words='english')

# Fit on the training tweets only, then reuse the fitted vectorizer for the test tweets
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF training matrix shape: {X_train_tfidf.shape}")
print(f"TF-IDF testing matrix shape: {X_test_tfidf.shape}")

In [None]:
# Build and fit the logistic-regression classifier on the TF-IDF training
# features (fit returns the estimator, so construction and training chain)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the test features
y_pred_lr = lr_model.predict(X_test_tfidf)

In [None]:
# Build and fit the multinomial naive Bayes classifier on the TF-IDF training
# features (fit returns the estimator, so construction and training chain)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the test features
y_pred_nb = nb_model.predict(X_test_tfidf)

In [None]:
# Accuracy of the logistic-regression predictions on the test labels
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy}")

# Per-class report; label 0 is displayed as Negative and label 1 as Positive
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_lr, target_names=['Negative', 'Positive']),
    sep="\n",
)

In [None]:
# Accuracy of the naive Bayes predictions on the test labels
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {nb_accuracy}")

# Per-class report; label 0 is displayed as Negative and label 1 as Positive
print(
    "Naive Bayes Classification Report:",
    classification_report(y_test, y_pred_nb, target_names=['Negative', 'Positive']),
    sep="\n",
)