<a href="https://colab.research.google.com/github/pray-ash911/Sentiment-Analysis-using-logistic-regression-and-distilbert/blob/main/Sentiment_analysis_using_two_diff_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# PART 1: SETUP AND INSTALLATION

# Install required libraries
!pip install transformers datasets torch scikit-learn matplotlib seaborn wordcloud pandas numpy nltk kagglehub -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import os
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, precision_score, recall_score

# Deep Learning libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Text preprocessing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Visualization
from wordcloud import WordCloud

print(" All libraries imported successfully!")

# Download ALL required NLTK data
print("\ Downloading NLTK data files...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-eng', quiet=True)
nltk.download('punkt_tab', quiet=True)
print("NLTK data downloaded successfully!")

  print("\ Downloading NLTK data files...")


 All libraries imported successfully!
\ Downloading NLTK data files...
NLTK data downloaded successfully!


[nltk_data] Error loading omw-eng: Package 'omw-eng' not found in
[nltk_data]     index


In [2]:
# PART 2: DATASET LOADING


print("Downloading Sentiment140 dataset from Kaggle...")

import kagglehub

# Download dataset
path = kagglehub.dataset_download("kazanova/sentiment140")
print(f"Dataset downloaded to: {path}")

# Find the CSV file in the downloaded directory
csv_path = None
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.csv'):
            csv_path = os.path.join(root, file)
            print(f"Found CSV file: {csv_path}")
            break
    if csv_path:
        break

# Load the dataset
df = pd.read_csv(csv_path,
                 encoding='latin-1',
                 header=None,
                 names=['target', 'id', 'date', 'flag', 'user', 'text'])

print(f" Dataset loaded successfully → {df.shape[0]:,} tweets")
print(f"   Columns: {list(df.columns)}")

# Rename 'target' to 'label' for consistency
df = df.rename(columns={'target': 'label'})

# Convert labels: 0=negative, 4=positive → 0=negative, 1=positive
df['label'] = df['label'].map({0: 0, 4: 1})

# Display dataset info
print("\ Dataset Information:")
print(f"   Total samples: {len(df):,}")
print(f"   Positive tweets (1): {df['label'].sum():,}")
print(f"   Negative tweets (0): {len(df) - df['label'].sum():,}")
print(f"   Class balance: {(df['label'].sum() / len(df) * 100):.1f}% positive")

# Show first few samples
print("\ Sample tweets:")
print(df.head())

Downloading Sentiment140 dataset from Kaggle...
Using Colab cache for faster access to the 'sentiment140' dataset.
Dataset downloaded to: /kaggle/input/sentiment140
Found CSV file: /kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv
 Dataset loaded successfully → 1,600,000 tweets
   Columns: ['target', 'id', 'date', 'flag', 'user', 'text']
\ Dataset Information:
   Total samples: 1,600,000
   Positive tweets (1): 800,000
   Negative tweets (0): 800,000
   Class balance: 50.0% positive
\ Sample tweets:
   label          id                          date      flag             user  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           