####  VADER (Valence Aware Dictionary and sEntiment Reasoner) pre-trained model for sentiment analysis 

#### Use the SentimentIntensityAnalyzer class from the library to analyze the sentiment of your text data

####  VADER is a ready-to-use, lexicon- and rule-based model: its sentiment lexicon was built and validated in advance, so it already understands the sentiment of words and phrases. You can directly use it to analyze the sentiment of your text data without needing to train it on your specific dataset.

#### Studies have shown VADER can outperform individual human raters in classifying sentiment on social media text.
#### On that task it achieves an F1 score of 0.96, meaning it's very good at correctly identifying positive, neutral, and negative sentiment.

#### Reported accuracy can vary depending on the testing method. Some sources say it can reach over 90% accuracy on specific tasks,
#### while others report accuracy in the 60% range for overall sentiment analysis.

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Function for preprocessing text data
def preprocess_text(df, text_column):
    """Clean a text column of ``df`` and add token / cleaned-text columns.

    The frame is modified in place and also returned. ``df[text_column]`` is
    overwritten with the lowercased text, stripped of non-alphanumeric
    characters, standalone numbers and English stopwords. Four columns are
    added:

    * ``token`` -- the cleaned text split with ``word_tokenize``
    * ``stemmed_token`` -- Porter stems of ``token``
    * ``lemmatized_token`` -- WordNet lemmas of ``stemmed_token``
    * ``cleaned_text`` -- ``lemmatized_token`` joined with single spaces

    Missing cells (NaN / None) are treated as empty strings and any other
    non-string value is converted with ``str``; previously such cells made
    ``re.sub`` raise ``TypeError``.

    Args:
        df: pandas DataFrame holding the text.
        text_column: Name of the column in ``df`` to clean.

    Returns:
        The same ``df`` object, with the columns described above.
    """
    special_chars = re.compile(r'[^a-zA-Z0-9\s]')
    standalone_numbers = re.compile(r'\b\d+\b')

    # Download stopwords list
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    def clean_cell(value):
        # Empty CSV cells are loaded as NaN (a float), which the regex
        # functions cannot handle, so map them to an empty string up front.
        if pd.isna(value):
            return ''
        lowered = str(value).lower()
        without_symbols = special_chars.sub('', lowered)
        without_numbers = standalone_numbers.sub('', without_symbols)
        # split()/join also collapses the whitespace left by the removals
        return ' '.join(
            word for word in without_numbers.split() if word not in stop_words
        )

    df[text_column] = df[text_column].apply(clean_cell)

    # Tokenize text
    nltk.download('punkt')
    df['token'] = df[text_column].apply(word_tokenize)

    # Stemming
    stemmer = PorterStemmer()
    df['stemmed_token'] = df['token'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )

    # Lemmatization
    # NOTE(review): the lemmatizer is fed the stems, not the original tokens;
    # kept as-is to preserve the existing output -- confirm this is intended.
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    df['lemmatized_token'] = df['stemmed_token'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    )

    # Combine tokens into cleaned text
    df['cleaned_text'] = df['lemmatized_token'].apply(' '.join)

    return df

# Read the raw comments (decoded as Latin-1) and run the cleaning pipeline
# over the 'comment' column in one step.
data = preprocess_text(
    pd.read_csv("hate.csv", encoding='latin1'),
    'comment',
)


[nltk_data] Downloading package stopwords to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Module-level VADER analyzer used by the scoring helper below
analyzer = SentimentIntensityAnalyzer()


# Thin wrapper so the scorer can be passed to Series.apply
def get_sentiment_scores(text):
    """Return the scores produced by ``analyzer.polarity_scores`` for ``text``."""
    return analyzer.polarity_scores(text)

# Score every cleaned comment; each cell holds the full score mapping
data['sentiment_scores'] = data['cleaned_text'].map(get_sentiment_scores)

# Pull the 'compound' entry out of each score mapping
data['compound_score'] = data['sentiment_scores'].map(
    lambda scores: scores['compound']
)


def _label_from_compound(score):
    """Binary label: a compound score at or above zero counts as positive."""
    # NOTE(review): a score of exactly 0 is labelled 'positive' here, while
    # the interactive check later in this file uses +/-0.05 cut-offs with a
    # neutral band -- confirm which rule is intended.
    if score >= 0:
        return 'positive'
    return 'negative'


data['sentiment'] = data['compound_score'].map(_label_from_compound)

# Hold out 20% of the rows, with a fixed seed for reproducibility.
# NOTE(review): the labels are VADER's own output, not ground truth.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon resource via NLTK's downloader
nltk.download('vader_lexicon')

# Rebind `analyzer` to NLTK's SentimentIntensityAnalyzer (imported just above
# from nltk.sentiment.vader); this replaces the vaderSentiment-based instance
# created earlier in the file.
analyzer = SentimentIntensityAnalyzer()

# Function to preprocess text
# NOTE: this reuses the name of the DataFrame-based preprocess_text(df,
# text_column) defined earlier in the file and shadows it from here on.
def preprocess_text(text):
    """Normalise a single sentence before it is scored.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
    that are not alphanumeric (punctuation) and English stopwords are dropped,
    and the remaining tokens are joined with single spaces.

    Negation tokens are kept on purpose. NLTK's English stopword list contains
    "no", "nor" and "not", and the tokenizer splits contractions such as
    "don't" into "do" + "n't" (which is not alphanumeric). Removing them
    turned "I do not love you" into "love", so the sentence was scored as
    positive.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    negators = {"no", "nor", "not", "n't"}

    # Tokenize the lowercased text into words/tokens
    tokens = nltk.word_tokenize(text.lower())

    # Stopwords to drop, minus the negators the sentiment scorer needs
    stop_words = set(nltk.corpus.stopwords.words('english')) - negators

    kept = [
        word
        for word in tokens
        if (word.isalnum() or word in negators) and word not in stop_words
    ]

    # Join tokens back into a single string
    return ' '.join(kept)

# Read one sentence from the user and clean it
user_input = input("Enter a sentence: ")
preprocessed_input = preprocess_text(user_input)

# Score the cleaned sentence with VADER
sentiment_scores = analyzer.polarity_scores(preprocessed_input)
compound = sentiment_scores['compound']

# Map the compound score to a verdict: +/-0.05 bounds the neutral band
if compound <= -0.05:
    verdict = "Overall sentiment: Negative"
elif compound >= 0.05:
    verdict = "Overall sentiment: Positive"
else:
    verdict = "Overall sentiment: Neutral"

print(verdict)


[nltk_data] Downloading package vader_lexicon to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Enter a sentence:  I love you


Overall sentiment: Positive
