In [1]:
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [3]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return ''

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tagged = nltk.pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
    return lemmatized

def get_sentiment_score(word, pos):
    synsets = list(swn.senti_synsets(word, pos))
    if not synsets:
        return 0
    return synsets[0].pos_score() - synsets[0].neg_score()

def classify_sentiment(tokens):
    sentiment_score = sum(get_sentiment_score(word, pos) for word, pos in nltk.pos_tag(tokens))
    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'

def load_and_preprocess_data(file_path, num_samples=10000):
    df = pd.read_csv(file_path, encoding='latin-1', header=None, 
                     names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
    df = df[['sentiment', 'text']].sample(n=num_samples, random_state=42)
    df['sentiment'] = df['sentiment'].map({0: 'negative', 4: 'positive'})
    df['processed_text'] = df['text'].apply(preprocess_text)
    return df

# Load your dataset here
data = load_and_preprocess_data('data.csv')
print(data.head())

KeyError: ''

In [None]:
# Apply sentiment classification
data['predicted_sentiment'] = data['processed_text'].apply(classify_sentiment)

# Evaluate the model
print(classification_report(data['sentiment'], data['predicted_sentiment']))

# Visualize confusion matrix
cm = confusion_matrix(data['sentiment'], data['predicted_sentiment'])
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Function to analyze a new document
def analyze_document(document):
    tokens = preprocess_text(document)
    sentiment = classify_sentiment(tokens)
    return sentiment

# Test the function with some example documents
documents = [
    "This product is amazing! I love it and would recommend it to everyone.",
    "Terrible experience. The customer service was awful and the product didn't work.",
    "The weather today is okay, not great but not bad either."
]

for doc in documents:
    print(f"Document: {doc}")
    print(f"Sentiment: {analyze_document(doc)}\n")