<a href="https://colab.research.google.com/github/sudharshanreddy4567/Sentemintal_analysis/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Fetch the NLTK data packages needed further down: the English stopword
# list and the WordNet data (with its 'omw-1.4' companion package).
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Load the Sentiment140 dataset
# Dataset can be downloaded from Kaggle: https://www.kaggle.com/datasets/kazanova/sentiment140
# The file is read without a header row; its columns are, in order:
# 0 - target (0 = negative, 2 = neutral, 4 = positive)
# 1 - ids
# 2 - date
# 3 - flag
# 4 - user
# 5 - text

# Define column names
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Share of the full dataset kept for this demonstration (faster processing)
SAMPLE_FRACTION = 0.1

# Load the dataset (adjust the path as needed)
try:
    df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', names=columns)
except FileNotFoundError:
    print("Please download the dataset from Kaggle and update the file path.")
    # Tiny stand-in so the rest of the script can still run end to end.
    # It is deliberately NOT subsampled: 10% of three rows rounds to zero rows,
    # which would leave nothing to train or evaluate on.
    df = pd.DataFrame({
        'text': ['I love this product!', 'This is terrible.', 'It is okay, not great.'],
        'target': [4, 0, 2],
    })
else:
    print("Dataset loaded successfully!")
    # Subsample only the real dataset; the fixed seed keeps the sample reproducible.
    df = df.sample(frac=SAMPLE_FRACTION, random_state=42)

# Map target values to sentiment categories
# In Sentiment140, 0=negative, 2=neutral, 4=positive
df['sentiment'] = df['target'].map({0: 'negative', 2: 'neutral', 4: 'positive'})

# Text preprocessing resources, created once here and reused by
# preprocess_text() for every document.
stop_words = set(stopwords.words('english'))  # set -> fast membership tests
lemmatizer = WordNetLemmatizer()

# Any character that is neither an ASCII letter nor whitespace.
# Compiled once because preprocess_text() runs on every row of the dataset.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize one raw document into a space-separated string of tokens.

    Steps, in order: strip every character that is not an ASCII letter or
    whitespace, lowercase, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # NOTE: the previous call was re.sub(pattern, '', text, re.I|re.A). The
    # fourth positional parameter of re.sub is `count`, not `flags`, so the
    # flags were never applied and removal silently stopped after 258
    # substitutions. The character class already lists both cases, so no
    # flags are needed; dropping the stray argument removes the cap while
    # keeping the per-character behaviour unchanged.
    text = _NON_LETTER_RE.sub('', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize on whitespace
    tokens = text.split()
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing to text data
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Features are the cleaned strings; labels are collapsed to binary:
# positive (4) and neutral (2) -> 1, negative (0) -> 0
X = df['cleaned_text']
y = df['target'].replace({4: 1, 2: 1, 0: 0})

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF, limited to the 5000 highest-ranked terms.
# The vocabulary is fitted on the training split only and then reused for the
# test split. float32 output halves the size of the dense arrays built below
# compared with the float64 default; Keras' default float type is float32.
tfidf = TfidfVectorizer(max_features=5000, dtype=np.float32)

# The Dense layers are fed dense arrays, so densify the sparse TF-IDF matrices.
# NOTE(review): this is the most memory-hungry step of the script
# (rows x 5000 floats per split) - keep the sample size in check.
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Build Artificial Neural Network (ANN) model: a stack of fully connected
# layers narrowing 256 -> 128 -> 64, with dropout after the first two, ending
# in a single sigmoid unit for the binary label.
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model: Adam optimizer, binary cross-entropy loss, and accuracy
# tracked as the reported metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Early stopping to prevent overfitting: stop once val_loss has not improved
# for 3 consecutive epochs and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# Validation data is carved out of the TRAINING split (its last 10%) rather
# than reusing the test split: early stopping and restore_best_weights pick the
# model based on val_loss, so validating on the test set would tune the model
# on the very data later used to report final metrics.
history = model.fit(
    X_train_tfidf,
    np.asarray(y_train),  # plain array so the labels are sliced positionally with the features
    validation_split=0.1,
    epochs=15,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Plot training history as two side-by-side panels (accuracy, then loss).
# Each row: history key, train label, validation label, title, y-axis label.
_panels = (
    ('accuracy', 'Train Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Train Loss', 'Validation Loss', 'Model Loss', 'Loss'),
)

plt.figure(figsize=(12, 6))
for _position, (_metric, _train_label, _val_label, _title, _ylabel) in enumerate(_panels, start=1):
    plt.subplot(1, 2, _position)
    plt.plot(history.history[_metric], label=_train_label)
    # Keras stores the validation counterpart of a metric under a 'val_' prefix
    plt.plot(history.history['val_' + _metric], label=_val_label)
    plt.title(_title)
    plt.ylabel(_ylabel)
    plt.xlabel('Epoch')
    plt.legend()

plt.tight_layout()
plt.show()

# Evaluate the model on the held-out test split
y_pred_prob = model.predict(X_test_tfidf)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate metrics (class 1 = positive/neutral is the "positive" class).
# zero_division=0 reports an undefined precision/recall/F1 as 0 without
# emitting a warning, e.g. when class 1 is never predicted.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Classification report.
# labels is pinned to both classes so the two target_names always line up;
# without it the report raises ValueError whenever only one class shows up
# in y_test / y_pred (easy to hit on a very small test split).
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['negative', 'positive/neutral'],
    zero_division=0,
))

# Example prediction on a few hand-written sentences
sample_texts = [
    "I love this product! It's amazing!",
    "This is the worst experience ever.",
    "The service was okay, nothing special.",
]

# Send the samples through the same cleaning and (already fitted) TF-IDF
# steps that were applied to the training data
cleaned_samples = list(map(preprocess_text, sample_texts))
sample_tfidf = tfidf.transform(cleaned_samples).toarray()

# Predict sentiment: sigmoid outputs above 0.5 become class 1, the rest class 0
sample_probs = model.predict(sample_tfidf)
sample_preds = (sample_probs > 0.5).astype(int)

print("\nSample Predictions:")
for text, pred in zip(sample_texts, sample_preds):
    if pred == 1:
        sentiment = 'positive/neutral'
    else:
        sentiment = 'negative'
    print(f"Text: {text}\nSentiment: {sentiment}\n")