# In [None]:

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Read the raw Twitter dataset from disk into a DataFrame

data_path = '/mnt/data/Twitter_Data.csv'
df = pd.read_csv(data_path)

# Show the leading rows so the column layout can be checked by eye

print("Dataset preview:", df.head(), sep="\n")

# Report the number of missing entries in each column

missing_per_column = df.isnull().sum()
print("\nMissing values:", missing_per_column, sep="\n")

# Preprocessing function
_ENGLISH_STOP_WORDS = None  # populated on first use by _get_stop_words()


def _get_stop_words():
    """Return the NLTK English stopword set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of content words.

    Steps, in order: lowercase the text, strip URLs (``http...`` and
    ``www....`` runs), drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK's ``word_tokenize``, and remove English
    stopwords.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN that pandas uses for a missing cell, or ``None``) is treated
            as an empty tweet.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is
        left or when ``text`` is not a string.
    """
    # Missing cells reach this function as float NaN / None when it is mapped
    # over a DataFrame column; they have no .lower(), so bail out early.
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()
    # Removing URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Removing special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    words = word_tokenize(text)
    # Removing stopwords (the set is cached instead of rebuilt per tweet)
    stop_words = _get_stop_words()
    # Joining words back to form the preprocessed text
    return ' '.join(word for word in words if word not in stop_words)

# Clean every tweet and store the result in a new 'cleaned_tweet' column
# (the raw text is assumed to live in a column named 'tweet')

df['cleaned_tweet'] = df['tweet'].map(preprocess_text)

# Show raw text, cleaned text and label side by side as a quick sanity check

preview_columns = ['tweet', 'cleaned_tweet', 'sentiment']
print("\nProcessed data preview:", df[preview_columns].head(), sep="\n")

# Write the DataFrame, including the cleaned column, to a new CSV

output_path = '/mnt/data/Preprocessed_Twitter_Data.csv'
df.to_csv(output_path, index=False)
print(f"\nPreprocessed data saved to {output_path}")

# Exploratory Data Analysis

# Count the tweets per sentiment label, then print and chart the counts
print("\nSentiment label distribution:")
label_counts = df['sentiment'].value_counts()
print(label_counts)
label_counts.plot.bar(title='Sentiment Label Distribution', color=['green', 'blue', 'orange'])
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Most frequent words for each sentiment class
# Missing labels are skipped: comparing the column to NaN selects no rows, so
# such a "class" would have no text at all.
for sentiment in df['sentiment'].dropna().unique():
    # All cleaned tweets of this class concatenated into one string
    sentiment_words = ' '.join(df.loc[df['sentiment'] == sentiment, 'cleaned_tweet'])
    word_counts = Counter(sentiment_words.split())
    print(f"\nMost common words for {sentiment} sentiment:")
    print(word_counts.most_common(10))

    # WordCloud.generate raises ValueError when it is given no words, so only
    # draw the cloud when this class actually has some text.
    if not word_counts:
        print(f"No words available for a {sentiment} word cloud; skipping.")
        continue

    # Generate and display a word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(sentiment_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {sentiment} Sentiment')
    plt.axis('off')
    plt.show()

# Sentiment distribution over time (if timestamp information is available)
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Rows: calendar date, columns: sentiment label, values: tweet count.
    # fill_value=0 turns date/label pairs with no tweets into a count of 0
    # instead of NaN, which would otherwise leave gaps in the plotted lines.
    sentiment_over_time = (
        df.groupby([df['timestamp'].dt.date, 'sentiment'])
        .size()
        .unstack(fill_value=0)
    )
    sentiment_over_time.plot(kind='line', figsize=(10, 5), title='Sentiment Distribution Over Time')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.show()

# Feature Extraction using TF-IDF
print("\nExtracting features using TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrix sparse: a dense n_samples x 5000 float array is
# almost entirely zeros, and both train_test_split and MultinomialNB accept
# sparse input directly.
# NOTE(review): the vectorizer is fitted on the full corpus before the split,
# so vocabulary and IDF weights also reflect the test tweets. Consider
# splitting first and fitting on the training part only.
X = tfidf_vectorizer.fit_transform(df['cleaned_tweet'])

# Labels (target variable)
print("\nPreparing labels...")
y = df['sentiment']

# Splitting data into training and testing sets (80% / 20%, fixed seed)
print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData split complete:")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Fit a multinomial Naive Bayes classifier on the training split
print("\nTraining Naive Bayes model...")
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the test split; the predictions feed the metrics below
print("\nEvaluating model...")
y_pred = nb_model.predict(X_test)

# Print the per-class report produced by scikit-learn
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Overall accuracy, plus precision, recall and F1 averaged with 'weighted'
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# One metric per line, each rounded to two decimals
print(
    f"\nAccuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)
