In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used further down: the English stop-word list read
# through `stopwords.words('english')` and the Punkt models behind `word_tokenize`.
# Recent NLTK releases (3.9+) load the tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' one, so both are requested to keep
# `word_tokenize` working regardless of the installed NLTK version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)


In [None]:
# Load dataset
# The file is read with header=None (no header row is consumed), so the column
# labels are attached explicitly afterwards.
COLUMN_NAMES = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv('sentiment140.csv', header=None, encoding='ISO-8859-1')
df.columns = COLUMN_NAMES

# Preview the top rows of the frame
print(df.head())


In [None]:
# Data Cleaning
# Collapse the raw labels to binary classes: 0 = negative, 1 = positive.
# The identity entry `1: 1` keeps this cell safe to re-run: without it, a second
# execution would turn the already-converted 1s into NaN because 1 is not a key
# of the mapping. Any value outside the mapping still becomes NaN.
df['target'] = df['target'].map({0: 0, 1: 1, 4: 1})

# Display the distribution of sentiments
print(df['target'].value_counts())

# Drop the metadata columns that the rest of the notebook never reads.
# errors='ignore' turns the drop into a no-op when the columns are already gone
# (for example when this cell is executed twice on the same kernel).
df = df.drop(columns=['id', 'date', 'flag', 'user'], errors='ignore')


In [None]:
# Basic Statistics and Visualizations
# Bar chart of how many tweets fall into each sentiment class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Replace the numeric class codes on the x axis with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Negative', 'Positive'])
plt.show()


In [None]:
# Sample Tweets Visualization
# Pull ten random raw tweets and print them
raw_tweet_sample = df['text'].sample(n=10)
print(raw_tweet_sample)


In [None]:
# Text Cleaning Function
# Cache for the English stop-word set. It is filled on the first call to
# `clean_text` so the NLTK word list is loaded once instead of once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; the list returned by
        # `stopwords.words` would be scanned linearly for every token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Processing steps, in order:
      1. lowercase the text;
      2. strip URLs (tokens starting with ``http`` or ``www``);
      3. strip whole ``@mention`` tokens and bare ``#`` characters
         (the hashtag word itself is kept);
      4. strip every character that is neither a word character nor whitespace;
      5. strip runs of digits;
      6. tokenize with ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove @mentions and '#' symbols
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text_tokens = word_tokenize(text)  # Tokenize text
    stop_words = _english_stopwords()  # Looked up once per call, not once per token
    filtered_words = [word for word in text_tokens if word not in stop_words]  # Remove stopwords
    return " ".join(filtered_words)

# Run every raw tweet through clean_text and store the result next to the original
cleaned_series = df['text'].apply(clean_text)
df['clean_text'] = cleaned_series

# Print ten randomly chosen cleaned tweets as a sanity check
cleaned_preview = df['clean_text'].sample(n=10)
print(cleaned_preview)


In [None]:
# Split Data into Training and Testing Sets
# Features are the cleaned tweets; labels are the binary sentiment targets.
X, y = df['clean_text'], df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# Vectorize Text Data
# TF-IDF features with max_features=5000. The vectorizer is fitted on the
# training split only; the test split is transformed with that fitted instance.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Report the shape of both feature matrices
print(*(matrix.shape for matrix in (X_train_vec, X_test_vec)))


In [None]:
# Train Logistic Regression Model
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Signal that fitting has finished
print("Model training completed.")


In [None]:
# Evaluate Model Performance
# Score the held-out split
y_pred = model.predict(X_test_vec)

# Overall accuracy, shown with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class metrics from classification_report
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
class_names = ['Negative', 'Positive']
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Sample Predictions
# Draw ten cleaned tweets from the test split and classify them
sample_tweets = X_test.sample(n=10)
sample_features = vectorizer.transform(sample_tweets)
sample_predictions = model.predict(sample_features)

# Print each tweet together with a readable name for its predicted class
for tweet, sentiment in zip(sample_tweets, sample_predictions):
    if sentiment == 1:
        label = 'Positive'
    else:
        label = 'Negative'
    print(f"Tweet: {tweet}\nPredicted Sentiment: {label}\n")


In [None]:
# Word Cloud Visualization
from wordcloud import WordCloud


def plot_sentiment_wordcloud(frame, label, title):
    """Build and display a word cloud for the tweets of one sentiment class.

    Args:
        frame: DataFrame providing a 'target' column (class label) and a
            'clean_text' column (cleaned tweet text).
        label: Value of 'target' selecting the rows to include.
        title: Title drawn above the figure.

    Returns:
        A ``(corpus, cloud)`` tuple: the selected tweets concatenated into one
        space-separated string, and the ``WordCloud`` generated from it.
    """
    # Join every cleaned tweet of the requested class into a single string
    corpus = frame.loc[frame['target'] == label, 'clean_text'].str.cat(sep=' ')
    cloud = WordCloud(width=800, height=400, max_font_size=100, collocations=False).generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()
    return corpus, cloud


# Generate word cloud for positive tweets
positive_tweets, wordcloud_positive = plot_sentiment_wordcloud(
    df, 1, 'Word Cloud for Positive Tweets'
)

# Generate word cloud for negative tweets
negative_tweets, wordcloud_negative = plot_sentiment_wordcloud(
    df, 0, 'Word Cloud for Negative Tweets'
)
