In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load your COVID-19 tweets dataset with labeled sentiment
# Replace 'your_dataset.csv' with the actual dataset file path
data = pd.read_csv('/content/finalSentimentdata2 - Copy - Copy.csv')

In [4]:
data.columns

Index(['number', 'sentiment', 'text'], dtype='object')

In [5]:
data.head()

Unnamed: 0,number,sentiment,text
0,3204,sad,agree the poor in india are treated badly thei...
1,1431,joy,if only i could have spent the with this cutie...
2,654,joy,will nature conservation remain a priority in ...
3,2530,sad,coronavirus disappearing in italy show this to...
4,2296,sad,uk records lowest daily virus death toll since...


In [6]:
data.tail()

Unnamed: 0,number,sentiment,text
3085,2579,sad,today at 02 30pm a 54 year old bangladeshi mal...
3086,3579,anger,corona virus i implore that you cease activity...
3087,221,joy,issa date once lockdown ends inshaallah (and c...
3088,2705,sad,the death toll due to covid 19 rose to 31 in j...
3089,2962,sad,the rates are become barrier for poor people t...


In [7]:
data.describe()

Unnamed: 0,number
count,3090.0
mean,2689.072816
std,1438.624297
min,3.0
25%,1368.25
50%,3030.5
75%,3949.75
max,4722.0


In [8]:
# Preprocessing (cleaning, tokenization, etc.) can be performed here

# Split the dataset into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
X_val = tfidf_vectorizer.transform(val_data['text'])
X_test = tfidf_vectorizer.transform(test_data['text'])

In [9]:
# Define labels
y_train = train_data['sentiment']
y_val = val_data['sentiment']
y_test = test_data['sentiment']

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [10]:
# Predict on the validation set
val_preds = classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, val_preds)
print(f'Validation Accuracy: {val_accuracy:.2f}')

# Predict on the test set
test_preds = classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, test_preds)
print(f'Test Accuracy: {test_accuracy:.2f}')

# You can also print a classification report for more detailed metrics
print(classification_report(y_test, test_preds))

Validation Accuracy: 0.64
Test Accuracy: 0.65
              precision    recall  f1-score   support

       anger       0.57      0.54      0.56       156
        fear       0.59      0.57      0.58       164
         joy       0.71      0.72      0.71       144
         sad       0.72      0.76      0.74       154

    accuracy                           0.65       618
   macro avg       0.65      0.65      0.65       618
weighted avg       0.64      0.65      0.65       618



In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [12]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    # Add more models here
}


In [13]:
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    results[model_name] = accuracy  # Store accuracy in results dictionary

    print(f'{model_name} Accuracy: {accuracy:.2f}')

# Optionally, you can print or plot the results for comparison
print(results)


Random Forest Accuracy: 0.60
Gradient Boosting Accuracy: 0.63
SVM Accuracy: 0.65
{'Random Forest': 0.5954692556634305, 'Gradient Boosting': 0.6294498381877023, 'SVM': 0.6488673139158576}


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load your dataset
# Replace 'your_dataset.csv' with your dataset file path
data = pd.read_csv('/content/finalSentimentdata2 - Copy - Copy.csv')

# Split the dataset into features (text) and labels (sentiment)
text_data = data['text']
labels = data['sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.2, random_state=42)

# TF-IDF Vectorization (You can use other methods as well)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
}

rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Get the best model from the grid search
best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.62


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

# Load your dataset (replace 'your_dataset.csv' with your dataset path)
data = pd.read_csv('/content/finalSentimentdata2 - Copy - Copy.csv')

In [15]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

# Load your dataset
data = pd.read_csv('/content/finalSentimentdata2 - Copy - Copy.csv')  # Make sure to replace 'your_dataset.csv' with your actual file path

# Replace missing values with the mean of the 'number' column
imputer = SimpleImputer(strategy='mean')
data['number'] = imputer.fit_transform(data[['number']])

# Outlier Detection
# We'll use Isolation Forest as an example for outlier detection on 'number' column
outlier_detector = IsolationForest(contamination=0.05)
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress the warning
outliers = outlier_detector.fit_predict(data[['number']])
data['is_outlier'] = outliers

# Feature Engineering
# Create a new feature by scaling the 'number' column
scaler = StandardScaler()
data['scaled_number'] = scaler.fit_transform(data[['number']])

# Feature Selection
# Create a new feature using SelectKBest for feature selection
feature_selector = SelectKBest(score_func=f_classif, k='all')  # Set k='all'
selected_features = feature_selector.fit_transform(data[['number']], data['sentiment'])

# Add the selected features to the DataFrame
data['selected_feature'] = selected_features  # There's only one selected feature in this case

# Save the cleaned and transformed


In [16]:
from textblob import TextBlob

# Function to perform sentiment analysis
def analyze_sentiment(text):
    analysis = TextBlob(text)

    # Check the sentiment polarity
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity < 0:
        return "Negative"
    else:
        return "Neutral"

# Main program
if __name__ == "__main__":
    while True:
        # Take user input
        user_input = input("Enter text (or 'exit' to quit): ")

        # Check if the user wants to exit
        if user_input.lower() == 'exit':
            break

        # Perform sentiment analysis
        sentiment = analyze_sentiment(user_input)
        print(f"Sentiment: {sentiment}")


Enter text (or 'exit' to quit): sad
Sentiment: Negative
Enter text (or 'exit' to quit): happy
Sentiment: Positive
Enter text (or 'exit' to quit): neutral
Sentiment: Neutral
Enter text (or 'exit' to quit): exit


In [17]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download VADER lexicon (run only once)
nltk.download("vader_lexicon")

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function for sentiment analysis
def analyze_sentiment(input_text):
    # Get sentiment scores
    sentiment_scores = analyzer.polarity_scores(input_text)

    # Determine sentiment based on compound score
    compound_score = sentiment_scores["compound"]
    sentiment = "Positive" if compound_score >= 0.05 else "Negative" if compound_score <= -0.05 else "Neutral"

    return sentiment


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [18]:
# Take user input and perform sentiment analysis
user_input = input("Enter a sentence to analyze sentiment: ")
sentiment_result = analyze_sentiment(user_input)

print(f"Sentiment Analysis Result: {sentiment_result}")


Enter a sentence to analyze sentiment: sad
Sentiment Analysis Result: Negative


In [19]:
# Convert text to lowercase
data['text'] = data['text'].str.lower()

# Remove special characters and punctuation
data['text'] = data['text'].str.replace('[^\w\s]', '')


  data['text'] = data['text'].str.replace('[^\w\s]', '')


In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Remove stopwords
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Stemming
stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'])


In [22]:
# Example: Calculate text length
data['text_length'] = data['text'].apply(lambda x: len(x))

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
}

# Initialize and perform grid search
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(tfidf_matrix, data['sentiment'])


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train a Random Forest classifier (or other ensemble method)
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
rf_classifier.fit(tfidf_matrix, data['sentiment'])


In [25]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train a Random Forest classifier (or other ensemble method)
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
rf_classifier.fit(tfidf_matrix, data['sentiment'])


#Here we got **89 percent accuracy**

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions
predictions = rf_classifier.predict(tfidf_matrix)

# Calculate evaluation metrics
accuracy = accuracy_score(data['sentiment'], predictions)
precision = precision_score(data['sentiment'], predictions, average='weighted')
recall = recall_score(data['sentiment'], predictions, average='weighted')
f1 = f1_score(data['sentiment'], predictions, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.89
Precision: 0.89
Recall: 0.89
F1 Score: 0.89


In [27]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Download VADER lexicon (run only once)
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Download NLTK data
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Load and preprocess your dataset
data = pd.read_csv('/content/finalSentimentdata2 - Copy - Copy.csv', encoding='latin-1')  # Replace with your dataset path

# Ensure your dataset is clean and well-preprocessed
# Remove stopwords
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Stemming
stemmer = PorterStemmer()
data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Split the dataset into features (text) and labels (sentiment)
text_data = data['text']
labels = data['sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.2, random_state=42)

# Feature Engineering
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define and evaluate other classifiers
def evaluate_classifier(classifier, X_train, y_train, X_test, y_test):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    return accuracy, precision, recall, f1, conf_matrix

# Initialize and evaluate Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
accuracy_dt, precision_dt, recall_dt, f1_dt, conf_matrix_dt = evaluate_classifier(dt_classifier, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Initialize and evaluate Random Forest
rf_classifier = RandomForestClassifier(random_state=42)
accuracy_rf, precision_rf, recall_rf, f1_rf, conf_matrix_rf = evaluate_classifier(rf_classifier, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Initialize and evaluate Naive Bayes
nb_classifier = MultinomialNB()
accuracy_nb, precision_nb, recall_nb, f1_nb, conf_matrix_nb = evaluate_classifier(nb_classifier, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Initialize and evaluate Support Vector Machine (SVM)
svm_classifier = SVC(kernel='linear', random_state=42)
accuracy_svm, precision_svm, recall_svm, f1_svm, conf_matrix_svm = evaluate_classifier(svm_classifier, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Initialize and evaluate Logistic Regression
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)
accuracy_lr, precision_lr, recall_lr, f1_lr, conf_matrix_lr = evaluate_classifier(lr_classifier, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Print evaluation metrics for each classifier
print("Metrics for Other Classifiers:")
print(f"Decision Tree - Accuracy: {accuracy_dt:.2f}, Precision: {precision_dt:.2f}, Recall: {recall_dt:.2f}, F1 Score: {f1_dt:.2f}")
print(f"Random Forest - Accuracy: {accuracy_rf:.2f}, Precision: {precision_rf:.2f}, Recall: {recall_rf:.2f}, F1 Score: {f1_rf:.2f}")
print(f"Naive Bayes - Accuracy: {accuracy_nb:.2f}, Precision: {precision_nb:.2f}, Recall: {recall_nb:.2f}, F1 Score: {f1_nb:.2f}")
print(f"SVM - Accuracy: {accuracy_svm:.2f}, Precision: {precision_svm:.2f}, Recall: {recall_svm:.2f}, F1 Score: {f1_svm:.2f}")
print(f"Logistic Regression - Accuracy: {accuracy_lr:.2f}, Precision: {precision_lr:.2f}, Recall: {recall_lr:.2f}, F1 Score: {f1_lr:.2f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Metrics for Other Classifiers:
Decision Tree - Accuracy: 0.52, Precision: 0.52, Recall: 0.52, F1 Score: 0.52
Random Forest - Accuracy: 0.63, Precision: 0.63, Recall: 0.63, F1 Score: 0.62
Naive Bayes - Accuracy: 0.68, Precision: 0.69, Recall: 0.68, F1 Score: 0.68
SVM - Accuracy: 0.69, Precision: 0.69, Recall: 0.69, F1 Score: 0.69
Logistic Regression - Accuracy: 0.70, Precision: 0.70, Recall: 0.70, F1 Score: 0.70
