In this project, we will perform sentiment analysis of Twitter data. We will use a Twitter dataset to access tweets and apply natural language processing techniques to classify tweets as positive, negative or neutral. The sentiment analysis model will be trained on a pre-labeled dataset of tweets and will use machine learning algorithms to classify new tweets.


LOGISTIC REGRESSION


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the labelled tweets; only the 'tweet' and 'class' columns are used below.
data = pd.read_csv('/content/labeled_data.csv')

# Hold-out split: a seeded random 80% sample is the training set, and every
# row that was not drawn becomes the test set.
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

tweets_train, y_train = train_data['tweet'], train_data['class']
tweets_test, y_test = test_data['tweet'], test_data['class']

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training tweets only and then reused for the test tweets.
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform(tweets_train)
test_features = vectorizer.transform(tweets_test)

# Fit the classifier (fit() returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(train_features, y_train)

# Predict a class for every held-out tweet
predictions = model.predict(test_features)

# Per-class precision, recall and F1 score on the test set
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)


              precision    recall  f1-score   support

           0       0.42      0.23      0.30       258
           1       0.94      0.96      0.95      3879
           2       0.87      0.92      0.89       820

    accuracy                           0.91      4957
   macro avg       0.74      0.70      0.71      4957
weighted avg       0.90      0.91      0.91      4957



NGRAMS

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the data from the CSV file
data = pd.read_csv('/content/labeled_data.csv')

# Split the data into training and testing sets: a seeded random 80% sample
# for training, and the remaining rows for testing
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# Create a CountVectorizer to convert the text data into character n-grams
# (every run of 3 to 6 consecutive characters is a feature).
# No stop_words argument is passed: scikit-learn applies stop words only when
# analyzer == 'word', so with analyzer='char' it was ignored and only produced
# an "unused parameter" warning.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 6))
train_features = vectorizer.fit_transform(train_data['tweet'])
test_features = vectorizer.transform(test_data['tweet'])

# Train a logistic regression model on the training data
model = LogisticRegression(max_iter=1000)
model.fit(train_features, train_data['class'])

# Test the model on the testing data
predictions = model.predict(test_features)

# Calculate precision, recall, and F1 score for each class
report = classification_report(test_data['class'], predictions)
print(report)




              precision    recall  f1-score   support

           0       0.43      0.23      0.30       258
           1       0.94      0.96      0.95      3879
           2       0.86      0.89      0.87       820

    accuracy                           0.91      4957
   macro avg       0.74      0.69      0.71      4957
weighted avg       0.90      0.91      0.90      4957



TF-IDF + SVM CLASSIFICATION

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Translation table that deletes every character in string.punctuation.
# Built once because it is identical for every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled holder for the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


# Define the function to clean the text
def clean_text(text):
    """Strip punctuation and English stop words from ``text``.

    Args:
        text: The string to clean.

    Returns:
        The remaining whitespace-separated words joined by single spaces.
        Word casing is preserved; only the stop-word comparison is
        case-insensitive.
    """
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove stopwords. The set is fetched from a cache instead of being
    # re-read from the corpus for every tweet.
    # NOTE(review): punctuation is stripped before this lookup, so stop-word
    # entries that contain an apostrophe can no longer match -- confirm
    # whether that is intended.
    stop_words = _english_stop_words()
    words = [word for word in text.split() if word.lower() not in stop_words]
    # Join the words back into a string and return
    return ' '.join(words)

# Read the labelled tweets, using the CSV's first column as the row index
df = pd.read_csv('/content/labeled_data.csv', index_col=0)

# Store a cleaned copy of every tweet next to the raw text
df['clean_text'] = df['tweet'].apply(clean_text)

# Seeded 80/20 split of the cleaned texts and their class labels
texts, labels = df['clean_text'], df['class']
train_data, test_data, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training texts only, then applied unchanged to the test texts
tfidf = TfidfVectorizer()
train_features = tfidf.fit_transform(train_data)
test_features = tfidf.transform(test_data)

# Linear-kernel SVM, fitted on the training features (fit() returns the
# estimator itself)
svm = SVC(kernel='linear').fit(train_features, train_labels)

# Predict a class for every held-out tweet
predictions = svm.predict(test_features)

# Report per-class metrics, then the confusion matrix
print(classification_report(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


              precision    recall  f1-score   support

           0       0.57      0.20      0.30       290
           1       0.93      0.96      0.94      3832
           2       0.84      0.89      0.86       835

    accuracy                           0.90      4957
   macro avg       0.78      0.68      0.70      4957
weighted avg       0.89      0.90      0.89      4957

[[  58  204   28]
 [  39 3680  113]
 [   5   90  740]]
