<a href="https://colab.research.google.com/github/sona-gj/Sentiment-Analysis-of-Social-Media-Data/blob/final/SMA_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code works better in Jupyter Notebook.

In [None]:
pip install contractions

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import numpy as np
import contractions
import gensim.downloader as api
import tensorflow_hub as hub
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.svm import LinearSVC
from pylab import barh, plot, yticks, show, grid, xlabel, figure
from sklearn.feature_selection import chi2
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources used by this notebook: the stop-word corpus read by
# stopwords.words() and the tokenizer data (presumably what word_tokenize() relies
# on -- confirm against the installed NLTK version), plus the tagger and WordNet
# data. nltk.download() is called once per resource.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download("wordnet")
nltk.download("omw-1.4")

In [5]:
# Shared chi-square feature selector configured to keep k=100 features.
# chi_sqaure() re-fits this same object for every text representation it is given.
chi2_selector = SelectKBest(chi2, k=100)

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
word2vec_model = api.load('word2vec-google-news-300')

In [None]:
# Load the pretrained 'glove-wiki-gigaword-100' vectors through gensim's downloader API.
glove_model = api.load('glove-wiki-gigaword-100')

In [None]:
# Universal Sentence Encoder (v4) from TF Hub, wrapped as a Keras layer with
# trainable=False so its weights are not updated.
encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", trainable=False)

In [None]:
# Import the dataset from the CSV file (the path is relative to the working directory).

data = pd.read_csv("dataset.csv")
data.head(3)  # preview the first three rows

In [None]:
# Remove the columns that are not wanted for the rest of the notebook.

columns_to_drop = [
    "id",
    "username",
    "created_at",
    "user followers count",
    "replycount",
    "retweetcount",
    "likecount",
    "quotecount",
    'media',
    'retweetedTweet',
    'quotedtweet',
    'inReplyToTweetId',
    'inReplyToUser',
    'mentionedUsers',
    'hashtags',
    'language',
]
data = data.drop(columns=columns_to_drop)
data.head(3)


In [None]:
#Basic Cleaning

def remove_unwanted(text):
    """Normalise a raw tweet into lowercase ASCII alphanumeric words.

    Steps, in order: expand contractions, lowercase, strip http(s) URLs, strip
    @mentions and URLs with any other scheme, replace every remaining character
    that is not an ASCII letter, digit, space or tab with a space, and finally
    collapse runs of whitespace into single spaces.

    NOTE: the preprocessing cell further down rebinds the name
    `remove_unwanted`; re-run this cell before using it for cleaning again.

    Args:
        text: The raw tweet. Non-string values (e.g. NaN for a missing tweet)
            are treated as empty text.

    Returns:
        The cleaned text as a single space-separated string ('' when nothing
        is left or when `text` is not a string).
    """
    # Missing tweets arrive as float NaN; contractions.fix would fail on them.
    if not isinstance(text, str):
        return ''

    # Expand the contractions, then convert the text to lowercase.
    text = contractions.fix(text).lower()

    # Remove http/https URLs.
    text = re.sub(r'https?://\S*', '', text)

    # Remove mentions (handles may contain underscores), URLs with any other
    # scheme, and special characters. The scheme pattern needs a real `\w`:
    # a doubled backslash in a raw string would only match a literal backslash.
    text = re.sub(r"(@[A-Za-z0-9_]+)|(\w+://\S+)|([^0-9A-Za-z \t])", " ", text)

    # Collapse the whitespace left behind by the substitutions.
    return ' '.join(text.split())

# Row label of the tweet that is printed before and after cleaning as a spot check.
sample_label = 2582

# The raw tweet, before any cleaning.
tweet_2582_before_cleaning = data.loc[sample_label, 'text']
print(tweet_2582_before_cleaning)

# Run every tweet through remove_unwanted and store the result in a new column.
data['cleaned_text'] = data['text'].apply(remove_unwanted)

# The same tweet after cleaning.
tweet_2582_cleaned = data.loc[sample_label, 'cleaned_text']
print(tweet_2582_cleaned)

data.head(3)

In [None]:
#Basic Preprocessing

# Porter stemmer used by the preprocessing function below.
# (WordNetLemmatizer is imported as a possible alternative but is not used.)
ps = PorterStemmer()

# Extra tokens that are filtered out in addition to the stop words.
custom_words = {
    'stockmarketcrash',
    'stockmarkets',
    'bearmarket',
    'stock',
    'market',
}

# NLTK's English stop-word list, stored as a set.
stopWords = set(stopwords.words('english'))

# Function to remove stopwords and custom words
def remove_unwanted(text):
    """Tokenise `text`, drop stop words and custom words, and stem what is left.

    Tokens found in `stopWords` or `custom_words` are discarded; every
    remaining token is passed through the stemmer `ps`.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    kept_stems = (
        ps.stem(token)
        for token in word_tokenize(text)
        if not (token in stopWords or token in custom_words)
    )
    return ' '.join(kept_stems)

# Overwrite 'cleaned_text' with the result of remove_unwanted (stop-word removal + stemming).
# NOTE(review): the column is read and written in place, so re-running this cell
# processes the already stemmed text a second time.
data['cleaned_text'] = data['cleaned_text'].apply(remove_unwanted)
data.head(10)


In [None]:
# Split the dataset into a training set and a test set in an 80:20 ratio.

# Fixed seed so that the split -- and therefore every score reported further
# down -- is the same after a kernel restart.
RANDOM_STATE = 42

x = data.drop(columns = ['text_sentiment'])
y = data.text_sentiment
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=RANDOM_STATE)

# Print the shapes of the training and testing sets.

print("shape of original dataset :", data.shape)
print("shape of input - training set", x_train.shape)
print("shape of output - training set", y_train.shape)
print("shape of input - testing set", x_test.shape)
print("shape of output - testing set", y_test.shape)

In [None]:
# Show the sizes of the training set and the test set as a bar chart.

labels = ['Training Set', 'Testing Set']
sizes = [len(x_train), len(x_test)]  # number of rows in each split

fig, ax = plt.subplots()
ax.bar(labels, sizes, color=['skyblue', 'lightgreen'])
ax.set_title('Dataset Split: Training vs Testing')
ax.set_ylabel('Total Samples')

plt.show()

In [None]:
# Evaluate performance

def evaluate_performance(y_test,y_pred, embedding, classifier):
  """Print and return accuracy and weighted precision, recall and F1.

  Args:
    y_test: True labels of the test set.
    y_pred: Labels predicted for the test set.
    embedding: Name of the text representation, used only in the printed lines.
    classifier: Name of the classifier, used only in the printed lines.

  Returns:
    A dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1-Score', so
    that callers can compare models instead of only reading the printed text.
  """
  # zero_division=0 keeps the scores defined (as 0) for a class that is never
  # predicted, without emitting an undefined-metric warning.
  metrics = {
      'Accuracy': accuracy_score(y_test, y_pred),
      'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
      'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
      'F1-Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
  }

  for metric_name, value in metrics.items():
    print(f"{metric_name} for the {classifier} using {embedding}: {value:.4f}")

  return metrics

In [None]:
# SVM Classifier

def svm_classifier(X_train_model,X_test_model,y_train,y_test,embedding):
  """Fit a LinearSVC on the training features and report its test-set metrics.

  The model (max_iter=10000) is trained on `X_train_model`/`y_train`; its
  predictions for `X_test_model` are handed to evaluate_performance together
  with `y_test`, the `embedding` name and the label 'SVM'.
  """
  predictions = LinearSVC(max_iter=10000).fit(X_train_model, y_train).predict(X_test_model)
  evaluate_performance(y_test, predictions, embedding, 'SVM')

In [None]:
# Logistic Regression classifier

def logistic_regression_classifier(X_train_model,X_test_model,y_train,y_test,embedding):
  """Fit a LogisticRegression model and report its test-set metrics.

  The model (max_iter=1000) is trained on `X_train_model`/`y_train`; its
  predictions for `X_test_model` are handed to evaluate_performance together
  with `y_test`, the `embedding` name and the label 'Logistic Regression'.
  """
  model = LogisticRegression(max_iter=1000).fit(X_train_model, y_train)

  # Score the held-out test set.
  evaluate_performance(y_test, model.predict(X_test_model), embedding, 'Logistic Regression')

In [None]:
# Plot chi-square

def plot_chi_sqaure(chi2score,embedding,feature_names=None,top_n=25):
  """Plot the `top_n` features with the highest chi-square scores.

  Args:
    chi2score: One chi-square score per feature.
    embedding: Name of the text representation; only used in the printed header.
    feature_names: Feature names aligned with `chi2score`. When omitted, the
      names are read from the module-level `vectorizer`, which must then be the
      vectorizer that produced the scored features.
    top_n: Number of highest-scoring features to draw (default 25).

  Raises:
    ValueError: If the number of feature names differs from the number of
      scores, which would otherwise silently mislabel the bars.
  """
  print("Chi2 for "+ embedding)

  if feature_names is None:
    # Backward-compatible fallback for callers that rely on the global vectorizer.
    feature_names = vectorizer.get_feature_names_out()

  all_scores = np.asarray(chi2score)
  all_names = np.asarray(feature_names, dtype=object)
  if len(all_names) != len(all_scores):
    raise ValueError(
        f"got {len(all_names)} feature names for {len(all_scores)} chi-square scores"
    )

  # Stable ascending sort, then keep the last `top_n` entries: the best feature
  # ends up at the top of the horizontal bar chart.
  order = np.argsort(all_scores, kind='stable')
  order = order[max(len(order) - top_n, 0):]
  labels = all_names[order]  # Feature names
  scores = all_scores[order]  # Chi-square scores

  # Bar positions are simply the indices 0..len(scores)-1.
  x = np.arange(len(scores))

  figure(figsize=(6, 6))
  barh(x, scores, align='center', alpha=.2, color='g')  # Horizontal bar chart
  plot(scores, x, '-o', markersize=2, alpha=.8, color='g')  # Line with markers over the bars
  yticks(x, labels)  # Label each bar with its feature name
  xlabel('$\\chi^2$')  # X-axis label (backslash escaped for mathtext)
  grid(True)  # Grid for better readability
  show()


In [None]:
# Chi-square

def chi_sqaure(X_train_chi, X_test_chi, y_train, y_test,embedding):
  """Score features with chi-square, keep the selected ones and train both classifiers.

  The chi-square scores of the training features are plotted and printed.
  The shared `chi2_selector` is then fitted on the training data, applied to
  the test data, and the reduced matrices are passed to the SVM and the
  logistic-regression helpers, labelled with `embedding`.
  """
  # chi2() returns (scores, p-values); only the scores are used here.
  feature_scores, _ = chi2(X_train_chi, y_train)
  plot_chi_sqaure(feature_scores, embedding)
  print("Chi2 scores for "+ embedding+": ", feature_scores)

  # Fit the selector on the training data only, then reuse that fit for the test data.
  train_selected = chi2_selector.fit_transform(X_train_chi, y_train)
  test_selected = chi2_selector.transform(X_test_chi)

  # Train and evaluate each classifier on the reduced feature set.
  for train_and_report in (svm_classifier, logistic_regression_classifier):
    train_and_report(train_selected, test_selected, y_train, y_test, embedding)

In [None]:
# Bag of words: token-count features built with CountVectorizer.

vectorizer = CountVectorizer()

# Learn the vocabulary from the training tweets only, then reuse it for the test tweets.
X_train = vectorizer.fit_transform(x_train['cleaned_text'])
X_test = vectorizer.transform(x_test['cleaned_text'])

# Dense copies of both count matrices.
X_train_array, X_test_array = X_train.toarray(), X_test.toarray()

chi_sqaure(X_train_array, X_test_array, y_train, y_test,'BagOfWords')

for bow_matrix in (X_train_array, X_test_array):
    print(bow_matrix.shape)

In [None]:
# TF-IDF features built with TfidfVectorizer.

vectorizer = TfidfVectorizer()

# Fit on the training tweets first, then transform the test tweets with that fit.
X_train_tf_idf, X_test_tf_idf = (
    vectorizer.fit_transform(x_train['cleaned_text']),
    vectorizer.transform(x_test['cleaned_text']),
)

chi_sqaure(X_train_tf_idf, X_test_tf_idf, y_train, y_test,'TF-IDF')

for tfidf_matrix in (X_train_tf_idf, X_test_tf_idf):
    print(tfidf_matrix.shape)

In [None]:
# Function to average word vectors for a document

def document_to_vector(text, model, num_features):
    """Return the mean embedding of the tokens of `text` that `model` knows.

    Args:
        text: Document to embed; it is split with word_tokenize.
        model: Keyed-vector model exposing `key_to_index` and item lookup.
        num_features: Length of the zero vector the sum starts from.

    Returns:
        The average of the vectors of all in-vocabulary tokens, or the
        all-zero float32 vector when no token is in the vocabulary.
    """
    # Keep only the tokens that have an entry in the model's vocabulary.
    known_tokens = [token for token in word_tokenize(text) if token in model.key_to_index]

    total = np.zeros((num_features,), dtype='float32')
    for token in known_tokens:
        total = total + model[token]

    if not known_tokens:
        return total
    return total / len(known_tokens)

In [None]:
# Word2Vec: averaged word vectors as document features.

num_features = word2vec_model.vector_size

# Embed the training split first, then the test split; one row per tweet.
X_train_word2vec, X_test_word2vec = (
    np.array([
        document_to_vector(text, word2vec_model, num_features)
        for text in split['cleaned_text']
    ])
    for split in (x_train, x_test)
)

# Train and evaluate both classifiers on the Word2Vec features.
for run_classifier in (svm_classifier, logistic_regression_classifier):
    run_classifier(X_train_word2vec, X_test_word2vec, y_train, y_test, 'Word2Vec')

# Shapes of the resulting feature matrices.
for w2v_matrix in (X_train_word2vec, X_test_word2vec):
    print(w2v_matrix.shape)


In [None]:
# GloVe: averaged word vectors as document features.

num_features = glove_model.vector_size

# Embed the training split first, then the test split; one row per tweet.
X_train_glove, X_test_glove = (
    np.array([
        document_to_vector(text, glove_model, num_features)
        for text in split['cleaned_text']
    ])
    for split in (x_train, x_test)
)

# Train and evaluate both classifiers on the GloVe features.
for run_classifier in (svm_classifier, logistic_regression_classifier):
    run_classifier(X_train_glove, X_test_glove, y_train, y_test, 'GloVe')

# Shapes of the resulting feature matrices.
for glove_matrix in (X_train_glove, X_test_glove):
    print(glove_matrix.shape)

In [None]:
# USE: sentence embeddings from the Universal Sentence Encoder layer.

# For each split, the text column is converted to a list, wrapped in a
# tf.constant, run through the encoder, and the result is turned into a NumPy array.
X_train_use, X_test_use = (
    encoder(tf.constant(split['cleaned_text'].to_list())).numpy()
    for split in (x_train, x_test)
)

# Train and evaluate both classifiers on the USE features.
for run_classifier in (svm_classifier, logistic_regression_classifier):
    run_classifier(X_train_use, X_test_use, y_train, y_test, 'USE')

for use_matrix in (X_train_use, X_test_use):
    print(use_matrix.shape)
