<a href="https://colab.research.google.com/github/protocorn/sentiment-analysis-using-machine-learning-and-deep-learning-models/blob/main/Cross_Domain_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Loading The Dataset**

In [None]:
import pandas as pd
# Location of the Sentiment140 training CSV; point this at your own copy of the dataset.
public_link = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'
# header=None: the file is read without a header row, so columns get integer labels.
df = pd.read_csv(public_link, header=None, encoding='latin-1')

**Data Preprocessing**

In [None]:
#Step 1 : Handling Missing Values

# Keep only rows without missing values. Rebinding the name (instead of
# inplace=True) leaves the frame produced by read_csv untouched and keeps the
# cell safe to re-run.
df = df.dropna()

In [None]:
#Step 2: Data Cleaning
# Column 5 (the sixth column) holds the text: lowercase it and strip ASCII punctuation.
import string

# Build the deletion table once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
df[5] = df[5].str.lower().str.translate(punctuation_table)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
#Step 3: Tokenization
# word_tokenize needs NLTK's Punkt data, which a fresh environment may not have.
# Probe once and download only when the lookup fails ("punkt_tab" is the
# resource name used by newer NLTK releases, "punkt" by older ones).
try:
    word_tokenize("probe")
except LookupError:
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

# Replace each cleaned string in column 5 with its list of word tokens.
df[5] = df[5].apply(word_tokenize)

**Splitting the Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows whose token list in column 5 is non-empty
has_tokens = df[5].map(bool)
df = df[has_tokens]

X = df[5]  # token lists (sixth column)
y = df[0].replace({2: 4})  # labels (first column); any 2 is folded into 4

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Recode label 4 as 1 in both splits
y_train, y_test = (labels.replace({4: 1}) for labels in (y_train, y_test))

# **Training Machine Learning Models**

**TF-IDF Vectorization**

In [None]:
# Step 1: Text Vectorization (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
# Each token list is re-joined into a space-separated string before vectorizing.
# The vectorizer is fitted on the training split and only applied to the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(list(map(' '.join, X_train)))
X_test_tfidf = tfidf_vectorizer.transform(list(map(' '.join, X_test)))

**Naive Bayes Classifier**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Model Training (Naive Bayes)
# fit() returns the estimator itself, so construction and training are chained.
naive_bayes_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Step 3: Model Evaluation
y_pred_naive = naive_bayes_model.predict(X_test_tfidf)

# Overall accuracy on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_naive)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_naive))

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions and evaluate
y_pred_rf = random_forest_model.predict(X_test_tfidf)

# RandomForestClassifier.predict() already returns class labels (0/1 here), not
# probabilities, so no thresholding is required. The old variable name is kept
# as an alias for anything that still refers to it.
y_pred_binary_rf = y_pred_rf

accuracy = accuracy_score(y_test, y_pred_binary_rf)
print("Random Forest Accuracy:", accuracy)

# Generate classification report
print(classification_report(y_test, y_pred_binary_rf))

**Logistic Regression**

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the TF-IDF features (iteration cap raised to 1000)
logreg_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out split and report the metrics
y_pred_logistic = logreg_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_logistic)
print("Logistic Regression Accuracy:", accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred_logistic))

**XGBoost**

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Wrap the TF-IDF matrices and labels in XGBoost's DMatrix container
dtrain = xgb.DMatrix(X_train_tfidf, label=y_train)
dtest = xgb.DMatrix(X_test_tfidf, label=y_test)

# Booster hyperparameters
params = dict(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
)

# Train for a fixed number of boosting rounds
num_round = 100
bst = xgb.train(params, dtrain, num_boost_round=num_round)

# The booster outputs scores in [0, 1]; label 1 when strictly above 0.5
y_pred_xgb = bst.predict(dtest)
y_pred_binary_xgb = [int(val > 0.5) for val in y_pred_xgb]
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_binary_xgb)
print("Accuracy:", accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred_binary_xgb))

# **Training Deep Learning Models**

**Tokenization and Sequence Padding**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Tokenization and Padding
# Re-join the token lists into strings once and reuse them for fitting and encoding.
train_texts = [' '.join(sample) for sample in X_train]
test_texts = [' '.join(sample) for sample in X_test]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)  # the word index is built from the training split only
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)
del train_texts, test_texts  # temporary helpers, not needed by later cells

# One more than the number of indexed words
vocab_size = len(tokenizer.word_index) + 1

# Every sequence is padded/truncated to this many tokens
max_sequence_length = 100

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

**LSTM Model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Step 2: Model definition (Embedding -> LSTM -> single sigmoid unit)
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

# RMSprop optimizer with binary cross-entropy loss
lstm_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Model Training
lstm_model.fit(X_train_padded, y_train, batch_size=32, epochs=10)

# Step 4: Model Evaluation
# predict() output is flattened to a 1-D array of probabilities
y_pred_probabilities_lstm = lstm_model.predict(X_test_padded).flatten()

# Label 0 for probabilities up to and including 0.5, label 1 otherwise
y_pred_lstm = [0 if prob <= 0.5 else 1 for prob in y_pred_probabilities_lstm]

# Overall accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lstm)
print("Accuracy:", accuracy)

# Per-class report
print(classification_report(y_true=y_test, y_pred=y_pred_lstm, zero_division=1))

**BiLSTM Model**

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense

# Size the embedding from the fitted tokenizer. The tokenizer was created without
# a num_words cap, so the padded sequences can contain any index up to
# len(tokenizer.word_index); a hard-coded input_dim (previously 10000) makes the
# Embedding lookup go out of range whenever the vocabulary is larger, and it also
# overwrote the global vocab_size used by later model cells.
# max_sequence_length is reused from the padding cell so it always matches
# the width of X_train_padded / X_test_padded.
vocab_size = len(tokenizer.word_index) + 1

# Create a BiLSTM model
bilstm_model = Sequential()
bilstm_model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length))
bilstm_model.add(Bidirectional(LSTM(100)))  # BiLSTM with 100 units
bilstm_model.add(Dense(1, activation='sigmoid'))

# Compile the BiLSTM model (same optimizer and loss as the LSTM model)
bilstm_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Training without validation split
num_epochs = 10
batch_size = 32
bilstm_model.fit(X_train_padded, y_train, epochs=num_epochs, batch_size=batch_size)

# Evaluation on the test set
y_pred_bilstm = bilstm_model.predict(X_test_padded)
# Threshold at 0.5 and flatten the (n, 1) column to the 1-D shape of y_test
y_pred_binary_bilstm = (y_pred_bilstm >= 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test, y_pred_binary_bilstm)
print("Test Accuracy:", accuracy)
print(classification_report(y_test, y_pred_binary_bilstm))

**CNN Model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score

# Prepare your text data (tokenization and padding) as before

# Derive the embedding size from the fitted tokenizer instead of relying on
# whatever value an earlier cell last assigned to the global vocab_size; the
# Embedding input_dim must exceed every index present in the padded sequences.
vocab_size = len(tokenizer.word_index) + 1

# Build the CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

# Compile the model
cnn_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train_padded, y_train, epochs=10, batch_size=32)

# Evaluate the model
# Flatten the (n, 1) output so each prediction is a plain scalar before thresholding
y_pred_cnn = cnn_model.predict(X_test_padded).flatten()
y_pred_cnn = [1 if pred >= 0.5 else 0 for pred in y_pred_cnn]
accuracy = accuracy_score(y_test, y_pred_cnn)
print("CNN Model Accuracy:", accuracy)
print(classification_report(y_test, y_pred_cnn))

**DNN Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Derive the embedding size from the fitted tokenizer instead of relying on
# whatever value an earlier cell last assigned to the global vocab_size; the
# Embedding input_dim must exceed every index present in the padded sequences.
vocab_size = len(tokenizer.word_index) + 1

# Define your DNN model: embedding, flatten, two hidden layers, sigmoid output
dnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
dnn_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Training
num_epochs = 10
batch_size = 32
dnn_model.fit(X_train_padded, y_train, epochs=num_epochs, batch_size=batch_size)

# Evaluation: evaluate() returns [loss, accuracy] for the metrics compiled above
accuracy = dnn_model.evaluate(X_test_padded, y_test)[1]
print("Test Accuracy:", accuracy)

**Loading All the Trained Models**

In [None]:
import joblib
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import load_model

# Load previously saved models from Kaggle input datasets.
# NOTE: lstm_model, naive_bayes_model, cnn_model and dnn_model rebind names that the
# training cells also assign, so after this cell those names refer to the loaded files.
# SECURITY: joblib.load unpickles its input, which can execute arbitrary code —
# only load .pkl files that come from a trusted source.
# NOTE(review): presumably these artifacts were trained with the same TF-IDF
# vectorizer / Keras tokenizer fitted earlier in this notebook — confirm, because
# the prediction helpers below reuse those fitted objects.
logistic_regression_model = joblib.load('/kaggle/input/logistic/logistic_regression.pkl')
bi_lstm_model =  load_model('/kaggle/input/bilstm/bilstm_model.h5')
lstm_model =  load_model('/kaggle/input/lstm-model/lstm_model.h5')
naive_bayes_model = joblib.load('/kaggle/input/naive-bayes/naive_bayes.pkl')
random_forest = joblib.load('/kaggle/input/random-forest-new/random_forest1.pkl')
cnn_model = load_model('/kaggle/input/neuralnetworks2/cnn_model.h5')
dnn_model = load_model('/kaggle/input/neuralnetworks2/dnn_model_2.h5')
import xgboost as xgb
# Rebuild the XGBoost test matrix from the TF-IDF test features and labels computed earlier.
dtest = xgb.DMatrix(X_test_tfidf, label=y_test)
# The booster is restored from its saved model file.
xgboost_model =  xgb.Booster(model_file='/kaggle/input/xgboost/xgboost_sentiment_model.model')

**Converting the .txt file to .csv format**

In [None]:
import pandas as pd
import re

# Define the path to your input text file and output CSV file
input_file_path = '/kaggle/input/sentiment-labelled-sentences-data-set/sentiment labelled sentences/amazon_cells_labelled.txt'  # Replace with the path to your input text file
output_csv_path = '/kaggle/working/amazon_dataset.csv'  # Replace with the desired output CSV path

# A record is the sentence text, then whitespace, then a single-digit label at
# the end of the line (presumably tab-separated in this dataset — TODO confirm).
# The previous pattern also required the sentence to end in '.', '!' or '?',
# so every sentence without closing punctuation was silently dropped.
record_pattern = re.compile(r'^(.*\S)\s+(\d)\s*$')

# Initialize empty lists to store sentences and labels
sentences = []
labels = []
skipped_lines = 0  # non-blank lines that did not look like "<sentence> <label>"

# Read the input text file line by line
with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        match = record_pattern.match(line)
        if match is None:
            if line.strip():
                skipped_lines += 1
            continue
        sentences.append(match.group(1))
        # Store labels as integers so they compare equal to the 0/1 predictions later on
        labels.append(int(match.group(2)))

# Make dropped records visible instead of losing them silently
if skipped_lines:
    print(f'Warning: skipped {skipped_lines} line(s) without a trailing label')

# Create a DataFrame with two columns
df2 = pd.DataFrame({'Sentence': sentences, 'Label': labels})

# Write the DataFrame to a CSV file
df2.to_csv(output_csv_path, index=False)

print(f'CSV file saved to {output_csv_path}')

#repeat the same process with the other dataset

**Evaluating performance of Trained Models on Diverse Datasets**

In [None]:
#For Machine Learning Models:

def preprocess_text_with_negation_lexicon_ml(texts):
    """Clean texts for the TF-IDF models and rewrite simple negations.

    Each text is lowercased, stripped of ASCII punctuation and tokenized with
    ``word_tokenize``. A negation word ("not", "n't", "no") is dropped and
    toggles a flag; the next ordinary word is then replaced by its antonyms
    from ``get_antonym_ml`` or, when there are none, by ``"not_" + word``.

    Args:
        texts: A single string or an iterable of strings.

    Returns:
        A list with one processed, space-joined string per input text.
    """
    # If a single text is provided, convert it to a list of one element
    if isinstance(texts, str):
        texts = [texts]

    # Loop-invariant: build the punctuation-deletion table once.
    punctuation_table = str.maketrans('', '', string.punctuation)
    negation_words = ("not", "n't", "no")
    # NOTE(review): punctuation is removed before tokenizing, so "n't" can
    # presumably never show up as a token ("don't" becomes "dont") — confirm
    # whether contractions are meant to be handled.

    result_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower().translate(punctuation_table))
        result_tokens = []
        negate = False
        for word in tokens:
            if word in negation_words:
                negate = not negate
            elif negate:
                antonyms = get_antonym_ml(word)
                if antonyms:
                    # Add the antonym words themselves, sorted for a stable result.
                    # Appending the list object (as before) put its Python repr,
                    # e.g. "['bad', 'evil']", into the output text.
                    result_tokens.extend(sorted(antonyms))
                else:
                    result_tokens.append("not_" + word)
                negate = False
            else:
                result_tokens.append(word)
        # Join the tokens back to a single string
        result_texts.append(" ".join(result_tokens))

    return result_texts

def get_antonym_ml(word):
    """Return the distinct WordNet antonyms of ``word``.

    Every lemma of every synset returned by ``nltk.corpus.wordnet.synsets(word)``
    is inspected and the names of its antonyms are collected.

    Args:
        word: The word to look up.

    Returns:
        A list of unique antonym names in first-seen order; empty when none exist.
    """
    # A dict keeps insertion order, so this de-duplicates with a deterministic
    # result; the order of list(set(...)) could differ between interpreter runs.
    antonyms = {}
    for synset in nltk.corpus.wordnet.synsets(word):
        for lemma in synset.lemmas():
            for antonym in lemma.antonyms():
                antonyms.setdefault(antonym.name(), None)
    return list(antonyms)

# Step 5: Define the predict_sentiment_with_negation_lexicon function for sentiment prediction
def predict_sentiment_with_negation_lexicon_ml(texts, model):
    """Predict sentiment for raw texts with a TF-IDF based model.

    The texts are preprocessed with ``preprocess_text_with_negation_lexicon_ml``,
    vectorized with the already fitted module-level ``tfidf_vectorizer`` and
    passed to ``model.predict``. An ``xgboost.Booster`` is detected
    automatically and receives the features wrapped in a ``DMatrix``, so no
    lines need to be commented in or out when switching models.

    Args:
        texts: A single string or an iterable of strings.
        model: A fitted model exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the vectorized texts.
    """
    # If a single text is provided, convert it to a list of one element
    if isinstance(texts, str):
        texts = [texts]

    # The preprocessor accepts a whole batch and returns one string per text.
    result_texts = preprocess_text_with_negation_lexicon_ml(texts)

    # Vectorize the preprocessed text using the pre-trained TF-IDF vectorizer
    processed_text_tfidf = tfidf_vectorizer.transform(result_texts)

    # xgboost is only needed for Booster models; other models work without it.
    try:
        import xgboost as xgb
    except ImportError:
        xgb = None

    if xgb is not None and isinstance(model, xgb.Booster):
        # Booster.predict expects a DMatrix rather than the sparse matrix itself.
        return model.predict(xgb.DMatrix(processed_text_tfidf))

    return model.predict(processed_text_tfidf)

In [None]:
#For Deep Learning Models:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_text_with_negation_lexicon(texts):
    """Normalize texts for the deep learning models and rewrite simple negations.

    Every text is lowercased, stripped of ASCII punctuation and split on
    whitespace. A negation word ("not", "n't", "no") is removed and flips a
    pending-negation flag; the next other word is replaced by the first entry
    returned by ``get_antonym`` or, if there is none, by ``"not_" + word``.

    Accepts a single string or an iterable of strings and returns a list with
    one processed string per input text.
    """
    batch = [texts] if isinstance(texts, str) else texts
    strip_punctuation = str.maketrans('', '', string.punctuation)
    negators = ("not", "n't", "no")

    cleaned = []
    for raw in batch:
        kept = []
        pending = False
        for token in raw.lower().translate(strip_punctuation).split():
            if token in negators:
                # Negation words are dropped; two in a row cancel each other.
                pending = not pending
                continue
            if not pending:
                kept.append(token)
                continue
            candidates = get_antonym(token)
            kept.append(candidates[0] if candidates else "not_" + token)
            pending = False
        cleaned.append(" ".join(kept))

    return cleaned

def get_antonym(word):
    """Collect the antonym names WordNet lists for ``word``.

    Walks every lemma of every synset of ``word`` and returns the antonym
    names in encounter order; duplicates are kept. Returns an empty list when
    no antonym is found.
    """
    return [
        opposite.name()
        for sense in nltk.corpus.wordnet.synsets(word)
        for lemma in sense.lemmas()
        for opposite in lemma.antonyms()
    ]

def predict_sentiment_with_negation_lexicon(texts, model, tokenizer, max_sequence_length):
    """Predict sentiment for raw texts with a Keras-style sequence model.

    The texts are preprocessed with ``preprocess_text_with_negation_lexicon``,
    converted to index sequences with ``tokenizer``, padded to
    ``max_sequence_length`` and scored by ``model``.

    Args:
        texts: A single string or an iterable of strings.
        model: A model exposing ``predict`` on a batch of padded sequences.
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        A list with one prediction row per input text (empty for empty input).
    """
    if isinstance(texts, str):
        texts = [texts]

    # Preprocess the whole batch at once (negation handling + cleaning).
    processed_texts = preprocess_text_with_negation_lexicon(texts)
    if not processed_texts:
        # Nothing to score; avoid calling predict() on an empty batch.
        return []

    sequences = tokenizer.texts_to_sequences(processed_texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

    # One predict() call for the whole batch instead of one call per text;
    # the result still holds one row per text, as the per-text loop produced.
    return list(model.predict(padded_sequences))

In [None]:
# Cast the label column (second column) to int by replacing the column itself.
# Assigning through df2.iloc[:, 1] = ... may write the values in place and keep
# the column's original object dtype on recent pandas versions.
label_column = df2.columns[1]
df2[label_column] = df2[label_column].astype(int)

In [None]:
# First column: sentences to classify; second column: their labels.
text_column, label_column = df2.columns[:2]
new_texts_ml = df2[text_column].to_list()
actual_labels = df2[label_column].to_list()

**Prediction using Machine Learning Models**

In [None]:
# Model under evaluation; swap in any of the other machine learning models here.
ml_model = random_forest
predicted_sentiments_ml = predict_sentiment_with_negation_lexicon_ml(
    texts=new_texts_ml, model=ml_model
)

In [None]:
# Turn the raw predictions into 0/1 labels (1 when the value is at least 0.5)
predicted_sentiments_ml = [1 if pred >= 0.5 else 0 for pred in predicted_sentiments_ml]

# Count how many predicted labels agree with the actual ones
total_predictions = len(predicted_sentiments_ml)
correct_predictions = sum(
    1
    for guess, truth in zip(predicted_sentiments_ml, actual_labels)
    if guess == truth
)

# Express the hit rate as a percentage
hit_rate = correct_predictions / total_predictions
accuracy = hit_rate * 100
print(f"Accuracy on the new dataset: {accuracy:.2f}%")

**Prediction Using Deep Learning Models**

In [None]:
import numpy as np

# Score the new texts in consecutive slices of `batch_size` items
batch_size = 100  # Adjust the batch size as needed
num_batches = len(new_texts_ml) // batch_size  # number of full slices

predicted_sentiments_dl = []  # collected predictions, in input order

dl_model = dnn_model #replace the dnn_model with any deep learning model if required

# Stepping range() by batch_size visits every full slice and then the shorter
# final slice (if any), so no separate remainder pass is needed.
for start_idx in range(0, len(new_texts_ml), batch_size):
    batch_texts = new_texts_ml[start_idx:start_idx + batch_size]
    predicted_sentiments_dl.extend(
        predict_sentiment_with_negation_lexicon(
            batch_texts, dl_model, tokenizer, max_sequence_length
        )
    )

In [None]:
# Turn the raw model outputs into 0/1 labels (1 when the value is at least 0.5)
predicted_sentiments_dl = [1 if pred >= 0.5 else 0 for pred in predicted_sentiments_dl]

# Count how many predicted labels agree with the actual ones
total_predictions = len(predicted_sentiments_dl)
correct_predictions = sum(
    1
    for guess, truth in zip(predicted_sentiments_dl, actual_labels)
    if guess == truth
)

# Express the hit rate as a percentage
hit_rate = correct_predictions / total_predictions
accuracy = hit_rate * 100

print(f"Accuracy on the new dataset: {accuracy:.2f}%")