# Pranav Srinivas Venkatesh - 1678255
# Thejas Thirthalingaiah - 1678000
# Bharath Kumar Nagaraju - 1604533




# Importing library for data augmentation

In [1]:
!pip install nlpaug



Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


# Importing necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Input, concatenate
from tensorflow.keras.callbacks import EarlyStopping
import nlpaug.augmenter.word as naw

# Mounting google colab

In [3]:

from google.colab import drive

# from google.colab import drive
drive.mount('/content/drive')
df=pd.read_csv('/content/drive/MyDrive/train.tsv',on_bad_lines='warn',sep='\t')

Mounted at /content/drive


In [4]:
df['id'].head(10
              )

0    1613916569775513600
1    1610293430143574021
2    1615763835658305536
3    1617231518937538563
4    1621892119861366788
5    1608840424080412674
6    1616711994198966273
7    1599052076742766592
8    1627422178818793472
9    1611777912998612992
Name: id, dtype: int64

# Using Convolution Neural Network with ensemble method

In [5]:
#embeddings_index=50

In [10]:

# Split the dataset into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.30, random_state=42)

# Feature engineering using TF-IDF on the training set
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features_train = tfidf_vectorizer.fit_transform(df_train['text']).toarray()

# Tokenize and pad sequences for training set
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['text'])
vocab_size = len(tokenizer.word_index) + 1
sequences_train = tokenizer.texts_to_sequences(df_train['text'])
max_length = max(len(seq) for seq in sequences_train)
X_train = pad_sequences(sequences_train, maxlen=max_length)

# Convert labels to one-hot encoding
y_train = pd.get_dummies(df_train['sentiment']).values

# Feature engineering using TF-IDF on the testing set
tfidf_features_test = tfidf_vectorizer.transform(df_test['text']).toarray()

# Tokenize and pad sequences for testing set
sequences_test = tokenizer.texts_to_sequences(df_test['text'])
X_test = pad_sequences(sequences_test, maxlen=max_length)

# Convert labels to one-hot encoding for testing set
y_test = pd.get_dummies(df_test['sentiment']).values

embeddings_index = {}

embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Build the CNN model with improvements and feature engineering
cnn_model = Sequential()
cnn_model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(BatchNormalization())
cnn_model.add(Dense(3, activation='softmax'))

# Feature engineering with TF-IDF
tfidf_input = Input(shape=(5000,))
dense_tfidf = Dense(128, activation='relu')(tfidf_input)

# Combine CNN and TF-IDF features
combined = concatenate([cnn_model.output, dense_tfidf])
final_output = Dense(3, activation='softmax')(combined)

ensemble_model_base = Model(inputs=[cnn_model.input, tfidf_input], outputs=final_output)
ensemble_model_base.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Text augmentation using nlpaug
aug = naw.SynonymAug()

# Augment the training data
X_train_augmented = []
y_train_augmented = []
tfidf_train_augmented = []

for text, label, tfidf, tweet_id in zip(df_train['text'], df_train['sentiment'], tfidf_features_train, df_train['id']):
    augmented_text = aug.augment(text)
    X_train_augmented.append({'id': tweet_id, 'text': augmented_text})
    y_train_augmented.append(label)
    tfidf_train_augmented.append(tfidf)

# Tokenize and pad augmented sequences
sequences_augmented = tokenizer.texts_to_sequences([example['text'] for example in X_train_augmented])
X_train_augmented = pad_sequences(sequences_augmented, maxlen=max_length)

# Convert labels to one-hot encoding
y_train_augmented = pd.get_dummies(y_train_augmented).values
tfidf_train_augmented = np.array(tfidf_train_augmented)

# Train the ensemble model with augmented data
ensemble_model_base.fit([X_train_augmented, tfidf_train_augmented], y_train_augmented,
                   epochs=20, batch_size=32, validation_split=0.1, callbacks=[early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.src.callbacks.History at 0x79a95570c280>

In [11]:
from sklearn.metrics import f1_score

# Make predictions on the test set
y_pred_prob = ensemble_model_base.predict([X_test, tfidf_features_test])

# Convert predicted probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Convert one-hot encoded true labels to class labels
y_true = np.argmax(y_test, axis=1)

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

print("Weighted F1 Score on Test Set:", f1)


Weighted F1 Score on Test Set: 0.5874195045575207


In [12]:
accuracy_base = ensemble_model_base.evaluate([X_test, tfidf_features_test], y_test, verbose=0)[1]
accuracy_base

0.6407407522201538

#Grid Search

In [13]:
from sklearn.model_selection import ParameterGrid
from keras.regularizers import l1, l2

best_f1_score = 0.0
best_params_f1 = None


# Define a function to create the CNN model
def create_model(activation='relu', units=128, kernel_regularizer=None, activity_regularizer=None):
    cnn_model = Sequential()
    cnn_model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
    cnn_model.add(Conv1D(units, 5, activation=activation, kernel_regularizer=kernel_regularizer, activity_regularizer=activity_regularizer))
    cnn_model.add(GlobalMaxPooling1D())
    cnn_model.add(Dropout(0.5))
    cnn_model.add(Dense(64, activation='relu'))
    cnn_model.add(BatchNormalization())
    cnn_model.add(Dense(3, activation='softmax'))

    tfidf_input = Input(shape=(5000,))
    dense_tfidf = Dense(128, activation='relu')(tfidf_input)

    combined = concatenate([cnn_model.output, dense_tfidf])
    final_output = Dense(3, activation='softmax')(combined)

    model = Model(inputs=[cnn_model.input, tfidf_input], outputs=final_output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Define the hyperparameter grid with regularizers
param_grid = {
    'activation': ['relu', 'sigmoid', 'tanh'],
    'units': [64, 128, 256],
    'kernel_regularizer': [None, l1(0.01), l2(0.01)],
    'activity_regularizer': [None, l1(0.01), l2(0.01)]
}

# Implement early stopping with verbose set to 0
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

# Iterate over all combinations of hyperparameters
for params in ParameterGrid(param_grid):
    model = create_model(
        activation=params['activation'],
        units=params['units'],
        kernel_regularizer=params['kernel_regularizer'],
        activity_regularizer=params['activity_regularizer']
    )
    model.fit([X_train_augmented, tfidf_train_augmented], y_train_augmented,
              epochs=20, batch_size=32, validation_split=0.1, callbacks=[early_stopping], verbose=0)

    # Evaluate the model on F1 score
    y_pred = model.predict([X_test, tfidf_features_test])
    f1 = f1_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1), average='weighted')

    # Check if the current model has a better F1 score than the previous best
    if f1 > best_f1_score:
        best_f1_score = f1
        best_params_f1 = params

    # Print or store results as needed

# Print the best parameters and corresponding F1 score
print("Best parameters (F1 score):", best_params_f1)
print("Best F1 score:", best_f1_score)

Best parameters (F1 score): {'activation': 'relu', 'activity_regularizer': <keras.src.regularizers.L2 object at 0x79a9379c2710>, 'kernel_regularizer': <keras.src.regularizers.L1 object at 0x79a937e97cd0>, 'units': 64}
Best F1 score: 0.6313085193491775


In [14]:

for key, value in best_params_f1.items():
    print(f'{key}: {value}')

activation: relu
activity_regularizer: <keras.src.regularizers.L2 object at 0x79a9379c2710>
kernel_regularizer: <keras.src.regularizers.L1 object at 0x79a937e97cd0>
units: 64


In [15]:
# Use the trained model to predict labels for the test split
predictions = model.predict([X_test, tfidf_features_test])

# Convert the predicted probabilities to class labels
predicted_labels = np.argmax(predictions, axis=1)


# Display the predicted labels
df_test['predicted_sentiment'] = predicted_labels
df_test




Unnamed: 0,id,text,sentiment,predicted_sentiment
1339,1603023236916281345,.@greenpeace_de heute mit genau der richtigen ...,2,2
1222,1618740286947209216,"Klimakrise: CO2 sehr viel Industrie, Produkthe...",0,2
1106,1602626145664507906,"Aktuell gibt es einen #Stromausfall in #Newel,...",0,0
812,1616429760757768192,"Gratulation, liebe @AMPELKOALITI0N und @CDU\nD...",2,2
1231,1618970856478347264,Wie lange wollen Sie die #Armut in #Deutschlan...,2,2
...,...,...,...,...
555,1611162553568165888,Die Regierung müsste komplett zurücktreten! Ba...,2,2
2118,1633840218301411328,Wann werden der #Leopard2 und andere dringend ...,1,0
1506,1626555694945120257,"Die gute Nachricht, einen Krapfen oder auch ...",2,2
1419,1619712269038030848,"Auf den Ausspruch ""Kohle ist eine Sünde"" habe ...",2,2


# Taking holdback set for testing purpose

In [16]:
df_new=pd.read_csv('/content/drive/MyDrive/holdback.tsv',on_bad_lines='warn',sep='\t')

In [None]:
df_new

In [17]:

df_new1=df_new
# Tokenize and pad sequences for the new dataset
sequences_new = tokenizer.texts_to_sequences(df_new1['text'])
X_new = pad_sequences(sequences_new, maxlen=max_length)

# Predict sentiments for the new dataset
predictions_new = model.predict([X_new, tfidf_vectorizer.transform(df_new['text']).toarray()])

# Add predicted sentiments to the DataFrame
df_new1['predicted_sentiment'] = np.argmax(predictions_new, axis=1)

# Display the relevant columns
df_new1



Unnamed: 0,id,text,predicted_sentiment
0,1631570395445751811,Die symbolträchtige Kleinstadt #Bachmut bleibt...,0
1,1595013102936133632,"Um den Klimawandel zu bremsen, ist ein Umstieg...",1
2,1599000892891488258,".@pascalmeiser: ""Die Ampel hat in der Mietenpo...",2
3,1586994950738268160,Also wenn wir tiefe Löcher in die Erde bohren ...,1
4,1618139099453624321,Die Zukunft und Sicherheit Europas ist in höch...,2
...,...,...,...
295,1618222773855473666,Man fühlt sich im falschen #Film: Kollektiver ...,0
296,1618373016093855753,Deutschland hat die Bestrafung verdient! Die T...,2
297,1634868404078415873,"Ich wußte spätestens, aber wirklich allerspäte...",2
298,1617436568125415425,"Was fällt euch auf, in dem Moment, wo der Herr...",2


# Writing the predictions of sentiments along with Tweet IDs to TSV file

In [26]:


# Assuming you have the tweet IDs in the 'tweet_id' column of your new dataset
df_results = pd.DataFrame({'id': df_new['id'], 'sentiment': df_new1['predicted_sentiment']})

# Save the results to a TSV file
output_file_path = '/content/drive/MyDrive/predicted_sentiments_holdback_new.tsv'
df_results.to_csv(output_file_path, sep='\t', index=False)

print(f"Predicted sentiments saved to {output_file_path}")


Predicted sentiments saved to /content/drive/MyDrive/predicted_sentiments_holdback_new.tsv


In [None]:
import os
print("Current Working Directory:", os.getcwd())
