In [8]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


# Base class for models
class MLModel():
    def __init__(self):
        pass

    def train(self, X, y):
        raise NotImplementedError

    def predict(self, X):
        raise NotImplementedError

    def evaluate(self, X, y):
        raise NotImplementedError

# Feature-based model (LogisticRegression with PCA for dimensionality reduction)
class FeatureModel(MLModel):
    def __init__(self):
        self.pca = PCA(n_components=100)  # PCA with 100 components
        self.model = LogisticRegression(max_iter=100)  # Logistic Regression Model
        self.num_parameters = 0  # Placeholder for the number of model parameters

    def train(self, X, y):
        flattened_features = X.reshape(X.shape[0], -1)
        reduced_features = self.pca.fit_transform(flattened_features)
        self.model.fit(reduced_features, y)
        self.num_parameters = self.model.coef_.size + self.model.intercept_.size
        print(f"Number of parameters in the Logistic Regression model: {self.num_parameters}")

    def predict(self, X):
        flattened_features = X.reshape(X.shape[0], -1)
        reduced_features = self.pca.transform(flattened_features)
        return self.model.predict(reduced_features)

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        print("Classification Report:")
        print(classification_report(y, y_pred))

# Emoticon-based model (using LogisticRegression with One-Hot Encoding)
class EmoticonModel(MLModel):
    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.model = LogisticRegression(max_iter=500, penalty='l1', solver='liblinear')
        self.best_model = None
        self.grid_search = None

    def train(self, X, y, train_data_fraction=1.0):
        max_length = 13
        X_emojis = pd.DataFrame(X.apply(self.split_emojis).tolist(), columns=[f'emoji_{i+1}' for i in range(max_length)])
        X_encoded = self.encoder.fit_transform(X_emojis)
        if train_data_fraction < 1.0:
            X_train_partial, _, y_train_partial, _ = train_test_split(X_encoded, y, train_size=train_data_fraction, random_state=42)
        else:
            X_train_partial, y_train_partial = X_encoded, y
        param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
        self.grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
        self.grid_search.fit(X_train_partial, y_train_partial)
        self.best_model = self.grid_search.best_estimator_

    def predict(self, X):
        max_length = 13
        X_emojis = pd.DataFrame(X.apply(self.split_emojis).tolist(), columns=[f'emoji_{i+1}' for i in range(max_length)])
        X_encoded = self.encoder.transform(X_emojis)
        return self.best_model.predict(X_encoded)

    def split_emojis(self, emoji_string):
        max_length = 13
        emojis = list(emoji_string)
        return emojis + [''] * (max_length - len(emojis)) if len(emojis) < max_length else emojis

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        print("Classification Report:")
        print(classification_report(y, y_pred))

# Text sequence-based model (using LSTM for sequence classification)
class TextSeqModel(MLModel):
    def __init__(self):
        self.tokenizer = Tokenizer(char_level=True)
        self.max_length = 47
        self.model = None

    def train(self, X, y, X_valid, y_valid):
        X = X.apply(lambda x: x[3:])
        X_valid = X_valid.apply(lambda x: x[3:])
        self.tokenizer.fit_on_texts(X)
        X_train_tokenized = self.tokenizer.texts_to_sequences(X)
        X_val_tokenized = self.tokenizer.texts_to_sequences(X_valid)
        X_train_padded = pad_sequences(X_train_tokenized, maxlen=self.max_length, padding='post')
        X_val_padded = pad_sequences(X_val_tokenized, maxlen=self.max_length, padding='post')
        vocab_size = len(self.tokenizer.word_index) + 1
        embedding_dim = 16
        self.model = Sequential()
        self.model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=self.max_length))
        self.model.add(LSTM(32, return_sequences=False))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.model.fit(X_train_padded, y, epochs=150, batch_size=32, validation_data=(X_val_padded, y_valid))

    def predict(self, X):
        X = X.apply(lambda x: x[3:])
        X_tokenized = self.tokenizer.texts_to_sequences(X)
        X_padded = pad_sequences(X_tokenized, maxlen=self.max_length, padding='post')
        return (self.model.predict(X_padded) > 0.5).astype("int32").flatten()

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        print("Classification Report:")
        print(classification_report(y, y_pred))

# One-hot encoding function
def onehot_encode(train_data, test_data, val_data):
    # Fit on the training data and transform both train, validation, and test sets
    train_encoded = onehot_encoder.fit_transform(np.array(train_data).reshape(-1, 1)).toarray()
    test_encoded = onehot_encoder.transform(np.array(test_data).reshape(-1, 1)).toarray()
    val_encoded = onehot_encoder.transform(np.array(val_data).reshape(-1, 1)).toarray()
    return train_encoded, test_encoded, val_encoded
# Utility function to save predictions to a file
def save_predictions_to_file(predictions, filename):
    with open(filename, 'w') as f:
        for pred in predictions:
            f.write(f"{pred}\n")


# Utility function to save predictions to a text file
def save_predictions_to_text(predictions, filename):
    with open(filename, 'w') as f:
        for pred in predictions:
            f.write(f"{pred}\n")

if __name__ == '__main__':
    # Load the datasets
    train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
    val_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")
    test_emoticon_df = pd.read_csv("datasets/test/test_emoticon.csv")
    
    train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv")
    val_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv")
    test_seq_df = pd.read_csv("datasets/test/test_text_seq.csv")
    
    train_feat = np.load("datasets/train/train_feature.npz", allow_pickle=True)
    val_feat = np.load("datasets/valid/valid_feature.npz", allow_pickle=True)
    test_feat = np.load("datasets/test/test_feature.npz", allow_pickle=True)
    
    # Prepare input data
    train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
    val_emoticon_X = val_emoticon_df['input_emoticon'].tolist()
    test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()
    
    train_seq_X = train_seq_df['input_str'].tolist()
    val_seq_X = val_seq_df['input_str'].tolist()
    test_seq_X = test_seq_df['input_str'].tolist()
    
    train_emoticon_Y = train_emoticon_df['label'].tolist()
    val_emoticon_Y = val_emoticon_df['label'].tolist()
    train_feat_X = train_feat['features']
    train_feat_Y = train_feat['label']
    val_feat_X = val_feat['features']
    val_feat_Y = val_feat['label']
    
    test_feat_X = test_feat['features']
    
    # Initialize encoders and scalers
    scaler = StandardScaler()
    onehot_encoder = OneHotEncoder(handle_unknown='ignore')
    
    # One-Hot Encode Emoticon Dataset (Training, Validation, and Test)
    train_emoticon_encoded, test_emoticon_encoded, val_emoticon_encoded = onehot_encode(train_emoticon_X, test_emoticon_X, val_emoticon_X)
    train_seq_encoded, test_seq_encoded, val_seq_encoded = onehot_encode(train_seq_X, test_seq_X, val_seq_X)
    
    # Scale the feature matrices
    train_feat_scaled = scaler.fit_transform(train_feat_X.reshape(train_feat_X.shape[0], -1))
    val_feat_scaled = scaler.transform(val_feat_X.reshape(val_feat_X.shape[0], -1))
    test_feat_scaled = scaler.transform(test_feat_X.reshape(test_feat_X.shape[0], -1))

    # Concatenate all encoded/processed datasets
    train_X_combined = np.hstack((train_emoticon_encoded, train_seq_encoded, train_feat_scaled))
    val_X_combined = np.hstack((val_emoticon_encoded, val_seq_encoded, val_feat_scaled))
    test_X_combined = np.hstack((test_emoticon_encoded, test_seq_encoded, test_feat_scaled))
    
    # Convert labels to numpy arrays
    train_Y_combined = np.array(train_emoticon_Y)
    val_Y_combined = np.array(val_emoticon_Y)
    
    # List of classifiers to test
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=200, solver='liblinear', random_state=42)
    }
    
    # Open a text file to write the predictions
    with open("pred_combined.txt", "w") as f:
        # Train each classifier and evaluate accuracy
        for name, model in classifiers.items():
            # Fit the model on training data
            model.fit(train_X_combined, train_Y_combined)
            
            # Make predictions on the validation data
            val_pred = model.predict(val_X_combined)
            
            # Calculate and print accuracy
            val_accuracy = accuracy_score(val_Y_combined, val_pred)
            print(f"Validation Accuracy for {name}: {val_accuracy}")
            
            # Write validation accuracy to the file
            f.write(f"Validation Accuracy for {name}: {val_accuracy}\n")
    
            # Make predictions on the test data
            test_pred = model.predict(test_X_combined)
            
            # Write the predictions to the file
            f.write(f"Predictions for {name} on Test Data:\n")
            for pred in test_pred:
                f.write(f"{pred}\n")
            f.write("\n")
    
    print("Test predictions saved to pred_combined.txt")

    # Load training dataset for FeatureModel
    train_data = np.load('datasets/train/train_feature.npz')
    train_features = train_data['features']  # Shape: (7080, 13, 768)
    train_labels = train_data['label']  # Shape: (7080,)

    # Initialize and train FeatureModel
    feature_model = FeatureModel()
    feature_model.train(train_features, train_labels)

    # Evaluate the model on the validation set
    valid_data = np.load('datasets/valid/valid_feature.npz')
    valid_features = valid_data['features']  # Shape: (validation_samples, 13, 768)
    valid_labels = valid_data['label']  # Validation labels
    feature_model.evaluate(valid_features, valid_labels)

    # Load test dataset for FeatureModel
    test_data = np.load('datasets/test/test_feature.npz')
    test_features = test_data['features']  # Test features
    test_predictions_feature = feature_model.predict(test_features)

    # Save predictions of FeatureModel to a text file
    save_predictions_to_text(test_predictions_feature, 'pred_deepfeat.txt')

    # Load training dataset for EmoticonModel
    train_emoticon_data = pd.read_csv('datasets/train/train_emoticon.csv')
    valid_emoticon_data = pd.read_csv('datasets/valid/valid_emoticon.csv')
    test_emoticon_data = pd.read_csv('datasets/test/test_emoticon.csv')

    # Target labels for emoticon model
    y_train_emoticon = train_emoticon_data['label'].values
    y_valid_emoticon = valid_emoticon_data['label'].values

    # Create the EmoticonModel instance
    emoticon_model = EmoticonModel()
    emoticon_model.train(train_emoticon_data['input_emoticon'], y_train_emoticon)
    emoticon_model.evaluate(valid_emoticon_data['input_emoticon'], y_valid_emoticon)

    # Make predictions on the test data for EmoticonModel
    test_predictions_emoticon = emoticon_model.predict(test_emoticon_data['input_emoticon'])

    # Save predictions of EmoticonModel to a text file
    save_predictions_to_text(test_predictions_emoticon, 'pred_emoticon.txt')

    # Load the text sequence dataset for TextSeqModel
    train_seq_data = pd.read_csv('datasets/train/train_text_seq.csv')
    valid_seq_data = pd.read_csv('datasets/valid/valid_text_seq.csv')
    test_seq_data = pd.read_csv('datasets/test/test_text_seq.csv')

    # Target labels for text sequence model
    y_train_seq = train_seq_data['label'].values
    y_valid_seq = valid_seq_data['label'].values

    # Create the TextSeqModel instance
    text_model = TextSeqModel()
    text_model.train(train_seq_data['input_str'], y_train_seq, valid_seq_data['input_str'], y_valid_seq)
    text_model.evaluate(valid_seq_data['input_str'], y_valid_seq)

    # Make predictions on the test data for TextSeqModel
    test_predictions_seq = text_model.predict(test_seq_data['input_str'])

    # Save predictions of TextSeqModel to a text file
    save_predictions_to_text(test_predictions_seq, 'pred_textseq.txt')

    print("Test predictions have been saved to pred_textseq.txt files.")


Validation Accuracy for Random Forest: 0.9795501022494888


Parameters: { "use_label_encoder" } are not used.



Validation Accuracy for XGBoost: 0.983640081799591
Validation Accuracy for Logistic Regression: 0.9815950920245399
Test predictions saved to pred_combined.txt
Number of parameters in the Logistic Regression model: 101
Accuracy: 98.57%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       252
           1       0.98      0.99      0.99       237

    accuracy                           0.99       489
   macro avg       0.99      0.99      0.99       489
weighted avg       0.99      0.99      0.99       489

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Accuracy: 92.84%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       252
           1       0.92      0.93      0.93       237

    accuracy                           0.93       489
   macro avg       0.93      0.93      0.93       489
weighted avg       0.93      0.93      0.93



Epoch 1/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.5232 - loss: 0.6919 - val_accuracy: 0.5910 - val_loss: 0.6752
Epoch 2/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.5957 - loss: 0.6652 - val_accuracy: 0.6360 - val_loss: 0.6439
Epoch 3/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.6041 - loss: 0.6522 - val_accuracy: 0.6360 - val_loss: 0.6430
Epoch 4/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.6106 - loss: 0.6516 - val_accuracy: 0.6299 - val_loss: 0.6408
Epoch 5/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.6144 - loss: 0.6411 - val_accuracy: 0.6196 - val_loss: 0.6462
Epoch 6/150
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.6311 - loss: 0.6440 - val_accuracy: 0.6360 - val_loss: 0.6416
Epoch 7/150
[1