In [6]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK 'punkt' tokenizer data (the captured log reports it as already up to date).
# NOTE(review): the cells shown here tokenise with str.split(), not word_tokenize —
# confirm whether this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sohail/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import tensorflow as tf


# Restrict how much GPU memory TensorFlow may use on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        
        for gpu in gpus:
            # Memory growth is explicitly switched off; a fixed-size virtual
            # device is configured on the GPU instead.
            tf.config.experimental.set_memory_growth(gpu, False)
            tf.config.experimental.set_virtual_device_configuration(
                gpu,
                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])  # 4GB memory
    except RuntimeError as e:
        # Best effort: report the problem and carry on with TensorFlow's defaults.
        # NOTE(review): presumably raised when the GPU was already initialised — confirm.
        print(e)


2024-11-06 16:06:31.826497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730891191.841030   11468 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730891191.845452   11468 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 16:06:31.859815: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Load the pre-split training / validation / test CSV files.
# NOTE(review): absolute, user-specific paths — consider a configurable data directory.
training_data = pd.read_csv('/home/sohail/emotion/training.csv')
validation_data = pd.read_csv('/home/sohail/emotion/validation.csv')
testing_data = pd.read_csv('/home/sohail/emotion/test.csv')

In [8]:
def preprocess_text(text):
    """
    Normalise a single text input and split it into whitespace tokens.

    Steps: coerce to ``str``, lowercase, delete every character that is not
    an ASCII letter or whitespace, collapse runs of whitespace, then split.

    Missing values (``None`` or a float NaN) are treated as empty text
    instead of being stringified into the spurious words "none" / "nan".

    Args:
        text: Raw input. Non-string values are coerced with ``str()``.

    Returns:
        tuple: ``(cleaned_text, tokens)`` where ``tokens`` is the list of
        remaining words and ``cleaned_text`` is those words joined by single
        spaces.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return '', []

    lowered = str(text).lower()

    # Characters are deleted rather than replaced by a space, so "don't" -> "dont".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    # split() with no argument both trims and collapses whitespace runs.
    tokens = letters_only.split()

    return ' '.join(tokens), tokens


In [9]:
def process_dataset(df, text_column='text', label_column='label', dataset_name=''):
    """
    Clean one dataset, print summary statistics and return the cleaned frame.

    Rows with a missing text or label are dropped, then every text is passed
    through ``preprocess_text`` and the results are stored in two new columns,
    ``cleaned_text`` and ``tokens``. The input frame is not modified.

    Args:
        df: DataFrame holding at least ``text_column`` and ``label_column``.
        text_column: Name of the column with the raw text.
        label_column: Name of the column with the integer emotion label.
        dataset_name: Human-readable name used in the printed header.

    Returns:
        pandas.DataFrame: a copy of the non-missing rows with the two extra
        columns. An empty input yields an empty frame with both columns.
    """
    print(f"\n=== Processing {dataset_name} ===")
    print(f"Initial shape: {df.shape}")

    initial_size = len(df)
    # Explicit copy so the column assignments below always target an
    # independent frame (no SettingWithCopyWarning, caller's data untouched).
    df = df.dropna(subset=[text_column, label_column]).copy()
    print(f"Rows with missing values removed: {initial_size - len(df)}")

    # Collect (cleaned_text, tokens) pairs first; unlike zip(*...) unpacking,
    # this also works when no rows are left.
    processed = [preprocess_text(raw_text) for raw_text in df[text_column]]
    df['cleaned_text'] = pd.Series(
        [cleaned for cleaned, _ in processed], index=df.index, dtype=object
    )
    df['tokens'] = pd.Series(
        [tokens for _, tokens in processed], index=df.index, dtype=object
    )

    # Calculate statistics
    token_counts = [len(tokens) for _, tokens in processed]
    avg_token_length = sum(token_counts) / len(token_counts) if token_counts else 0.0
    print(f"Average tokens per text: {avg_token_length:.2f}")

    # Show label distribution
    print("\nLabel distribution:")
    label_dist = df[label_column].value_counts().sort_index()
    total = len(df)

    # Integer label -> emotion name
    emotion_map = {
        0: 'sadness',
        1: 'joy',
        2: 'love',
        3: 'anger',
        4: 'fear',
        5: 'surprise'
    }

    for label, count in label_dist.items():
        percentage = (count/total) * 100
        emotion = emotion_map.get(label, f'unknown_{label}')  # Safely handle any unexpected labels
        print(f"{emotion} ({label}): {count} ({percentage:.1f}%)")

    # Print unique labels for verification
    print("\nUnique labels in dataset:", sorted(df[label_column].unique()))

    return df

In [10]:
# Run the shared cleaning pipeline over each split, in train / validation / test order.
processed_training, processed_validation, processed_testing = (
    process_dataset(split_frame, dataset_name=split_name)
    for split_frame, split_name in (
        (training_data, 'Training Data'),
        (validation_data, 'Validation Data'),
        (testing_data, 'Testing Data'),
    )
)

# Preview the first two cleaned training rows.
print("\n=== Sample of processed training data ===")
training_preview = processed_training[['cleaned_text', 'tokens']]
print(training_preview.head(2))


=== Processing Training Data ===
Initial shape: (16000, 2)
Rows with missing values removed: 0
Average tokens per text: 19.17

Label distribution:
sadness (0): 4666 (29.2%)
joy (1): 5362 (33.5%)
love (2): 1304 (8.2%)
anger (3): 2159 (13.5%)
fear (4): 1937 (12.1%)
surprise (5): 572 (3.6%)

Unique labels in dataset: [0, 1, 2, 3, 4, 5]

=== Processing Validation Data ===
Initial shape: (2000, 2)
Rows with missing values removed: 0
Average tokens per text: 18.87

Label distribution:
sadness (0): 550 (27.5%)
joy (1): 704 (35.2%)
love (2): 178 (8.9%)
anger (3): 275 (13.8%)
fear (4): 212 (10.6%)
surprise (5): 81 (4.0%)

Unique labels in dataset: [0, 1, 2, 3, 4, 5]

=== Processing Testing Data ===
Initial shape: (2000, 2)
Rows with missing values removed: 0
Average tokens per text: 19.15

Label distribution:
sadness (0): 581 (29.0%)
joy (1): 695 (34.8%)
love (2): 159 (8.0%)
anger (3): 275 (13.8%)
fear (4): 224 (11.2%)
surprise (5): 66 (3.3%)

Unique labels in dataset: [0, 1, 2, 3, 4, 5]

=== 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [12]:
class TextFeatureEngineering:
    """
    Builds two views of a text corpus: TF-IDF vectors and padded integer
    sequences.

    Both the TF-IDF vectorizer and the Keras tokenizer are fitted on the
    training texts only and then applied to the optional validation / test
    texts.
    """

    def __init__(self, max_features=5000, max_length=1000):
        """
        Args:
            max_features: Value passed to ``TfidfVectorizer(max_features=...)``
                and to ``Tokenizer(num_words=...)``.
            max_length: ``maxlen`` handed to ``pad_sequences`` for every split.
        """
        self.max_features = max_features
        self.max_length = max_length
        self.tfidf = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1, 2),  # Unigrams and bigrams
            stop_words='english'
        )
        self.tokenizer = Tokenizer(num_words=max_features)

    def create_tfidf_features(self, train_texts, val_texts=None, test_texts=None):
        """
        Create TF-IDF features for text data.

        The vectorizer is fitted on ``train_texts``; ``val_texts`` and
        ``test_texts`` are only transformed.

        Returns:
            dict with keys ``'train'``, ``'val'``, ``'test'`` (``None`` for a
            split that was not supplied) and ``'feature_names'``.
        """
        print("Creating TF-IDF features...")
        # Fit and transform training data
        X_train_tfidf = self.tfidf.fit_transform(train_texts)

        # Transform validation and test if provided
        X_val_tfidf = self.tfidf.transform(val_texts) if val_texts is not None else None
        X_test_tfidf = self.tfidf.transform(test_texts) if test_texts is not None else None

        # Get feature names for analysis
        feature_names = self.tfidf.get_feature_names_out()
        print(f"Number of TF-IDF features: {len(feature_names)}")

        return {
            'train': X_train_tfidf,
            'val': X_val_tfidf,
            'test': X_test_tfidf,
            'feature_names': feature_names
        }

    def _texts_to_padded(self, texts):
        """Encode ``texts`` with the fitted tokenizer and post-pad them; ``None`` passes through."""
        if texts is None:
            return None
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_length, padding='post')

    def create_sequence_features(self, train_texts, val_texts=None, test_texts=None):
        """
        Create padded integer-sequence features for deep learning models.

        The tokenizer is fitted on ``train_texts`` only.

        Returns:
            dict with keys ``'train'``, ``'val'``, ``'test'`` (``None`` for a
            split that was not supplied) and ``'tokenizer'``.
        """
        print("Creating sequence features...")
        # Fit tokenizer on training data
        self.tokenizer.fit_on_texts(train_texts)

        X_train_pad = self._texts_to_padded(train_texts)
        X_val_pad = self._texts_to_padded(val_texts)
        X_test_pad = self._texts_to_padded(test_texts)

        print(f"Vocabulary size: {len(self.tokenizer.word_index) + 1}")
        print(f"Sequence length: {self.max_length}")

        return {
            'train': X_train_pad,
            'val': X_val_pad,
            'test': X_test_pad,
            'tokenizer': self.tokenizer
        }

    def analyze_tfidf_features(self, tfidf_features, labels):
        """
        Print the ten features with the highest mean TF-IDF score per emotion.

        Args:
            tfidf_features: dict as returned by ``create_tfidf_features``;
                only ``'feature_names'`` and ``'train'`` are read.
            labels: One label per training row, in row order. May be a list,
                a NumPy array or a pandas Series (matched by position).

        Raises:
            ValueError: if the number of labels differs from the number of
                rows in the training matrix.
        """
        print("\nAnalyzing important features for each emotion...")
        feature_names = tfidf_features['feature_names']
        X_train_tfidf = tfidf_features['train']

        # A positional array makes `labels == emotion` an element-wise mask
        # for plain lists too, and ignores any pandas index.
        labels = np.asarray(labels).ravel()
        if labels.shape[0] != X_train_tfidf.shape[0]:
            raise ValueError(
                f"Got {labels.shape[0]} labels for {X_train_tfidf.shape[0]} training rows"
            )

        for emotion in sorted(set(labels.tolist())):
            # Boolean row mask for this emotion
            emotion_mask = labels == emotion

            # Mean TF-IDF score per feature. np.asarray(...).ravel() flattens
            # np.matrix, sparse-array and dense results alike.
            emotion_scores = np.asarray(X_train_tfidf[emotion_mask].mean(axis=0)).ravel()

            # Get top features (highest score first)
            top_indices = emotion_scores.argsort()[-10:][::-1]
            top_features = [(feature_names[i], emotion_scores[i]) for i in top_indices]

            print(f"\nTop features for emotion {emotion}:")
            for feature, score in top_features:
                print(f"{feature}: {score:.4f}")

def create_features(processed_training, processed_validation, processed_testing,
                    max_features=5000, max_length=100,
                    text_column='cleaned_text', label_column='label'):
    """
    Create TF-IDF and padded-sequence features for all three datasets.

    Args:
        processed_training: Training frame containing ``text_column`` and
            ``label_column``.
        processed_validation: Validation frame containing ``text_column``.
        processed_testing: Test frame containing ``text_column``.
        max_features: Forwarded to ``TextFeatureEngineering``.
        max_length: Forwarded to ``TextFeatureEngineering``.
        text_column: Column holding the cleaned text in every frame.
        label_column: Column holding the training labels, used for the
            per-emotion feature report.

    Returns:
        tuple: ``(tfidf_features, sequence_features)`` — the dictionaries
        returned by ``create_tfidf_features`` and ``create_sequence_features``.
    """
    # Initialize feature engineering
    feature_eng = TextFeatureEngineering(max_features=max_features, max_length=max_length)

    print("Processing features for training set of shape:", processed_training.shape)

    train_texts = processed_training[text_column]
    val_texts = processed_validation[text_column]
    test_texts = processed_testing[text_column]

    # Create TF-IDF features
    tfidf_features = feature_eng.create_tfidf_features(train_texts, val_texts, test_texts)

    # Create sequence features for deep learning
    sequence_features = feature_eng.create_sequence_features(train_texts, val_texts, test_texts)

    # Print the strongest TF-IDF features per emotion
    feature_eng.analyze_tfidf_features(tfidf_features, processed_training[label_column])

    return tfidf_features, sequence_features

In [13]:
# Build TF-IDF and padded-sequence features for all three splits.
tfidf_features, sequence_features = create_features(processed_training, processed_validation, processed_testing)

Processing features for training set of shape: (16000, 4)
Creating TF-IDF features...
Number of TF-IDF features: 5000
Creating sequence features...
Vocabulary size: 15213
Sequence length: 100

Analyzing important features for each emotion...

Top features for emotion 0:
feel: 0.0608
feeling: 0.0408
like: 0.0276
im: 0.0236
feel like: 0.0208
just: 0.0161
really: 0.0119
im feeling: 0.0118
know: 0.0116
ive: 0.0098

Top features for emotion 1:
feel: 0.0611
feeling: 0.0363
like: 0.0282
im: 0.0242
feel like: 0.0214
im feeling: 0.0142
just: 0.0136
really: 0.0114
time: 0.0109
pretty: 0.0097

Top features for emotion 2:
feel: 0.0581
feeling: 0.0354
like: 0.0348
feel like: 0.0265
im: 0.0235
loving: 0.0185
love: 0.0170
caring: 0.0168
sweet: 0.0160
sympathetic: 0.0159

Top features for emotion 3:
feel: 0.0569
feeling: 0.0414
like: 0.0273
im: 0.0265
feel like: 0.0221
just: 0.0179
im feeling: 0.0146
angry: 0.0130
irritable: 0.0126
really: 0.0123

Top features for emotion 4:
feel: 0.0523
feeling: 0.04

In [14]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

In [15]:
# Hyper-parameters used by the tokenizer, embedding matrix and LSTM model cells below.
max_length = 59  # padded sequence length; equals the overall max sequence length printed further down
embedding_dim = 200  # matches the glove.6B.200d file loaded below
num_classes = 6  # emotion labels 0-5
max_features = 1000  # passed to Tokenizer(num_words=...)


In [20]:
def load_glove_embeddings(glove_file):
    """
    Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each line is expected to look like ``word v1 v2 ... vN`` with
    whitespace-separated fields. Blank lines and lines that hold a word but
    no coefficients are skipped, so they neither crash the parser nor add an
    empty vector.

    Args:
        glove_file: Path to a UTF-8 encoded embeddings text file.

    Returns:
        dict: word -> 1-D ``float32`` NumPy array of coefficients.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Need at least the word plus one coefficient.
            if len(values) < 2:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

In [19]:
def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim):
    """
    Build the weight matrix for an embedding layer from pre-trained vectors.

    Row ``i`` holds the vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Words without an entry in
    ``embeddings_index`` keep an all-zero row. The matrix has
    ``len(word_index) + 1`` rows.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> int.
        embeddings_index: Mapping of word -> vector of length ``embedding_dim``.
        embedding_dim: Number of columns of the matrix.

    Returns:
        numpy.ndarray of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    vocabulary = tokenizer.word_index
    matrix = np.zeros((len(vocabulary) + 1, embedding_dim))
    for token, row in vocabulary.items():
        vector = embeddings_index.get(token)
        if vector is None:
            # Out-of-vocabulary word: leave the zero row in place.
            continue
        matrix[row] = vector
    return matrix

In [18]:
def build_lstm_model(embedding_matrix, max_length, num_classes):
    """
    Assemble and compile the stacked-LSTM emotion classifier.

    Architecture, in order: frozen embedding initialised from
    ``embedding_matrix`` -> LSTM(512, returns sequences) -> LSTM(256) ->
    Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(num_classes, softmax).

    Args:
        embedding_matrix: 2-D array; its shape supplies the embedding layer's
            ``input_dim`` and ``output_dim``.
        max_length: Passed to the embedding layer as ``input_length``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam with
        learning rate 0.0001, accuracy metric).
    """
    vocab_rows, vector_size = embedding_matrix.shape

    layer_stack = [
        # Pre-trained vectors are frozen; set trainable=True to fine-tune them.
        Embedding(
            input_dim=vocab_rows,
            output_dim=vector_size,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=max_length,
            trainable=False,
        ),
        # First recurrent layer emits the full sequence for the second one.
        LSTM(512, return_sequences=True),
        # Second recurrent layer emits only its final state.
        LSTM(256, return_sequences=False),
        # Fully connected head
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        # One probability per class
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.0001),
        metrics=['accuracy'],
    )
    return model


In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [21]:
# Fit a fresh tokenizer on the cleaned training text only, then encode all three splits with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(processed_training['cleaned_text'])
X_train_seq = tokenizer.texts_to_sequences(processed_training['cleaned_text'])
X_val_seq = tokenizer.texts_to_sequences(processed_validation['cleaned_text'])
X_test_seq = tokenizer.texts_to_sequences(processed_testing['cleaned_text'])
# NOTE(review): absolute, user-specific path — consider a configurable location.
glove_file = '/home/sohail/glove/glove.6B.200d.txt'

# Pad the sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')
embedding_index = load_glove_embeddings(glove_file)  # Update path
# The matrix gets one row per entry of tokenizer.word_index (plus row 0),
# independent of max_features.
embedding_matrix = create_embedding_matrix(tokenizer, embedding_index, embedding_dim)

In [23]:
import pickle

# Persist the fitted tokenizer so the inference cell can reload exactly the
# same word -> index mapping from 'tokenizer.pickle'.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [22]:
# Longest encoded (unpadded) sequence in each split, in train / validation / test order
max_seq_train, max_seq_val, max_seq_test = (
    max(map(len, split_sequences))
    for split_sequences in (X_train_seq, X_val_seq, X_test_seq)
)

# Largest of the three per-split maxima
overall_max_seq = max((max_seq_train, max_seq_val, max_seq_test))

# Report the per-split and overall lengths
print(f"Max sequence length in training set: {max_seq_train}")
print(f"Max sequence length in validation set: {max_seq_val}")
print(f"Max sequence length in test set: {max_seq_test}")
print(f"Overall max sequence length: {overall_max_seq}")


Max sequence length in training set: 59
Max sequence length in validation set: 56
Max sequence length in test set: 54
Overall max sequence length: 59


In [20]:
# Find the maximum sequence length in each set
# NOTE(review): this repeats the computation of the previous cell under different
# names; consider keeping only one of the two cells.
train_max_length = max(len(seq) for seq in X_train_seq)
val_max_length = max(len(seq) for seq in X_val_seq)
test_max_length = max(len(seq) for seq in X_test_seq)

# Determine the overall maximum sequence length
# This rebinds the module-level max_length that the padding and model cells read.
max_length = max(train_max_length, val_max_length, test_max_length)

print(f"Max sequence length in training set: {train_max_length}")
print(f"Max sequence length in validation set: {val_max_length}")
print(f"Max sequence length in test set: {test_max_length}")
print(f"Overall max sequence length: {max_length}")


Max sequence length in training set: 59
Max sequence length in validation set: 56
Max sequence length in test set: 54
Overall max sequence length: 59


In [21]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the label encoder on the training labels only, then reuse it for the
# validation and test labels so all splits share one label -> id mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(processed_training['label'])
y_val = label_encoder.transform(processed_validation['label'])
y_test = label_encoder.transform(processed_testing['label'])

# One-hot encode the integer ids. The width comes from num_classes (the same
# constant that sizes the model's softmax layer) rather than a repeated
# literal, so labels and model output cannot drift apart.
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)


In [22]:
# Instantiate the LSTM classifier on top of the GloVe-initialised embedding matrix.
model = build_lstm_model(embedding_matrix, max_length, num_classes)


I0000 00:00:1730891321.856978   11468 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4096 MB memory:  -> device: 0, name: Quadro T2000, pci bus id: 0000:01:00.0, compute capability: 7.5


In [23]:
import tensorflow as tf
# Sanity check: how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


In [25]:
# Train for a fixed 20 epochs, evaluating on the validation split after each epoch.
# `history` keeps the object returned by fit().
# NOTE(review): no random seed is set anywhere in the visible cells, so runs
# are presumably not reproducible — confirm.
history = model.fit(X_train_pad, y_train, 
                    epochs=20, 
                    batch_size=64, 
                    validation_data=(X_val_pad, y_val),
                    )

Epoch 1/20


I0000 00:00:1730891405.332728   11947 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 63ms/step - accuracy: 0.3064 - loss: 1.6467 - val_accuracy: 0.3520 - val_loss: 1.5825
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 63ms/step - accuracy: 0.3188 - loss: 1.5835 - val_accuracy: 0.3710 - val_loss: 1.5694
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 63ms/step - accuracy: 0.3568 - loss: 1.5640 - val_accuracy: 0.4325 - val_loss: 1.4310
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 63ms/step - accuracy: 0.4801 - loss: 1.3954 - val_accuracy: 0.5150 - val_loss: 1.3092
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 63ms/step - accuracy: 0.5231 - loss: 1.2831 - val_accuracy: 0.5540 - val_loss: 1.1996
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 62ms/step - accuracy: 0.5462 - loss: 1.2088 - val_accuracy: 0.5765 - val_loss: 1.1185
Epoch 7/20
[1m250/250[0m 

In [26]:
# Save the trained model; the inference cell loads it back by this exact
# filename, so the spelling must stay in sync with that cell.
model.save('sentiment_anaylisis.keras')

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle

# Load your trained model
model = load_model('sentiment_anaylisis.keras')

# Load the tokenizer from file
# NOTE(review): pickle.load can execute arbitrary code — only load a
# tokenizer.pickle that this notebook wrote itself.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Map from predicted class index to emotion name
emotion_labels = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

def predict_sentiment(user_input, tokenizer, max_length):
    """
    Predict the emotion expressed by a single piece of text.

    The input is normalised the same way as the training text (lowercased,
    everything except ASCII letters and whitespace removed, whitespace
    collapsed) before tokenising, so inference sees the same word forms the
    model was trained on (e.g. "don't" -> "dont").

    Reads the module-level ``model`` and ``emotion_labels``.

    Args:
        user_input: Raw text to classify.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Length the encoded sequence is post-padded to; should
            match the padding length used for training.

    Returns:
        str: the emotion name with the highest predicted probability.
    """
    # Local import keeps this cell usable in a fresh kernel that only ran
    # the inference imports.
    import re

    # Same cleaning steps as the training-time preprocessing.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', str(user_input).lower())
    cleaned = ' '.join(cleaned.split())

    # Tokenize and pad
    input_seq = tokenizer.texts_to_sequences([cleaned])
    input_pad = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Make a prediction
    prediction = model.predict(input_pad)

    # Index of the class with the highest probability, as a plain int
    predicted_label = int(np.argmax(prediction, axis=1)[0])

    # Map the predicted index to its emotion name
    return emotion_labels[predicted_label]

# Example usage


I0000 00:00:1730963724.176510    1795 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted sentiment: joy


In [None]:
# Example usage. An empty string encodes to an all-padding sequence, so put
# real text here to get a meaningful prediction.
user_input = ""
# Pad to 59, the length used for the training sequences, not 63.
predicted_sentiment = predict_sentiment(user_input, tokenizer, max_length=59)
print(f"Predicted sentiment: {predicted_sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396ms/step
Predicted sentiment: anger
