In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
import os
import pandas as pd

# Folder path containing your CSV files
folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterAssigningTheThreeEmotionLabels"

# Initialize an empty list to store all processed data
all_data = []

# Initialize counter for rows in 'TEXT' column
text_row_count = 0

# Preprocessing function to clean text (same as in second approach)
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    return text

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)

        # Load the dataset
        df = pd.read_csv(file_path)

        # Process each row in the current file
        for index, row in df.iterrows():
            # Assuming 'TEXT' column contains tokenized sentences (as list-like format)
            sentences = eval(row['TEXT']) if pd.notna(row['TEXT']) else []
            label = row['EMOTION']

            # For each sentence, preprocess and append to all_data
            for sentence in sentences:
                # Only count non-empty sentences in the 'TEXT' column
                if sentence.strip():  # Check if the sentence is not empty
                    text_row_count += 1  # Increment the counter

                # Apply text preprocessing
                processed_sentence = preprocess_text(sentence)
                all_data.append({'TEXT': processed_sentence, 'EMOTION': label})

# Convert the processed data into a DataFrame
final_df = pd.DataFrame(all_data)

# Output the total count of rows in the 'TEXT' column across all files
print(f"Total number of rows in 'TEXT' column across all files: {text_row_count}")


Total number of rows in 'TEXT' column across all files: 8805


In [None]:
import os
import pandas as pd

# Folder path containing your CSV files
folder_path = "/content/drive/MyDrive/TextClassification/Preproccessing/AfterAssigningTheThreeEmotion"

# Initialize an empty list to store all processed data
all_data = []

# Preprocessing function to clean text
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    return text

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)

        # Load the dataset
        df = pd.read_csv(file_path)

        # Process each row in the current file
        for index, row in df.iterrows():
            # Assuming 'TEXT' column contains a single word, not a list
            word = row['TEXT'] if pd.notna(row['TEXT']) else ""
            label = row['EMOTION']

            # Apply text preprocessing
            processed_word = preprocess_text(word)
            all_data.append({'TEXT': processed_word, 'EMOTION': label})

# Convert the processed data into a DataFrame
final_df = pd.DataFrame(all_data)

# Print the total number of rows processed and the final DataFrame's size
print(f"Total rows processed across all files: {len(all_data)}")
print(f"Total rows in final_df: {len(final_df)}")

Total rows processed across all files: 1592
Total rows in final_df: 1592


In [None]:
!pip install scikit-learn



In [21]:
# Prepare data for training (Text and Labels)
X = final_df['TEXT']  # Use a different variable name to avoid overwriting
y = final_df['EMOTION']

In [2]:
# Preprocess the text data
def preprocess_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    return text

# Apply preprocessing to the training text
# X = X.apply(preprocess_text)

In [23]:
# Feature extraction using Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer # Import CountVectorizer from sklearn

# Feature extraction using Bag of Words (BoW)
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bow = bow_vectorizer.fit_transform(X)
num_features_bow = len(bow_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using Bag of Words: {num_features_bow}")

Number of unique features (terms) using Bag of Words: 6489


In [24]:
# Feature extraction using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer from sklearn

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(X)  # Use a different variable name
num_features_tfidf = len(tfidf_vectorizer.get_feature_names_out())
print(f"Number of unique features (terms) using TF-IDF: {num_features_tfidf}")

Number of unique features (terms) using TF-IDF: 6489


In [25]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split # Import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [26]:
# Import the necessary class from scikit-learn
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes model
nb_model = MultinomialNB()
# Now use X_train_tfidf, which is aligned with y_train
nb_model.fit(X_tfidf, y)

In [27]:
# Evaluate Naive Bayes model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Import necessary functions

# Evaluate Naive Bayes model
# Use X_test_tfidf for prediction, as the model is trained on TF-IDF features
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7837

Classification Report:
              precision    recall  f1-score   support

         neg       0.69      0.99      0.81       146
         neu       0.97      0.52      0.68        75
         pos       0.97      0.67      0.80        98

    accuracy                           0.78       319
   macro avg       0.88      0.73      0.76       319
weighted avg       0.84      0.78      0.78       319


Confusion Matrix:
[[145   0   1]
 [ 35  39   1]
 [ 31   1  66]]


In [28]:
# Apply GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV # Import GridSearchCV

# Hyperparameter tuning using GridSearchCV
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False]}
grid_search = GridSearchCV(nb_model, param_grid, cv=5, n_jobs=-1, verbose=1)
# Fit using the transformed training data (X_train_tfidf)
grid_search.fit(X_tfidf, y)  # Changed line to use X_train_tfidf
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'alpha': 0.5, 'fit_prior': True}


In [29]:
# Use the best model from grid search for final evaluation
best_model = grid_search.best_estimator_

# Predict using the best model - X_test is already transformed
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy with best parameters: {accuracy_best:.4f}")
print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_pred_best))
print("\nConfusion Matrix for Best Model:")
print(confusion_matrix(y_test, y_pred_best))

Accuracy with best parameters: 0.8621

Classification Report for Best Model:
              precision    recall  f1-score   support

         neg       0.79      0.99      0.88       146
         neu       0.98      0.72      0.83        75
         pos       0.95      0.79      0.86        98

    accuracy                           0.86       319
   macro avg       0.91      0.83      0.86       319
weighted avg       0.88      0.86      0.86       319


Confusion Matrix for Best Model:
[[144   0   2]
 [ 19  54   2]
 [ 20   1  77]]


In [30]:
# Import the joblib library
import joblib

# Save the model and vectorizer
joblib.dump(best_model, "/content/drive/MyDrive/TextClassification/Model/best_model.pkl")
joblib.dump(tfidf_vectorizer, "/content/drive/MyDrive/TextClassification/Model/tfidf_vectorizer.pkl")
print(f"Model and vectorizer have been saved to the respective paths.")

Model and vectorizer have been saved to the respective paths.


In [31]:
# Load the model and vectorizer (for later predictions)
loaded_model = joblib.load("/content/drive/MyDrive/TextClassification/Model/best_model.pkl")
loaded_vectorizer = joblib.load("/content/drive/MyDrive/TextClassification/Model/tfidf_vectorizer.pkl")

In [10]:
# Load the saved model and vectorizer
import joblib
loaded_model = joblib.load("Model/best_model.pkl")
loaded_vectorizer = joblib.load("Model/tfidf_vectorizer.pkl")

# Predict on new data
pos_data = ["The sun warmed my face as I watched the children play, their laughter filling the air like a sweet melody."]
neg_data = ["I looked at the task ahead of me, the pieces scattered everywhere, and I could feel the weight of every minute slipping away with no progress."]
new_data = ["The sun warmed my face as I watched the children play, their laughter filling the air like a sweet melody.", "I couldn’t believe what I was hearing; every word seemed to hit harder than the last, making my pulse race with a tight knot in my chest.", "I looked at the task ahead of me, the pieces scattered everywhere, and I could feel the weight of every minute slipping away with no progress.", "I could barely sit still, my mind racing with all the possibilities, as the anticipation built up with each passing moment."]

# Preprocess the new data (text)
# new_data_processed = [preprocess_text(text) for text in new_data]
new_data_processed =  [preprocess_text(text) for text in neg_data]

# Transform the new data using the loaded vectorizer
X_new_tfidf = loaded_vectorizer.transform(new_data_processed)

# Make predictions using the loaded model
new_predictions = loaded_model.predict(X_new_tfidf)

nb_probabilities = loaded_model.predict_proba(X_new_tfidf)[0]  # (num_classes,)
# Print the predictions
print(new_predictions)
print(nb_probabilities)

['neg']
[0.67687668 0.16704448 0.15607884]
