<a href="https://colab.research.google.com/github/rihemmaarefe/EmotionDetection/blob/main/prod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the necessary libraries and data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the labelled training dataset
data = pd.read_csv('/content/drive/MyDrive/stage/trainDataset.csv')

# Drop rows whose text or label is missing before training: CountVectorizer
# raises on NaN documents and the classifier cannot be fitted on NaN targets.
# `data` itself is left untouched so the raw frame stays inspectable.
train_data = data.dropna(subset=['meaning', 'category'])

X = train_data['meaning']
y = train_data['category']

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words features; the vocabulary is fitted on the training split only
# and then reused unchanged for the test split and the new file.
vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier on the word counts
model = MultinomialNB()
model.fit(X_train_counts, y_train)

# Evaluate the model's accuracy on the held-out testing set.
# The evaluation predictions get their own name so they are not confused with
# the predictions made for the new file further down.
y_test_pred = model.predict(X_test_counts)
accuracy = accuracy_score(y_test, y_test_pred)
print('Accuracy:', accuracy)

# Load the new file to be labeled
new_data = pd.read_csv('/content/drive/MyDrive/stage/data/546070.csv')  # Adjust the path to your new file

# Handling missing values in the new data: every row must receive a prediction,
# so missing text is replaced with an empty string instead of being dropped.
new_data['meaning'] = new_data['meaning'].fillna('')

# Preprocess the new data using the same (already fitted) vectorizer
X_new_counts = vectorizer.transform(new_data['meaning'])

# Make predictions on the new data
y_pred = model.predict(X_new_counts)

# Add the predicted labels to the new data
new_data['predicted_label1'] = y_pred

# Save the labeled data to a new file
new_data.to_csv('/content/drive/MyDrive/stage/model/withPrediction.csv', index=False)  # Adjust the path and filename as needed

In [None]:
!pip install fasttext

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Read back the CSV that holds the words together with their predicted labels
predictions_csv = '/content/drive/MyDrive/stage/model/withPrediction.csv'
dataset = pd.read_csv(predictions_csv)

# Clean the "meaning" column
_ENGLISH_STOP_WORDS = None  # built lazily on first use, then reused for every row


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text):
    """Normalise one text value for vectorisation.

    The value is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenised with ``word_tokenize`` and
    filtered against the NLTK English stop-word list.

    Args:
        text: The raw cell value. Missing values (``None``/NaN) are accepted;
            other non-string values are converted with ``str`` first.

    Returns:
        The remaining tokens joined by single spaces, or an empty string when
        the value is missing.
    """
    if pd.isnull(text):  # Check if the value is NaN
        return ''  # Replace with an empty string

    # A column that pandas parsed as numbers would otherwise fail on .lower()
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords (the set is cached instead of being rebuilt per call)
    stop_words = _english_stop_words()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(filtered_tokens)

# Apply the cleaning function to the "meaning" column.
# The trailing fillna is a plain (non-inplace) guard so that no missing value
# reaches the vectorizer; the chained `inplace=True` form is deprecated in
# pandas 2.x and is not guaranteed to update the DataFrame.
dataset['meaning'] = dataset['meaning'].apply(clean_text).fillna('')

# Feature extraction using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset['meaning'])

# Emotion names used to label the clusters
emotion_names = ['joy', 'sadness', 'anger', 'excitement', 'fear', 'love', 'trust', 'disgust', 'anticipation', 'surprise']

# clustering
# One cluster per emotion name (10 emotions), so the index lookup below can
# never go out of range if the list is edited.
num_clusters = len(emotion_names)
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Map cluster labels to emotion names
# NOTE(review): KMeans cluster ids are arbitrary integers, so this assigns the
# names by list position only; nothing here ties cluster i to the i-th emotion.
# Confirm that this positional naming is really what is wanted.
cluster_names = [emotion_names[label] for label in kmeans.labels_]

# Create a new DataFrame with cluster labels and word information
cluster_data = pd.DataFrame({
    'Word': dataset['word'],
    'Type': dataset['type'],
    'Meaning': dataset['meaning'],
    'Prediction': dataset['predicted_label1'],
    'Emotion': cluster_names
})

# Save the new DataFrame to a CSV file
cluster_data.to_csv('emotionClusterF.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
