In [1]:
import pandas as pd

file_path = './Combined Data.csv'
data = pd.read_csv(file_path)

data_info = data.info()
data_head = data.head()

data_info, data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


(None,
    Unnamed: 0                                          statement   status
 0           0                                         oh my gosh  Anxiety
 1           1  trouble sleeping, confused mind, restless hear...  Anxiety
 2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
 3           3  I've shifted my focus to something else but I'...  Anxiety
 4           4  I'm restless and restless, it's been a month n...  Anxiety)

In [2]:
# Clean the dataset
data_cleaned = data.dropna(subset=['statement'])  # Drop rows with missing 'statement'
data_cleaned = data_cleaned.drop(columns=['Unnamed: 0'])  # Drop the unnecessary index column

# Check the unique statuses for the emotional categories
unique_statuses = data_cleaned['status'].unique()
data_cleaned.shape, unique_statuses


((52681, 2),
 array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
        'Personality disorder'], dtype=object))

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

X = data_cleaned['statement']
y = data_cleaned['status']



In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [5]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9)),  
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [6]:
pipeline.fit(X_train, y_train)

In [7]:
y_pred = pipeline.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, y_pred)

In [9]:
print(accuracy)

0.6996298756761886


In [10]:
from sklearn.linear_model import LogisticRegression

pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9)),  
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])


In [11]:

pipeline_lr.fit(X_train, y_train)


In [12]:

# Predict on the test set
y_pred_lr = pipeline_lr.predict(X_test)


In [13]:

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)

accuracy_lr

0.7609376482869887

In [14]:
# Define a pipeline for text classification using Logistic Regression
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9)),  # Text vectorization
    ('clf', LogisticRegression(max_iter=1000, random_state=42))  # Logistic Regression
])





In [15]:
pipeline_lr.fit(X_train, y_train)

In [16]:

# Predict on the test set
y_pred_lr = pipeline_lr.predict(X_test)

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Output the accuracy
print(f"Model Accuracy: {accuracy_lr:.4f}")

Model Accuracy: 0.7609


In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

pipeline_enhanced = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])



In [18]:
# Cross-validation to evaluate the enhanced pipeline
cv_scores = cross_val_score(pipeline_enhanced, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated Accuracy with Logistic Regression: {cv_scores.mean():.4f}")

Cross-validated Accuracy with Logistic Regression: 0.7522


In [19]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Trying LinearSVC for faster SVM performance
pipeline_linear_svc = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5, max_features=5000)),
    ('clf', LinearSVC(random_state=42))
])


In [20]:
# Cross-validation with LinearSVC
cv_scores_linear_svc = cross_val_score(pipeline_linear_svc, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated Accuracy with LinearSVC: {cv_scores_linear_svc.mean():.4f}")


Cross-validated Accuracy with LinearSVC: 0.7514


In [21]:
# Trying Multinomial Naive Bayes
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5, max_features=5000)),
    ('clf', MultinomialNB())
])


In [22]:
# Cross-validation with MultinomialNB
cv_scores_nb = cross_val_score(pipeline_nb, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated Accuracy with Multinomial Naive Bayes: {cv_scores_nb.mean():.4f}")


Cross-validated Accuracy with Multinomial Naive Bayes: 0.6763


In [23]:
from sklearn.model_selection import GridSearchCV

param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
}

pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5, max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))  
])

grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

best_params_lr = grid_search_lr.best_params_
best_score_lr = grid_search_lr.best_score_

print(f"Best Logistic Regression params: {best_params_lr}, Accuracy: {best_score_lr:.4f}")

param_grid_svc = {
    'clf__C': [0.01, 0.1, 1, 10, 100],  
}

pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.8, min_df=5, max_features=5000)),
    ('clf', LinearSVC(random_state=42, class_weight='balanced')) 
])

grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, scoring='accuracy')
grid_search_svc.fit(X_train, y_train)

best_params_svc = grid_search_svc.best_params_
best_score_svc = grid_search_svc.best_score_

print(f"Best LinearSVC params: {best_params_svc}, Accuracy: {best_score_svc:.4f}")

Best Logistic Regression params: {'clf__C': 1}, Accuracy: 0.7456
Best LinearSVC params: {'clf__C': 0.1}, Accuracy: 0.7502


In [24]:
import joblib

# Save the trained Logistic Regression model and TF-IDF vectorizer
joblib.dump(grid_search_lr.best_estimator_, 'logistic_model.pkl')
joblib.dump(grid_search_svc.best_estimator_, 'svm_model.pkl')
joblib.dump(grid_search_lr.best_estimator_.named_steps['tfidf'], 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [25]:
# Ensure that the logistic model and vectorizer were trained together
print(logistic_model)  # Should show the model details
print(tfidf_vectorizer)  # Should show the vectorizer details


NameError: name 'logistic_model' is not defined

In [26]:
# Ensure that the logistic model and vectorizer were trained together
print(logistic_model)  # Should show the model details
print(tfidf_vectorizer)  # Should show the vectorizer details


NameError: name 'logistic_model' is not defined

In [27]:
def analyze_emotions(input_text):
    # Using the pipeline to predict directly
    try:
        # The pipeline expects raw text input, not a sparse matrix
        prediction = pipeline.predict_proba([input_text])  # Input the raw text
        print(f"Prediction Shape: {prediction.shape}")  # Should be (1, n_classes)
        
    except Exception as e:
        print(f"Error during prediction: {e}")
        raise

    # Assuming classes like ['Anxiety', 'Stress', 'Depression', etc.]
    emotions = pipeline.classes_

    # Return the prediction as a dictionary
    return dict(zip(emotions, prediction[0]))


In [28]:
test_sentence = "I feel overwhelmed and anxious about the future, but I try to stay positive."
print(analyze_emotions(test_sentence))  # Test the function directly


Prediction Shape: (1, 7)
{'Anxiety': 0.27, 'Bipolar': 0.0, 'Depression': 0.29, 'Normal': 0.29, 'Personality disorder': 0.0, 'Stress': 0.02, 'Suicidal': 0.13}


In [29]:
@app.callback(
    [Output('output-result', 'children'),
     Output('emotion-graph', 'figure')],
    [Input('analyze-button', 'n_clicks'),
     Input('input-text', 'value')]
)
def analyze_text(n_clicks, input_text):
    if n_clicks > 0 and input_text:
        try:
            # Analyze emotions
            emotions = analyze_emotions(input_text)
            emotion_str = ', '.join([f'{emotion}: {prob:.2f}' for emotion, prob in emotions.items()])
            
            # Create a pie chart of emotions
            fig = go.Figure(data=[go.Pie(labels=list(emotions.keys()), values=list(emotions.values()))])
            fig.update_layout(title_text='Emotion Distribution')
            
            return f'This text indicates: {emotion_str}', fig
            
        except Exception as e:
            return f"Error during analysis: {e}", go.Figure()  # Return an empty figure on error

    # Default message and empty graph

    return "Enter your text to analyze.", go.Figure()


NameError: name 'app' is not defined