# import modules


In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.pipeline import make_pipeline
import joblib
import os
import sys
from tqdm import tqdm

In [2]:
def load_necessary_lib():
    # Preload NLTK data
    nltk.download('stopwords')
    nltk.download('punkt')
    print("Libraries loaded successfully")

In [1]:
def load_dataset():
    # Load the dataset if not present
    try:
        if os.path.exists('suicide_dataset.csv'):
            print("File exists")
            dataset_path = 'suicide_dataset.csv'
        else:
            dataset = load_dataset("Ram07/Detection-for-Suicide")
            df = pd.DataFrame(dataset['train'])
            df.to_csv('suicide_dataset.csv', index=False)
            dataset_path = 'suicide_dataset.csv'
    except Exception as e:
        print("Error loading dataset:", e)

    finally:
        # Load data directly from the CSV file
        try:
            data = pd.read_csv(dataset_path)
            print("Database loaded ...._")
            return clean_NA_data(data)
        except Exception as e:
            print("Error loading dataset:", e)
            
            sys.exit()

# //Remove rows without "suicidal" or "non-suicidal" labels
# //data = data[data['class'].isin(['suicidal', 'non-suicidal'])]
# !-----------------------------------------------------------------------------------------------------

In [4]:
def clean_NA_data(X):
    print(X.isna().sum())
    X.fillna(X.mean(), inplace=True)
    return X
    


load_dataset()

File exists


In [4]:

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    # print("preprocessing starting ...........")
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)


In [5]:
def train_model_SDM(X_train,y_train):
    # Train a classification model (SVM)
    vectorizer = TfidfVectorizer()
    classifier = SVC(kernel='linear', verbose=True, probability=True)  # Enable probability estimates
    model = make_pipeline(vectorizer, classifier)
    model.fit(X_train, y_train)
    print("Model trained successfully.")
    save_model()
    return model

## save the model as a .pkl file

In [6]:
def save_model():     
    # Save the trained model
    try:
        joblib.dump(model, 'suicide_detection_model.pkl')
        print("Model saved successfully.")
    except Exception as e:
        print("Error saving model:", e)


In [7]:
def evaluating_SDM(model,X_test, y_test):
    # Evaluate the model on the test set
    try:
        accuracy = model.score(X_test, y_test)
        print("Model Accuracy on Test Set:", round(accuracy*100,3),"%")
    except Exception as e:
        print("Error evaluating model:", e)

In [8]:
def start_model(X_train,y_train):
    print('start_model'.center(30,"_"))
    if os.path.exists("suicide_detection_model.pkl"):    
        # Load the saved model
        try:
            model = joblib.load('suicide_detection_model.pkl')
            return model
        except Exception as e:
            print("Error loading the model:", e)
    else:
        model=train_model_SDM(X_train,y_train)
        return model

In [9]:
def preprocess_text_with_progress(text_data, save_file=None):
    stop_words = set(stopwords.words('english'))
    
    def preprocess_text(text):
        tokens = nltk.word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return ' '.join(filtered_tokens)

    if save_file and os.path.exists(save_file):
        # If file exists, load processed text from file
        processed_text = pd.read_csv(save_file, header=None)[0].tolist()
    else:
        tqdm.pandas()
        processed_text = text_data.progress_apply(preprocess_text)
        
        # Save processed text to a file
        if save_file:
            processed_text.to_csv(save_file, index=False, header=False)

    return processed_text


# Run the detection program

In [10]:
def main():
    load_necessary_lib()
    data=load_dataset()
    # print(data)

    # Preprocess text

    # data['text'] = data['text'].apply(preprocess_text)
    data['text'] = preprocess_text_with_progress(data['text'],save_file='final_cleaned_processed_text.csv')   # ?with loading animation

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.2, random_state=42)
    print("Spliting Done")
    model=start_model(X_train,y_train)
    evaluating_SDM(model,X_test, y_test)
    

    cont=True
    while cont==True:
        # -------------------------->>>>>>>  Accept user input
        try:
            print("**".center(120,"-"),"\n\n")
            user_input = input("Enter your response: ").strip()
            if user_input=="quit":
                cont=False
            elif user_input:
                # Preprocess user input and predict
                preprocessed_input = preprocess_text(user_input)
                prediction_scores = model.predict_proba([preprocessed_input])[0]
                prediction = model.predict([preprocessed_input])[0]

                # Convert prediction back to original labels
                predicted_label = 'suicidal' if prediction == 0 else 'non-suicidal'  # Adjusted this line
                # suicidal_p_scores[1]=prediction_scores[1] * 100
                # non_suicidal_p_scores[0]=predicted_scores[0] * 100
                # Output prediction result and scores
                print("Prediction Score for Suicidal:", "{:.2f}%".format(prediction_scores[0] * 100))
                print("Prediction Score for Non-Suicidal:", "{:.2f}%".format(prediction_scores[1] * 100))
                
                return (prediction_scores,model,X_train, X_test, y_train, y_test)
            else:
                print("Empty input. Please provide a response.")
        except Exception as e:
            print("Error processing user input because ;", e)




In [11]:

try:
    prediction_scores,model,X_train, X_test, y_train, y_test=main()
    # plot_confusion_matrix(model, X_test, y_test)
except KeyboardInterrupt:
    print("\nExiting the program.")
except Exception as e:
    print("An error occurred:", e)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Libraries loaded successfully
File exists
Database loaded ...._
Spliting Done
_________start_model__________
Error evaluating model: np.nan is an invalid document, expected byte or unicode string.
-----------------------------------------------------------**----------------------------------------------------------- 


Prediction Score for Suicidal: 0.00%
Prediction Score for Non-Suicidal: 100.00%


In [14]:
%pip install seaborn
%pip install matplotlib

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Using cached matplotlib-3.8.3-cp312-cp312-win_amd64.whl.metadata (5.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached contourpy-1.2.0-cp312-cp312-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached fonttools-4.50.0-cp312-cp312-win_amd64.whl.metadata (162 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached kiwisolver-1.4.5-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Collecting pillow>=8 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached pillow-10.2.0-cp312-cp312-win_amd64.whl.metadata (9.9 kB)
Collecting pyparsing>=2.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached pyparsing-3.

In [12]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [13]:

# Assuming 'model' contains the trained model and 'X_test' contains the test data
# Make predictions on the test data
def plot_confusion_matrix(model, X_test, y_test):    
    y_pred = model.predict(X_test)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt='g', xticklabels=['Non-Suicidal', 'Suicidal'], yticklabels=['Non-Suicidal', 'Suicidal'])
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()



In [14]:
plot_confusion_matrix(model, X_test, y_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.