In [5]:
# # %pip install python-csv
# !pip install nltk
# !pip install scikit-learn
# !pip install pandas

In [6]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.pipeline import make_pipeline
import joblib
import os
import sys
from tqdm import tqdm
import time
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def load_necessary_lib():
    """
    Download the NLTK resources used by the text preprocessing helpers.

    Fetches the English stop-word list and the Punkt tokenizer data that
    ``stopwords.words('english')`` and ``nltk.word_tokenize`` rely on.
    ``punkt_tab`` is requested in addition to ``punkt`` because recent NLTK
    releases load the tokenizer from that package instead.

    Prints a success message only when every download reported success;
    otherwise prints the names of the packages that could not be fetched.
    """
    required_packages = ('stopwords', 'punkt', 'punkt_tab')

    # nltk.download() reports failure through a falsy return value rather
    # than by raising, so the result has to be checked explicitly.
    failed = [name for name in required_packages if not nltk.download(name)]

    if failed:
        print("Failed to download NLTK data:", ", ".join(failed))
    else:
        print("Libraries loaded successfully")

In [39]:
def load_dataset_download(dataset_path="suicide_dataset.csv"):
    """
    Load the dataset from a local CSV file, downloading it first if it is missing.

    If ``dataset_path`` does not exist, the "Ram07/Detection-for-Suicide"
    dataset is fetched with ``load_dataset`` and its ``train`` split is written
    to ``dataset_path``. The CSV is then read, passed through ``clean_na`` and
    ``clean_dataset``, and the result is returned.

    Parameters
    ----------
    dataset_path : str
        Location of the local CSV cache.

    Returns
    -------
    The value returned by ``clean_dataset`` for the loaded data.

    Raises
    ------
    SystemExit
        With a non-zero status if the download or the CSV load fails.
    """
    # Step 1: make sure the CSV exists locally.
    if os.path.exists(dataset_path):
        print("File exists")
    else:
        try:
            dataset = load_dataset("Ram07/Detection-for-Suicide")
            pd.DataFrame(dataset['train']).to_csv(dataset_path, index=False)
        except Exception as e:
            print("Error loading dataset:", e)
            # Stop here: without the file the read below can only fail again.
            sys.exit(1)

    # Step 2: read and clean. This is ordinary control flow, not a ``finally``
    # clause, so it never runs after a failed download.
    try:
        data = pd.read_csv(dataset_path)
        print(f"Database {dataset_path} loaded ...._")
        data = clean_na(data)
        return clean_dataset(data)
    except Exception as e:
        print("Error loading dataset:", e)
        sys.exit(1)

In [37]:
# 

Unnamed: 0,class,text,cleaned_text
0,0,Ex Wife Threatening SuicideRecently I left my ...,sex wife threaten suicide recently leave wife ...
1,1,Am I weird I don't get affected by compliments...,weird not affect compliment come know real lif...
2,1,Finally 2020 is almost over... So I can never ...,finally hear bad year swear fucking god annoying
3,0,i need helpjust help me im crying so hard,need help just help cry hard
4,0,It ends tonight.I can’t do it anymore. \nI quit.,end tonight not anymore quit


In [41]:
def clean_dataset(dataset):
    """
    Return a copy of ``dataset`` with the ``class`` column encoded as integers.

    String labels are mapped as ``"suicide" -> 0`` and any other string -> 1.
    Values that are not strings (for example labels already encoded as 0/1 by a
    previous call) are left untouched, so applying the function twice gives the
    same result as applying it once.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``class`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    def binary_convert(label):
        # Only raw string labels need converting. Comparing an already-encoded
        # 0 with "suicide" would be False and silently turn it into 1.
        if isinstance(label, str):
            return 0 if label == "suicide" else 1
        return label

    # assign() builds a new frame instead of writing into the caller's object.
    return dataset.assign(**{'class': dataset['class'].apply(binary_convert)})

# load_dataset_download().head()

File exists
Database suicide_dataset.csv loaded ...._
cleaning NA


Unnamed: 0,class,text,cleaned_text
0,0,Ex Wife Threatening SuicideRecently I left my ...,sex wife threaten suicide recently leave wife ...
1,1,Am I weird I don't get affected by compliments...,weird not affect compliment come know real lif...
2,1,Finally 2020 is almost over... So I can never ...,finally hear bad year swear fucking god annoying
3,0,i need helpjust help me im crying so hard,need help just help cry hard
4,0,It ends tonight.I can’t do it anymore. \nI quit.,end tonight not anymore quit


In [47]:
def preprocess_text(text):
    """Normalise a single piece of text for the classifier.

    The text is lower-cased, tokenised with ``nltk.word_tokenize``, and tokens
    that are not alphanumeric or that are English stop words are dropped. The
    remaining tokens are joined with single spaces.

    Missing values (``None`` / NaN) yield an empty string, matching the handling
    applied to the training corpus in ``preprocess_text_with_progress``.

    >>> preprocess_text("took rest sleeping pills painkillers i want to end struggle of past 6 years")
    'took rest sleeping pills painkillers want end struggle past 6 years'

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The filtered, space-joined tokens.
    """
    # Guard first so missing input never reaches ``.lower()``.
    if pd.isna(text):
        return ''

    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())  # Tokenization and convert to lowercase
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

In [10]:
def train_model_SDM(X_train, y_train):
    """
    Fit the text classification pipeline, persist it, and return it.

    The pipeline is a default ``TfidfVectorizer`` followed by a linear-kernel
    ``SVC`` with verbose output and probability estimates enabled. After
    fitting, the pipeline is handed to ``save_model``.

    Parameters
    ----------
    X_train : training documents passed to ``fit``.
    y_train : training labels passed to ``fit``.

    Returns
    -------
    The fitted pipeline.
    """
    # probability=True is what makes predict_proba available on the result.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SVC(kernel='linear', verbose=True, probability=True),
    )
    pipeline.fit(X_train, y_train)
    print("Model trained successfully.")

    save_model(pipeline)
    return pipeline

In [11]:
def save_model(model, model_path='suicide_detection_model.pkl'):
    """
    Persist a trained model to disk with ``joblib.dump``.

    Saving is best-effort: a failure is reported on stdout and does not raise,
    so a training run is not lost because the file could not be written.

    Parameters
    ----------
    model : object
        The model to serialise.
    model_path : str
        Destination file. Defaults to ``'suicide_detection_model.pkl'``.

    Returns
    -------
    bool
        ``True`` if the model was written, ``False`` if saving failed.
    """
    try:
        joblib.dump(model, model_path)
    except Exception as e:
        print("Error saving model:", e)
        return False

    print("Model saved successfully.")
    return True

In [23]:
def clean_na(data):
    """
    Drop every row that contains at least one missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the complete rows.
    """
    # axis=0 / how="any" are the dropna defaults, spelled out for the reader.
    complete_rows = data.dropna(axis=0, how="any")
    print("cleaning NA")
    return complete_rows

In [13]:
def start_model(X_train, y_train, train_model=False):
    """
    Return a usable model, loading a saved one when allowed and possible.

    If ``train_model`` is false and ``suicide_detection_model.pkl`` exists, the
    saved model is loaded with ``joblib.load``. In every other case, including
    when loading the saved file fails, a new model is trained with
    ``train_model_SDM``.

    Parameters
    ----------
    X_train, y_train :
        Training data, used only when a model has to be trained.
    train_model : bool
        Force retraining even if a saved model is present.

    Returns
    -------
    The loaded or newly trained model.
    """
    print('start_model'.center(120, "-"))
    model_path = 'suicide_detection_model.pkl'

    if not train_model and os.path.exists(model_path):
        # NOTE: joblib.load unpickles the file; only load model files you trust.
        try:
            return joblib.load(model_path)
        except Exception as e:
            print("Error loading the model:", e)
            # Fall through and retrain, so the caller never receives None.
            print("Falling back to training a new model.")

    return train_model_SDM(X_train, y_train)

In [14]:
def preprocess_text_with_progress(data, text_column='text', label_column='class', save_file=None):
    """
    Preprocess a whole text column while showing a tqdm progress bar.

    Each entry of ``data[text_column]`` is lower-cased, tokenised with
    ``nltk.word_tokenize`` and stripped of non-alphanumeric tokens and English
    stop words. Missing entries become empty strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    text_column : str
        Name of the column holding the raw text.
    label_column : str
        Name of the column holding the labels.
    save_file : str or None
        If truthy, the result is also written to this path as CSV (no index).

    Returns
    -------
    pandas.DataFrame
        The processed text column and the label column, side by side.
    """
    # Built once here so the per-row helper does not reload the list each call.
    english_stop_words = set(stopwords.words('english'))

    def _normalise(raw_text):
        # Missing values are turned into empty documents.
        if pd.isna(raw_text):
            return ''
        kept_tokens = []
        for token in nltk.word_tokenize(raw_text.lower()):
            if token.isalnum() and token not in english_stop_words:
                kept_tokens.append(token)
        return ' '.join(kept_tokens)

    # Registers progress_apply on pandas objects.
    tqdm.pandas(desc="Preprocessing Text")
    text_series = data[text_column].progress_apply(_normalise)

    # Put the cleaned text next to its labels.
    labelled_text = pd.concat([text_series, data[label_column]], axis=1)

    if save_file:
        labelled_text.to_csv(save_file, index=False)

    return labelled_text

In [15]:
def evaluating_SDM(model, X_test, y_test):
    """
    Score the model on the test set and print the accuracy as a percentage.

    Samples whose label is missing are excluded before scoring.

    Parameters
    ----------
    model : object
        Anything exposing ``score(X, y)``.
    X_test : pandas.Series
        Test documents, aligned with ``y_test``.
    y_test : pandas.Series
        Test labels; entries may be NaN.

    Returns
    -------
    float or None
        The value returned by ``model.score``, or ``None`` if scoring raised
        (the error is printed rather than propagated).
    """
    # Remove samples with NaN labels, using one mask for both series so they
    # stay aligned.
    has_label = ~y_test.isna()
    X_test = X_test[has_label]
    y_test = y_test[has_label]

    # The earlier simulated progress bar slept 0.1 s per sample without doing
    # any work; scoring is a single call, so it is simply made directly.
    try:
        accuracy = model.score(X_test, y_test)
    except Exception as e:
        print("Error evaluating model:", e)
        return None

    print("\nModel Accuracy on Test Set:", round(accuracy * 100, 3), "%")
    return accuracy

In [16]:
def main():
    """
    Run the full pipeline and then classify user-entered text interactively.

    Steps: download NLTK data, load and clean the dataset, preprocess the text
    column (also saved to ``final_cleaned_processed_text.csv``), split into
    train/test sets, train the model, then prompt for input until the user
    types ``quit`` or the input stream ends.

    For each non-empty response the predicted label and the class
    probabilities are printed.
    """
    load_necessary_lib()
    data = load_dataset_download("suicide_dataset.csv")
    print("------------>>>>> Data loaded <<<<<-----------")

    # Preprocess text
    data_processed = preprocess_text_with_progress(data, text_column='text', label_column='class', save_file='final_cleaned_processed_text.csv')

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data_processed['text'], data_processed['class'], test_size=0.2, random_state=42)
    print("Data split done.")

    # Train or load the model
    model = start_model(X_train, y_train, train_model=True)

    # Evaluate the model
    # evaluating_SDM(model, X_test, y_test)

    # predict_proba columns follow model.classes_, so look the suicidal column
    # up instead of assuming it is index 1. The label is the raw string
    # "suicide" when labels were left as text, or 0 after clean_dataset's
    # encoding (suicide -> 0, everything else -> 1).
    class_labels = model.classes_.tolist()
    suicidal_label = "suicide" if "suicide" in class_labels else 0
    suicidal_idx = class_labels.index(suicidal_label)
    # Binary problem: the remaining column is the non-suicidal class.
    non_suicidal_idx = 1 - suicidal_idx

    print("Running the user_response program".center(120,"_"),"\n")
    while True:
        # Accept user input
        try:
            user_input = input("Enter your response: ").strip()
        except EOFError:
            # Input stream closed: retrying would fail forever, so stop.
            print("\nNo more input. Exiting.")
            break

        if user_input == "quit":
            break
        if not user_input:
            print("Empty input. Please provide a response.")
            continue

        try:
            # Preprocess user input
            preprocessed_input = preprocess_text(user_input)

            # Predict
            prediction = model.predict([preprocessed_input])
            print("Prediction Data:", prediction)

            # Output prediction scores, both with the same precision
            prediction_scores = model.predict_proba([preprocessed_input])[0]
            print("Prediction Score for Suicidal:", "{:.3f}%".format(prediction_scores[suicidal_idx] * 100))
            print("Prediction Score for Non-Suicidal:", "{:.3f}%".format(prediction_scores[non_suicidal_idx] * 100))

            # Output prediction result as a readable name regardless of encoding
            predicted_name = "suicide" if prediction[0] == suicidal_label else "non-suicide"
            print("Prediction:\t\t>>", predicted_name,"<<\n")
        except Exception as e:
            print("Error processing user input:", e)

In [17]:
# Notebook entry point: run the pipeline and report how it ended.
try:
    main()
except KeyboardInterrupt:
    # Raised when the user interrupts the run (e.g. while waiting at the prompt).
    print("\nExiting the program.")
except Exception as e:
    print("An error occurred:", e)
    sys.exit()  # Exit after reporting the error. NOTE(review): called without a status argument, so no failure code is passed.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Libraries loaded successfully
File exists


Database suicide_dataset.csv loaded ...._
------------>>>>> Data loaded <<<<<-----------


Preprocessing Text: 100%|██████████| 174436/174436 [00:37<00:00, 4618.89it/s]


Data split done.
------------------------------------------------------start_model-------------------------------------------------------
___________________________________________Running the user_response program____________________________________________ 

Prediction Data: ['non-suicide']
Prediction Score for Suicidal: 0.074%
Prediction Score for Non-Suicidal: 99.93%
Prediction:		>> non-suicide <<
Prediction Data: ['non-suicide']
Prediction Score for Suicidal: 0.048%
Prediction Score for Non-Suicidal: 99.95%
Prediction:		>> non-suicide <<
Prediction Data: ['suicide']
Prediction Score for Suicidal: 99.999%
Prediction Score for Non-Suicidal: 0.00%
Prediction:		>> suicide <<


In [18]:
# from matplotlib import pyplot as plt
# from sklearn.metrics import confusion_matrix , classification_report
# import pandas as pd




# def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
#     """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
#     Arguments
#     ---------
#     confusion_matrix: numpy.ndarray
#         The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
#         Similarly constructed ndarrays can also be used.
#     class_names: list
#         An ordered list of class names, in the order they index the given confusion matrix.
#     figsize: tuple
#         A 2-long tuple, the first value determining the horizontal size of the outputted figure,
#         the second determining the vertical size. Defaults to (10,7).
#     fontsize: int
#         Font size for axes labels. Defaults to 14.
        
#     Returns
#     -------
#     matplotlib.figure.Figure
#         The resulting confusion matrix figure
#     """
#     df_cm = pd.DataFrame(
#         confusion_matrix, index=class_names, columns=class_names, 
#     )
#     fig = plt.figure(figsize=figsize)
#     try:
#         heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
#     except ValueError:
#         raise ValueError("Confusion matrix values must be integers.")
#     heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
#     heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
#     plt.ylabel('Truth')
#     plt.xlabel('Prediction')
# truth =      ["Dog","Not a dog","Dog","Dog",      "Dog", "Not a dog", "Not a dog", "Dog",       "Dog", "Not a dog"]
# prediction = ["Dog","Dog",      "Dog","Not a dog","Dog", "Not a dog", "Dog",       "Not a dog", "Dog", "Dog"]
# cm = confusion_matrix(truth,prediction)
# print_confusion_matrix(cm,["Dog","Not a dog"])

------

In [19]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix

# # Load the dataset
# data = pd.read_csv('final_cleaned_processed_text.csv')

# # Drop rows with NaN values
# data.dropna(inplace=True)

# # Convert class labels to numeric values
# data['class'] = data['class'].map({'suicide': 1, 'non-suicide': 0})

# # Extract true labels
# true_labels = data['class']

# # Example predicted labels (replace this with the predicted labels from your model)
# # Here, I'm just assuming all predictions are non-suicidal (0) for demonstration purposes
# predicted_labels = np.zeros(len(data))

# print("True Labels:", true_labels)
# print("Predicted Labels:", predicted_labels)

# # Create confusion matrix
# cm = confusion_matrix(true_labels, predicted_labels)

# # Plot confusion matrix (same as before)
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# classes = ['Non-Suicidal', 'Suicidal']
# tick_marks = np.arange(len(classes))
# plt.xticks(tick_marks, classes, rotation=45)
# plt.yticks(tick_marks, classes)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.show()


In [20]:
# import matplotlib.pyplot as plt

# # Example accuracy scores (replace this with your actual accuracy scores)
# accuracy_scores = [0.75, 0.80, 0.85, 0.90, 0.92]

# # Generate x-axis values (epochs or iterations)
# epochs = range(1, len(accuracy_scores) + 1)

# # Plot the accuracy graph
# plt.figure(figsize=(8, 6))
# plt.plot(epochs, accuracy_scores, marker='o', linestyle='-')
# plt.title('Model Accuracy Over Time')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.xticks(epochs)
# plt.grid(True)
# plt.show()
