# Load The Dataset

In [None]:
import pandas as pd

# Read the raw CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

# Data Cleaning

In [None]:
# Remove the unnamed column (presumably an index saved with the CSV — confirm).
# errors='ignore' keeps this cell re-runnable: running it a second time, or
# loading a CSV without that column, no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Remove every row that has a null value in any column.
df = df.dropna()

# Natural Language Processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Build the stopword set once, under its own name. Rebinding `stopwords`
# replaced the imported corpus reader with a set, so running this cell a
# second time failed with AttributeError ('set' object has no attribute 'words').
# Requires the NLTK 'stopwords' corpus and the tokenizer models used by
# word_tokenize to be downloaded already.
stop_words = set(stopwords.words('english'))

# Compiled once: any character that is neither a word character nor whitespace.
_PUNCTUATION = re.compile(r'[^\w\s]')


def clean_data(text):
    """Normalize one document for vectorization.

    Steps, in order: trim surrounding whitespace and lower-case, delete
    punctuation, tokenize with NLTK, drop English stopwords, and re-join the
    remaining tokens with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a single space-separated string (empty if
        every token was removed).
    """
    # lower-case and remove extra spaces
    text = text.strip().lower()

    # remove punctuation
    # NOTE(review): this runs before the stopword check, so a stopword that
    # contains an apostrophe can no longer match its list entry — confirm
    # that this is intended.
    text = _PUNCTUATION.sub('', text)

    # remove stopwords
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

# Clean every document in place, one row at a time.
df["text"] = df["text"].map(clean_data)

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each class label.
class_counts = df["class"].value_counts()
ax = class_counts.plot.bar()

ax.set_xlabel("Column class")
ax.set_ylabel("Number of rows")
plt.show()

In [None]:
from wordcloud import WordCloud

# Concatenate every cleaned document labelled "suicide" into one string.
is_suicide = df["class"] == "suicide"
text = " ".join(df.loc[is_suicide, "text"])

# Build the word cloud on an 800x400 white canvas.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Show the cloud as an image with a title and no axis ticks.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.title("Commonly Used Words in Suicidal Texts")
plt.axis('off')
plt.show()

In [None]:
# Concatenate every cleaned document labelled "non-suicide" into one string.
is_non_suicide = df["class"] == "non-suicide"
text = " ".join(df.loc[is_non_suicide, "text"])

# Build the word cloud on an 800x400 white canvas.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Show the cloud as an image with a title and no axis ticks.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.title("Commonly Used Words in Non-Suicidal Texts")
plt.axis('off')
plt.show()

# Feature Engineering

In [None]:
# Features are the cleaned texts; the target is the class label column.
X, y = df["text"], df["class"]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from y, then replace y with the encoded
# integers. The fitted encoder is kept in `le` for later use.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train The Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyper-parameters on the
# training split; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split and report the share that match the
# true labels, as a percentage with two decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Save Model, Vectorizer and Label Encoder

In [None]:
import pickle

# Persist each fitted object to its own pickle file, in this order: the
# classifier, the TF-IDF vectorizer, and the label encoder.
artifacts = (
    ("model.pkl", model),
    ("vectorizer.pkl", vectorizer),
    ("label_encoder.pkl", le),
)

for filename, artifact in artifacts:
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)