------------------
 ## Suicide Sentiment Analysis Project 
 - Using TF-IDF As Feature Extraction
 - Using Some Classification models As RandomForest, LinearSVC, MultinomialNB
 - Using Some Preprocessing as Lemmatization, Removing Stop Words
 - Finally,  The best Results in this notebook is 91%.
----------------

In [1]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics

## Read Suicide_Detection File

In [2]:
Suicide = pd.read_csv('Suicide_Detection.csv')
data_split = np.array_split(Suicide, 20)
Suicide = data_split[0]
Suicide = Suicide.drop('Unnamed: 0',axis=1)

## Preparing For Stopword removal and lemmatization

In [3]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/saied/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
X = Suicide.drop('class', axis=1)
y = Suicide['class']

# Text Pre Proceessing

In [5]:
# To remove emails
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regexes_to_remove = [email_regex, r'Subject:', r'Re:']

for i in range(0, len(X)):
    # removing all special charachter
    review = re.sub('[^a-zA-Z]', ' ', str(X['text'][i]))
    # make document as lowerCase
    review = review.lower()
    # splitting the documents into words for ex ['iam', 'omar']
    review = review.split()
    # make limmatization --> (change, changing, changes)---> (change)
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    # join the document agian
    review = ' '.join(review)
    
    # removing mails
    for r in regexes_to_remove:
        X['text'][i] = re.sub(r, '', review)


## Splitting Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Feature extraction

In [7]:
tfidf_vectorizer_n1 = TfidfVectorizer(max_features=10000)
tfidf_vectorizer_n2 = TfidfVectorizer(max_features=10000, ngram_range=(2,2))
tfidf_vectorizer_n12 = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf_vectorizer_n3 = TfidfVectorizer(max_features=10000, ngram_range=(3,3))
tfidf_vectorizer_n123 = TfidfVectorizer(max_features=10000, ngram_range=(1,3))

X_tfidf_train_n1 = tfidf_vectorizer_n1.fit_transform(X_train['text'])
X_tfidf_test_n1 = tfidf_vectorizer_n1.transform(X_test['text'])

X_tfidf_train_n2 = tfidf_vectorizer_n2.fit_transform(X_train['text'])
X_tfidf_test_n2 = tfidf_vectorizer_n2.transform(X_test['text'])

X_tfidf_train_n12 = tfidf_vectorizer_n12.fit_transform(X_train['text'])
X_tfidf_test_n12 = tfidf_vectorizer_n12.transform(X_test['text'])

X_tfidf_train_n3 = tfidf_vectorizer_n3.fit_transform(X_train['text'])
X_tfidf_test_n3 = tfidf_vectorizer_n3.transform(X_test['text'])

X_tfidf_train_n123 = tfidf_vectorizer_n123.fit_transform(X_train['text'])
X_tfidf_test_n123 = tfidf_vectorizer_n123.transform(X_test['text'])

X_tfidf_train_list = [X_tfidf_train_n1, X_tfidf_train_n2, X_tfidf_train_n3, X_tfidf_train_n12,X_tfidf_train_n123]
X_tfidf_test_list = [X_tfidf_test_n1, X_tfidf_test_n2, X_tfidf_test_n3, X_tfidf_test_n12,X_tfidf_test_n123]

In [8]:
print(X_tfidf_train_n1.shape)

(8122, 10000)


---------------
- As we see the no. of features very large so we need to make feature selection and feature scaling
------------

## Feature Scaling

In [10]:
scaler = MinMaxScaler()
X_norm_list = []
X_test_norm_list = []

# function to fit data(calculate the min and max) then transform data to it
for X_tfidf_train, X_tfidf_test in zip(X_tfidf_train_list, X_tfidf_test_list):
    X_norm_list.append(X_tfidf_train.toarray())
    X_test_norm_list.append(X_tfidf_test.toarray())


## Feature Selection

In [9]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

X_selection_list = []
X_test_selection_list = []
# We Can select any model but linearSVC has l1 norm penality which deals with sparse
lsvc = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)

for X_norm,X_test_norm in zip(X_norm_list, X_test_norm_list):
    lsvc.fit(X_norm, y_train)

    # This function select the best features that has high weigh
    fs = SelectFromModel(lsvc, prefit=True)
    # This function redeuce X to the selected features
    X_selection_list.append(fs.transform(X_norm))
    X_test_selection_list.append(fs.transform(X_test_norm))


NameError: name 'X_norm_list' is not defined

##  Using LinearSVC

In [None]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
grams = [1,2,3,12,123]
for X_selection,X_test_selection,i in zip(X_selection_list, X_test_selection_list,grams):
    lsvc.fit(X_selection, y_train)
    y_predict_1 = lsvc.predict(X_test_selection)
    print(i,"-gram\n",metrics.classification_report(y_test, y_predict_1, target_names=['Suicide', 'Non-Suicide']))

## Using RandomForest

In [None]:
for X_selection,X_test_selection,i in zip(X_selection_list, X_test_selection_list,grams):
    clf = RandomForestClassifier(max_depth=10)
    clf.fit(X_selection, y_train)
    y_predict_2 = clf.predict(X_test_selection)
    print(i,"-gram\n",metrics.classification_report(y_test, y_predict_2, target_names=['Suicide', 'Non-Suicide']))

## Using Multinomial Naive Bayes

In [14]:
for X_selection,X_test_selection,i in zip(X_selection_list, X_test_selection_list,grams):
    mb = MultinomialNB()
    mb.fit(X_selection, y_train)
    y_predict_3 = mb.predict(X_test_selection)
    print(i,"-gram\n",metrics.classification_report(y_test, y_predict_3, target_names=['Suicide', 'Non-Suicide']))

1 -gram
               precision    recall  f1-score   support

     Suicide       0.94      0.81      0.87      1773
 Non-Suicide       0.83      0.95      0.88      1709

    accuracy                           0.88      3482
   macro avg       0.88      0.88      0.88      3482
weighted avg       0.88      0.88      0.88      3482

2 -gram
               precision    recall  f1-score   support

     Suicide       0.85      0.82      0.84      1773
 Non-Suicide       0.82      0.85      0.84      1709

    accuracy                           0.84      3482
   macro avg       0.84      0.84      0.84      3482
weighted avg       0.84      0.84      0.84      3482

3 -gram
               precision    recall  f1-score   support

     Suicide       0.68      0.92      0.79      1773
 Non-Suicide       0.88      0.56      0.68      1709

    accuracy                           0.74      3482
   macro avg       0.78      0.74      0.73      3482
weighted avg       0.78      0.74      0.73    

## Using Ensamble Learning 

In [15]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression,  SGDClassifier

In [16]:
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma="scale", probability=True)
mb = MultinomialNB()
sgd = SGDClassifier(alpha=.0001, max_iter=50, loss='log',
                                       penalty="elasticnet", n_jobs=-1)

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf), ('mb', mb),('sgd', sgd)],
voting='soft')
#voting_clf.fit(X_selection, y_train)

In [17]:
for X_selection,X_test_selection,i in zip(X_selection_list, X_test_selection_list,grams):
    voting_clf.fit(X_selection, y_train)
    y_predict_4 = voting_clf.predict(X_test_selection)
    print(i,"-gram\n",metrics.classification_report(y_test, y_predict_4, target_names=['Suicide', 'Non-Suicide']))


1 -gram
               precision    recall  f1-score   support

     Suicide       0.91      0.92      0.92      1753
 Non-Suicide       0.92      0.91      0.91      1729

    accuracy                           0.91      3482
   macro avg       0.92      0.91      0.91      3482
weighted avg       0.92      0.91      0.91      3482

2 -gram
               precision    recall  f1-score   support

     Suicide       0.82      0.85      0.84      1753
 Non-Suicide       0.84      0.82      0.83      1729

    accuracy                           0.83      3482
   macro avg       0.83      0.83      0.83      3482
weighted avg       0.83      0.83      0.83      3482

3 -gram
               precision    recall  f1-score   support

     Suicide       0.69      0.92      0.78      1753
 Non-Suicide       0.87      0.58      0.69      1729

    accuracy                           0.75      3482
   macro avg       0.78      0.75      0.74      3482
weighted avg       0.78      0.75      0.74    

## Using Bagging Method

In [17]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)

for X_selection,X_test_selection,i in zip(X_selection_list, X_test_selection_list,grams):
    bag_clf.fit(X_selection, y_train)
    y_pred_5 = bag_clf.predict(X_test_selection)
    print(i,"-gram\n",metrics.classification_report(y_test, y_pred_5, target_names=['Suicide', 'Non-Suicide']))

1 -gram
               precision    recall  f1-score   support

     Suicide       0.82      0.89      0.85      1773
 Non-Suicide       0.88      0.79      0.83      1709

    accuracy                           0.84      3482
   macro avg       0.85      0.84      0.84      3482
weighted avg       0.85      0.84      0.84      3482

2 -gram
               precision    recall  f1-score   support

     Suicide       0.60      0.96      0.74      1773
 Non-Suicide       0.89      0.34      0.49      1709

    accuracy                           0.65      3482
   macro avg       0.75      0.65      0.61      3482
weighted avg       0.74      0.65      0.62      3482



  _warn_prf(average, modifier, msg_start, len(result))


3 -gram
               precision    recall  f1-score   support

     Suicide       0.51      1.00      0.67      1773
 Non-Suicide       0.00      0.00      0.00      1709

    accuracy                           0.51      3482
   macro avg       0.25      0.50      0.34      3482
weighted avg       0.26      0.51      0.34      3482

12 -gram
               precision    recall  f1-score   support

     Suicide       0.82      0.89      0.85      1773
 Non-Suicide       0.88      0.79      0.83      1709

    accuracy                           0.84      3482
   macro avg       0.85      0.84      0.84      3482
weighted avg       0.85      0.84      0.84      3482

123 -gram
               precision    recall  f1-score   support

     Suicide       0.82      0.89      0.85      1773
 Non-Suicide       0.87      0.79      0.83      1709

    accuracy                           0.84      3482
   macro avg       0.84      0.84      0.84      3482
weighted avg       0.84      0.84      0.84 