------------------
 ## Suicide Sentiment Analysis Project 
 - Using TF-IDF As Feature Extraction
 - Using Some Classification models As RandomForest, LinearSVC, MultinomialNB
 - Using Some Preprocessing as Lemmatization, Removing Stop Words
 - Finally,  The best Results in this notebook is 91%.
----------------

In [31]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics

## Read Suicide_Detection File

In [32]:
Suicide = pd.read_csv('Suicide_Detection.csv')
data_split = np.array_split(Suicide, 20)
Suicide = data_split[0]
Suicide = Suicide.drop('Unnamed: 0',axis=1)

## Preparing For Stopword removal and lemmatization

In [33]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/saied/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
X = Suicide.drop('class', axis=1)
y = Suicide['class']

# Text Pre Proceessing

In [35]:
# To remove emails
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regexes_to_remove = [email_regex, r'Subject:', r'Re:']

for i in range(0, len(X)):
    # removing all special charachter
    review = re.sub('[^a-zA-Z]', ' ', str(X['text'][i]))
    # make document as lowerCase
    review = review.lower()
    # splitting the documents into words for ex ['iam', 'omar']
    review = review.split()
    # make limmatization --> (change, changing, changes)---> (change)
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    # join the document agian
    review = ' '.join(review)
    
    # removing mails
    for r in regexes_to_remove:
        X['text'][i] = re.sub(r, '', review)


<a id='another_cell'></a>
### -ttttttttt

[TF-IDF ](#another_cell)

## Splitting Data

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Feature extraction

- ### TF-IDF

In [37]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer_n12 = TfidfVectorizer(max_features=10000, ngram_range=(1,2))


X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['text'])
X_tfidf_test = tfidf_vectorizer.transform(X_test['text'])

X_tfidf_train_n12= tfidf_vectorizer_n12.fit_transform(X_train['text'])
X_tfidf_test_n12=tfidf_vectorizer_n12.fit_transform(X_test['text'])

In [38]:
print(X_tfidf_train.shape)

(8122, 10000)


 - ### Bag of wards 

In [39]:
vectorizer = CountVectorizer()
  
X_bow_train = vectorizer.fit_transform(X_train['text'])
X_bow_test = vectorizer.transform(X_test['text'])


## Feature Scaling

---------------
- As we see the no. of features very large so we need to make feature selection and feature scaling
------------

## Feature Selection

In [40]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# We Can select any model but linearSVC has l1 norm penality which deals with sparse
lsvc = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_tfidf_train, y_train)

# This function select the best features that has high weigh
fs = SelectFromModel(lsvc, prefit=True)
# This function redeuce X to the selected features
X_selection = fs.transform(X_tfidf_train)
X_test_selection = fs.transform(X_tfidf_test)


lsvc.fit(X_tfidf_train_n12, y_train)
fs_n12 = SelectFromModel(lsvc, prefit=True)
X_selection_n12 = fs_n12.transform(X_tfidf_train_n12)
X_test_selection_n12 = fs_n12.transform(X_tfidf_test_n12)

lsvc.fit(X_bow_train, y_train)
fs_n12 = SelectFromModel(lsvc, prefit=True)
X_selection_bow = fs_n12.transform(X_bow_train)
X_test_selection_bow = fs_n12.transform(X_bow_test)




##  Using LinearSVC

In [46]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_selection, y_train)
y_predict = lsvc.predict(X_test_selection)

lsvc_n12 = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc_n12.fit(X_selection_n12,y_train)
y_predict_n12 = lsvc_n12.predict(X_test_selection_n12)

lsvc_bow = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc_bow.fit(X_selection_bow,y_train)
y_predict_bow = lsvc_bow.predict(X_test_selection_bow)

# print(metrics.classification_report(y_test, y_predict_1, target_names=['Suicide', 'Non-Suicide']))
linear_svm_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
linear_svm_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12)
linear_svm_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow)



In [47]:
 print(metrics.classification_report(y_test, y_predict, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.88      0.86      0.87      1737
 Non-Suicide       0.86      0.88      0.87      1745

    accuracy                           0.87      3482
   macro avg       0.87      0.87      0.87      3482
weighted avg       0.87      0.87      0.87      3482



In [48]:
linear_svm_tfidf_n12_results

(array([0.48687012, 0.4503012 ]),
 array([0.78986759, 0.1713467 ]),
 array([0.60241493, 0.24823578]),
 array([1737, 1745]))

In [49]:
linear_svm_bow_results

(array([0.85931345, 0.87683284]),
 array([0.8791019 , 0.85673352]),
 array([0.86909505, 0.86666667]),
 array([1737, 1745]))

## Using RandomForest

In [60]:
clf = RandomForestClassifier(max_depth=10)
clf.fit(X_selection, y_train)
y_predict_2 = clf.predict(X_test_selection)

clf.fit(X_selection_n12, y_train)
y_predict_n12_2 = clf.predict(X_test_selection_n12)

clf.fit(X_selection_bow, y_train)
y_predict_bow_2 = clf.predict(X_test_selection_bow)

In [61]:
RandomForest_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
RandomForest_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12_2)
RandomForest_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow_2)
print(metrics.classification_report(y_test, y_predict_2, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.81      0.90      0.86      1737
 Non-Suicide       0.89      0.80      0.84      1745

    accuracy                           0.85      3482
   macro avg       0.85      0.85      0.85      3482
weighted avg       0.85      0.85      0.85      3482



In [62]:
RandomForest_tfidf_n12_results

(array([0.49913793, 1.        ]),
 array([1.        , 0.00114613]),
 array([0.66589994, 0.00228964]),
 array([1737, 1745]))

In [63]:
RandomForest_bow_results

(array([0.81042412, 0.90098361]),
 array([0.91306851, 0.78739255]),
 array([0.85868977, 0.84036697]),
 array([1737, 1745]))

## Using Multinomial Naive Bayes

In [68]:
mb = MultinomialNB()
mb.fit(X_selection, y_train)
y_predict_3 = mb.predict(X_test_selection)

mb.fit(X_selection_n12, y_train)
y_predict_n12_3 = mb.predict(X_test_selection_n12)

mb.fit(X_selection_bow, y_train)
y_predict_bow_3 = mb.predict(X_test_selection_bow)

print(metrics.classification_report(y_test, y_predict_3, target_names=['Suicide', 'Non-Suicide']))

mb_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict)
mb_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12_3)
mb_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow_3)

              precision    recall  f1-score   support

     Suicide       0.94      0.81      0.87      1737
 Non-Suicide       0.83      0.95      0.89      1745

    accuracy                           0.88      3482
   macro avg       0.89      0.88      0.88      3482
weighted avg       0.89      0.88      0.88      3482



In [69]:
mb_tfidf_n12_results

(array([0.46600567, 0.36018237]),
 array([0.75762809, 0.13581662]),
 array([0.57706643, 0.19725343]),
 array([1737, 1745]))

In [70]:
mb_bow_results

(array([0.94570447, 0.82190429]),
 array([0.79217041, 0.95472779]),
 array([0.86215539, 0.88335101]),
 array([1737, 1745]))

## Using Ensamble Learning 

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression,  SGDClassifier
from sklearn.svm import SVC

In [72]:
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma="scale", probability=True)
mb = MultinomialNB()
sgd = SGDClassifier(alpha=.0001, max_iter=50, loss='log',
                                       penalty="elasticnet", n_jobs=-1)

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf), ('mb', mb),('sgd', sgd)],
voting='soft')



In [73]:
voting_clf.fit(X_selection, y_train)
y_predict_4 = voting_clf.predict(X_test_selection)

voting_clf.fit(X_selection_n12, y_train)
y_predict_n12_4 = voting_clf.predict(X_test_selection_n12)

voting_clf.fit(X_selection_bow, y_train)
y_predict_bow_4 = voting_clf.predict(X_test_selection_bow)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [77]:
print(metrics.classification_report(y_test, y_predict_4, target_names=['Suicide', 'Non-Suicide']))
mb_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict_4)
mb_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_n12_4)
mb_tfidf_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow_4)

              precision    recall  f1-score   support

     Suicide       0.91      0.92      0.92      1737
 Non-Suicide       0.92      0.91      0.92      1745

    accuracy                           0.92      3482
   macro avg       0.92      0.92      0.92      3482
weighted avg       0.92      0.92      0.92      3482



In [78]:
mb_tfidf_n12_results

(array([0.49971132, 0.66666667]),
 array([0.99654577, 0.00687679]),
 array([0.66564122, 0.01361316]),
 array([1737, 1745]))

In [79]:
mb_tfidf_bow_results

(array([0.88784009, 0.91790601]),
 array([0.92055268, 0.88424069]),
 array([0.90390051, 0.9007589 ]),
 array([1737, 1745]))

## Using Bagging Method

In [80]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_selection, y_train)
y_pred_5 = bag_clf.predict(X_test_selection)

bag_clf.fit(X_selection_n12, y_train)
y_pred_n12_5 = bag_clf.predict(X_test_selection_n12)

bag_clf.fit(X_selection_bow, y_train)
y_pred_bow_5 = bag_clf.predict(X_test_selection_bow)
print(metrics.classification_report(y_test, y_pred_5, target_names=['Suicide', 'Non-Suicide']))

              precision    recall  f1-score   support

     Suicide       0.81      0.88      0.85      1737
 Non-Suicide       0.87      0.79      0.83      1745

    accuracy                           0.84      3482
   macro avg       0.84      0.84      0.84      3482
weighted avg       0.84      0.84      0.84      3482



In [81]:
bag_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_pred_5)
bag_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_pred_n12_5)
bag_tfidf_bow_results=metrics.precision_recall_fscore_support(y_test, y_pred_bow_5)

In [82]:
bag_tfidf_n12_results

(array([0.50043215, 1.        ]),
 array([1.        , 0.00630372]),
 array([0.66705069, 0.01252847]),
 array([1737, 1745]))

In [83]:
bag_tfidf_bow_results

(array([0.81001076, 0.85714286]),
 array([0.86643638, 0.79770774]),
 array([0.83727399, 0.82635797]),
 array([1737, 1745]))