# Setup

## Import Packages

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.probability import FreqDist
nltk.download('punkt')
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Mount Drive

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset and the results
# file under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Load Data

In [None]:
from ast import literal_eval

# Read the pre-processed dataset from Drive.
data = pd.read_csv(r"/content/drive/MyDrive/Machine Learning/Data/SDoutput1.csv")

# Each 'clean' cell is read from the CSV as text; literal_eval turns it back
# into the Python literal it spells out (a list of tokens in the preview below).
data['clean'] = data['clean'].apply(literal_eval)

# Swap the text labels for numeric classes: 'suicide' -> 1.0, 'non-suicide' -> 0.0.
for label_text, label_value in (('suicide', 1.0), ('non-suicide', 0.0)):
  data.loc[data["label"] == label_text, "label"] = label_value

# Sanity check on the type of the first converted label.
print(type(data["label"][0]))

data.head()

<class 'float'>


Unnamed: 0.1,Unnamed: 0,clean,label
0,0,"[ex, wife, threaten, suiciderec, left, wife, g...",1.0
1,1,"[weird, get, affect, compliment, come, someon,...",0.0
2,2,"[final, 2020, almost, never, hear, 2020, ha, b...",0.0
3,3,"[need, helpjust, help, cri, hard]",1.0
4,4,"[losthello, name, adam, 16, struggl, year, afr...",1.0


# Functions


## Split data into Training and Test data

In [None]:
def dataSplit(data, train_size=0.6, random_state=None, ngram_range=(1, 1)):
  """Split the data into train/test sets and build bag-of-words features.

  Args:
    data: DataFrame with a 'clean' column and a 'label' column. Each 'clean'
      entry is handed to the vectorizer as an already-tokenised document
      (an iterable of tokens); 'label' values must be castable to int.
    train_size: Share of the rows used for training. Defaults to 0.6.
    random_state: Seed forwarded to train_test_split. None (the default)
      gives a different random split on every call; pass an int to make the
      split reproducible.
    ngram_range: (min_n, max_n) pair forwarded to CountVectorizer.
      Defaults to unigrams only.

  Returns:
    Tuple (x_train_bow, y_train, X_test_bow, y_test): the two count matrices
    produced by CountVectorizer and the matching integer label Series.
  """
  training_data, test_data = train_test_split(
      data, train_size=train_size, random_state=random_state)

  # The documents are already token lists, so the tokenizer is the identity.
  # token_pattern=None avoids the "token_pattern will not be used" warning that
  # comes with a custom tokenizer, and ngram_range is coerced to a tuple
  # because recent scikit-learn versions reject a list for this parameter.
  bow_transform = CountVectorizer(
      tokenizer=lambda doc: doc,
      token_pattern=None,
      ngram_range=tuple(ngram_range),
      lowercase=False)

  # The vocabulary is learned from the training rows only and then reused
  # unchanged for the test rows.
  x_train_bow = bow_transform.fit_transform(training_data['clean'])
  X_test_bow = bow_transform.transform(test_data['clean'])

  y_train = training_data['label'].astype('int')
  y_test = test_data['label'].astype('int')

  return x_train_bow, y_train, X_test_bow, y_test

## Create TF-IDF Training and Test data

In [None]:
def tfidfData(x_train_bow, X_test_bow):
  """Re-weight bag-of-words count matrices with TF-IDF.

  The transformer is fitted on the training matrix only and then applied to
  the test matrix. norm=None is passed, so rows are not length-normalised.

  Returns:
    Tuple (train_tfidf, test_tfidf).
  """
  weigher = TfidfTransformer(norm=None)
  train_weighted = weigher.fit_transform(x_train_bow)
  test_weighted = weigher.transform(X_test_bow)
  return train_weighted, test_weighted

## Logistic Regression Model Function

In [None]:
def logRegFunction(x_train, y_train, x_test, y_test, info, C1=1.0):
  """Fit a logistic regression model and score it on the test set.

  Args:
    x_train, y_train: Training features and labels.
    x_test, y_test: Test features and labels.
    info: Label for this run; becomes the first element of the result.
    C1: Value passed to LogisticRegression as C. Defaults to 1.0, which is
      scikit-learn's own default, so callers that do not tune C can omit it.

  Returns:
    Tuple (info,) followed by the metrics returned by metricsCalc.
  """
  logReg = LogisticRegression(max_iter=5000, C=C1)
  logReg.fit(x_train, y_train)
  # Only hard class predictions are scored; class probabilities are not needed.
  y_predicted = logReg.predict(x_test)
  return (info,) + metricsCalc(y_test, y_predicted)

## Multinomial Naive Bayes Function

In [None]:
def mnNaiveBayesFunction(x_train, y_train, x_test, y_test, info):
  """Fit a multinomial naive Bayes model and score it on the test set.

  Returns:
    Tuple (info,) followed by the metrics returned by metricsCalc.
  """
  model = naive_bayes.MultinomialNB()
  model.fit(x_train, y_train)
  scores = metricsCalc(y_test, model.predict(x_test))
  return (info,) + scores

## Random Forest Function

In [None]:
def randomForest(x_train, y_train, x_test, y_test, estimators, info, random_state=None):
  """Fit a random forest classifier and score it on the test set.

  Args:
    x_train, y_train: Training features and labels.
    x_test, y_test: Test features and labels.
    estimators: Number of trees (passed as n_estimators).
    info: Label for this run; becomes the first element of the result.
    random_state: Seed forwarded to RandomForestClassifier. None (the
      default) leaves the forest unseeded, so results vary between runs;
      pass an int for reproducible results.

  Returns:
    Tuple (info,) followed by the metrics returned by metricsCalc.
  """
  classif = RandomForestClassifier(n_estimators=estimators, random_state=random_state)
  classif.fit(x_train, y_train)
  y_predicted = classif.predict(x_test)
  return (info,) + metricsCalc(y_test, y_predicted)

## Metrics Calculator

In [None]:
def metricsCalc(y_test, y_predicted):
  """Compute binary-classification metrics for class 1 versus class 0.

  Args:
    y_test: Ground-truth labels (1 or 0); must provide .tolist().
    y_predicted: Predicted labels (1 or 0); must provide .tolist().

  Returns:
    Tuple (accuracy, true_positive_rate, true_negative_rate,
    positive_precision, f1). A rate whose denominator is zero (no samples of
    that class, or no positive predictions) is reported as 0.0 instead of
    raising ZeroDivisionError.
  """
  def _ratio(numerator, denominator):
    # Undefined ratios are reported as 0.0 rather than raising.
    return numerator / denominator if denominator else 0.0

  y_true = y_test.tolist()
  y_pred = y_predicted.tolist()

  accuracy = accuracy_score(y_true, y_pred)

  # Predictions grouped by their ground-truth class.
  predicted_for_positive = [p for t, p in zip(y_true, y_pred) if t == 1]
  predicted_for_negative = [p for t, p in zip(y_true, y_pred) if t == 0]

  # Only the counts that feed a returned metric are needed.
  true_positive_count = predicted_for_positive.count(1)
  true_negative_count = predicted_for_negative.count(0)
  false_positive_count = predicted_for_negative.count(1)

  # Per-class rates: share of each ground-truth class that was predicted correctly.
  true_positive_rate = _ratio(true_positive_count, len(predicted_for_positive))
  true_negative_rate = _ratio(true_negative_count, len(predicted_for_negative))

  # Precision is undefined when nothing was predicted positive; report 0.0.
  positive_precision = _ratio(
      true_positive_count, true_positive_count + false_positive_count)

  f1 = f1_score(y_true, y_pred)
  return accuracy, true_positive_rate, true_negative_rate, positive_precision, f1

## Add Data to File Function

In [None]:
def addDataToFile(results, path="/content/drive/MyDrive/Machine Learning/Data/results.csv"):
  """Append one comma-separated row of results to a text file.

  Args:
    results: Iterable of values; each is converted with str() and the pieces
      are joined with commas. Values are not quoted or escaped, so an item
      that itself contains a comma will spill into an extra column.
    path: File to append to. Defaults to the results file on Drive.
  """
  row = ','.join(str(item) for item in results)
  # The context manager closes the file even if the write raises.
  with open(path, "a+") as results_file:
    results_file.write(row + "\n")

# Make Predictions

In [None]:
# Split the data and build bag-of-words features.
x_train_bow, y_train, x_test_bow, y_test = dataSplit(data)

# TF-IDF re-weighting of the same train/test matrices.
x_train_tfidf, x_test_tfidf = tfidfData(x_train_bow, x_test_bow)

# Each model below is fitted, scored, and its result row appended to the results file.

# Logistic regression on bag of words.
# logRegFunction requires a C value; 1.0 is scikit-learn's default.
results_bow_log = logRegFunction(x_train_bow, y_train, x_test_bow, y_test, "Bag of Words Logistic Regression", C1=1.0)
addDataToFile(results_bow_log)

# Logistic regression on TF-IDF.
results_TDIF_log = logRegFunction(x_train_tfidf, y_train, x_test_tfidf, y_test, "TDIDF Logistic Regression", C1=1.0)
addDataToFile(results_TDIF_log)

# Multinomial naive Bayes on bag of words.
results_BOW_mnb = mnNaiveBayesFunction(x_train_bow, y_train, x_test_bow, y_test, "Bag of Words Multinomial Naive Bayes")
addDataToFile(results_BOW_mnb)

# Multinomial naive Bayes on TF-IDF.
results_TDIF_mnb = mnNaiveBayesFunction(x_train_tfidf, y_train, x_test_tfidf, y_test, "TFIDF Multinomial Naive Bayes")
addDataToFile(results_TDIF_mnb)

# Random forest (50 trees) on bag of words.
results_BOW_randForest = randomForest(x_train_bow, y_train, x_test_bow, y_test, 50, "Bag of Words Random Forest")
addDataToFile(results_BOW_randForest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Bag-of-words logistic regression, refitted once per candidate C value;
# each run's result row is appended to the results file.
c_tests = [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]
for c_value in c_tests:
  print(c_value)
  run_label = "Bag of Words Logistic Regression" + str(c_value)
  results_bow_log = logRegFunction(
      x_train_bow, y_train, x_test_bow, y_test, run_label, c_value)
  addDataToFile(results_bow_log)

1e-05
0.001
0.1
1.0
10.0
100.0
