<a href="https://colab.research.google.com/github/sanduerhan/Licenta/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Import

In [28]:
#importing the dataset
import pandas as pd
col_list = ["Column2", "Column3"]
dataset = pd.read_excel("Training.data.xlsx", usecols=col_list)
testset = pd.read_excel("Dev.xlsx", usecols=col_list)
train_x = dataset["Column2"]
train_y = dataset["Column3"]
test_x = testset["Column2"]
test_y = testset["Column3"]

# PreProcessing

In [29]:
# cleaning the text : deleting the emojis, the existing emails, the punctuation, the present digits,
# the hyperlinks and the stopwords (a,the,is, etc)
import nltk
import openpyxl
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re, string
from string import punctuation, digits
import nltk
nltk.download('stopwords')
def delete_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", re.UNICODE)
    return emoji_pattern.sub(r'', text)  # no emoji


def delete_digits(text):
    text = text.lower()
    clean = text.translate(str.maketrans('', '', digits))
    return clean


def delete_punctuation(text):
    clean = text.translate(str.maketrans('', '', punctuation + '’“”'))
    return clean


stop = set(stopwords.words('english'))
punct = list(string.punctuation)
stop.update(punct)


def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)


def remove_hyperlinks(text):
    for word in text:
        if re.match(r'^http', word):
            text.remove(word)
    return " ".join(text)


def remove_emails(text):
    text = text.split()
    for i in text:
        if '@' in i.strip().lower():
            text.remove(i)
    return " ".join(text)


def denoise_text(text):
    text = remove_emails(text)
    text = delete_punctuation(text)
    text = delete_emoji(text)
    text = delete_digits(text)
    text = remove_stopwords(text)
    text = text.split()
    text = remove_hyperlinks(text)
    return text


train_x = train_x.apply(denoise_text)
test_x = test_x.apply(denoise_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
waiting mind breakdown new year feeling isnt anymore dont know anyone else im little bit worried ill go back depressed days time something last year tried breakdowns start mere days later broke crying wasnt entire year december ok month wait weird way act feel feels bit normal


# Lemmatization

In [34]:
# lemmatization : crying -> cry, days -> day
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
def get_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemm(text):
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    sar_list_lemmatizer = [lemmatizer.lemmatize(word, get_pos(word)) for word in text]
    return " ".join(sar_list_lemmatizer)

train_x = train_x.apply(lemm)
test_x = test_x.apply(lemm)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Count Vectorizer

In [35]:
# CountVectorizer reprezentation for the user tweets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
vectorizer = CountVectorizer()
x_train_cv = vectorizer.fit_transform(train_x)

x_test_cv = vectorizer.transform(test_x)


# Data imbalance handling

In [36]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

sm = SMOTE(random_state = 42)

res_x, res_y = sm.fit_resample(x_train_cv, train_y)
# print("After OverSampling, counts of label '1': {}".format(sum(res_y == "severe")))

In [37]:
os = RandomOverSampler()
res_x2, res_y2 = os.fit_resample(x_train_cv, train_y)

# Tf-Idf 

In [46]:
# Tf-idf reprezentation for the tweets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.5)
train_x_tf = vectorizer.fit_transform(train_x)
test_x_tf = vectorizer.transform(test_x)

res_tfx, res_tfy = sm.fit_resample(train_x_tf, train_y)

# Naive Bayes

In [None]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(res_x, res_y)
pred_y = naive_bayes_classifier.predict(x_test_cv)

score1 = metrics.accuracy_score(test_y, pred_y)
print("Accuracy " + str(score1))
print(metrics.classification_report(test_y, pred_y))

In [51]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(res_tfx, res_tfy)
pred_y = naive_bayes_classifier.predict(test_x_tf)
score1 = metrics.accuracy_score(test_y, pred_y)
print("Accuracy " + str(score1))
print(metrics.classification_report(test_y, pred_y))

Accuracy 0.48398576512455516
                precision    recall  f1-score   support

      moderate       0.57      0.69      0.62      2306
not depression       0.71      0.22      0.33      1830
        severe       0.16      0.51      0.25       360

      accuracy                           0.48      4496
     macro avg       0.48      0.47      0.40      4496
  weighted avg       0.59      0.48      0.48      4496



# AdaBoost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# res_x2, res_y2 = sm.fit_resample(train_x_tf, train_y)
ada = AdaBoostClassifier()

boost = ada.fit(train_x_tf, train_y)
y_pred = boost.predict(test_x_tf)
print("AdaBoost Classifier Model Accuracy:", accuracy_score(test_y, y_pred))
print(metrics.classification_report(test_y, y_pred))

# Regression

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats=3)
model = model.fit(res_tfx,res_tfy)
y_pred = model.predict(test_x_tf)
print("Regression Classifier Model Accuracy:", accuracy_score(test_y, y_pred))
print(metrics.classification_report(test_y, y_pred))

# Word2Vec

In [70]:
# Word2Vec reprezentation
import gensim 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import tensorflow
from tensorflow import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')
corpus_text = 'n'.join(train_x)
# corpus_text = 'n'.join(res_x)
data = []
# iterate through each sentence in the file
for i in sent_tokenize(corpus_text):
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)

model1 = gensim.models.Word2Vec(min_count = 5,size = 300, window = 5, alpha = 0.025)
model1.build_vocab(data)
print(len(model1.wv.vocab))
model1.train(data, total_examples = model1.corpus_count, epochs=30)

print(model1.wv.most_similar(positive=["school"]))

vocab = list(model1.wv.vocab.keys())

word2vec_dict ={}
for word in vocab:
  word2vec_dict[word] = model1.wv.get_vector(word)

maxi=-1
for i,rev in enumerate(train_x):
  tokens=rev.split()
  if(len(tokens)>maxi):
    maxi=len(tokens)
# print(maxi)



tok = Tokenizer()
tok.fit_on_texts(train_x)
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(train_x)

max_len = 1464
embed_dim = 300
pad_rev = pad_sequences(encd_rev, maxlen=max_len, padding='post')

# print(pad_rev.shape)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
5998
[('still', 0.9996699094772339), ('apparently', 0.9996106624603271), ('come', 0.9996073842048645), ('sound', 0.9995858669281006), ('brother', 0.9995836615562439), ('problem', 0.9995808601379395), ('also', 0.9995766878128052), ('everybody', 0.9995734691619873), ('cause', 0.999566912651062), ('stress', 0.9995615482330322)]


# Neural Network

In [71]:
import numpy as np
from keras.initializers import Constant
from keras import layers
from keras.layers import ReLU
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
embed_matrix = np.zeros(shape=(vocab_size, embed_dim))
for word,i in tok.word_index.items():
  embed_vector=word2vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
    embed_matrix[i]=embed_vector
train = []
for row in train_y:
  if row == "moderate":
    train.append(1)
  elif row == "severe":
    train.append(2)
  elif row == "not depression":
    train.append(0)

Y=keras.utils.to_categorical(train)  # one hot target as required by NN.
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.25,random_state=42)

model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_len,embeddings_initializer=Constant(embed_matrix)))
model.add(Flatten())
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.50))
# model.add(Dense(16,activation='relu'))
# model.add(Dropout(0.20))
model.add(Dense(3,activation='softmax'))

model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
loss='categorical_crossentropy',metrics=['accuracy'])


model=model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_test,y_test))
# pred_y = model.predict(x_test)
# metrics.classification_report(y_test, pred_y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


AttributeError: ignored