In [None]:
!pip install --upgrade transformers
!pip install simpletransformers
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
!pip install pyspellchecker

In [None]:
import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# Module-level handle kept for backward compatibility; None when no GPU is visible
# (indexing an empty list here would abort the whole cell).
gpu = GPUs[0] if GPUs else None


def printm():
    """Print host-RAM and GPU-memory usage for the current process.

    The first line shows available system RAM and this process's resident
    set size. The second line shows free/used/total memory and memory
    utilisation for the first GPU reported by GPUtil, or a short notice when
    no GPU is reported.

    Returns:
        None. Output goes to stdout.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    # Query the GPUs again on every call: the objects returned by getGPUs()
    # carry the values read at query time, so reusing the module-level `gpu`
    # would keep printing the numbers from when this cell first ran.
    current_gpus = GPU.getGPUs()
    if not current_gpus:
        print("GPU RAM Free: no GPU detected")
        return
    device = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(device.memoryFree, device.memoryUsed, device.memoryUtil*100, device.memoryTotal))


printm()

Gen RAM Free: 9.4 GB  |     Proc size: 5.7 GB
GPU RAM Free: 9656MB | Used: 5423MB | Util  36% | Total     15079MB


In [None]:
import numpy as np
import pandas as pd
from google.colab import files
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *
import re
from spellchecker import SpellChecker
import random
from sklearn.preprocessing import LabelEncoder
import torch
pd.options.display.max_colwidth = 200

def seed_all(seed_value):
    """Seed every random number generator used in this notebook with one value.

    Covers Python's ``random``, NumPy's global generator and PyTorch's CPU
    generator. When CUDA is available, the CUDA generators are seeded too and
    cuDNN is switched to deterministic mode with autotuning disabled.

    Args:
        seed_value: Integer seed handed to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic cuDNN kernels, no benchmark-driven kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_all(2)

In [None]:
spell = SpellChecker()
def get_correct_words(x, checker=None):
    """Replace words the spell checker does not know with its suggested correction.

    The sentence is split on whitespace; each word is checked on its own and
    the results are re-joined with single spaces.

    Args:
        x: Sentence to clean.
        checker: Object exposing ``unknown(words)`` and ``correction(word)``.
            Defaults to the notebook-level ``spell`` instance.

    Returns:
        The sentence with unknown words replaced. A word for which the
        checker offers no correction is kept as it was.
    """
    if checker is None:
        checker = spell
    new_sentence = []
    for token in x.split():
        unknown_forms = checker.unknown([token])
        if not unknown_forms:
            new_sentence.append(token)
            continue
        for word in unknown_forms:
            # correction() can return None when it has no candidate (newer
            # pyspellchecker releases); appending None would make the join
            # below raise TypeError, so fall back to the original token.
            new_sentence.append(checker.correction(word) or token)
    return ' '.join(new_sentence)

In [None]:
def convert_bhang_weed(x):
    """Swap every standalone, lowercase ``bhang`` token in ``x`` for ``weed``.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace are collapsed and leading/trailing whitespace is dropped.
    """
    return ' '.join('weed' if token == 'bhang' else token for token in x.split())

In [None]:
# --- Load the competition files from the mounted Google Drive ---
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ZindiAfrica/Tech4MentalHealth/Train.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ZindiAfrica/Tech4MentalHealth/Test.csv')
sample_sub = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ZindiAfrica/Tech4MentalHealth/SampleSubmission.csv')

# --- Integer-encode the target; `le` is kept so class names can be recovered ---
le = LabelEncoder()
train["label"] = le.fit_transform(train["label"])

# --- Text cleaning, applied identically to train and test ---
#   1. every character outside A-Z / a-z becomes a space
#   2. words unknown to the spell checker are replaced via get_correct_words
# (Lower-casing and the bhang -> weed mapping steps are currently disabled.)
for frame in (train, test):
    letters_only = frame['text'].apply(lambda raw: re.sub(r"[^A-Za-z]", " ", raw))
    frame['text'] = letters_only.apply(get_correct_words)

train.shape, test.shape, sample_sub.shape

((616, 3), (309, 2), (309, 5))

In [None]:
display(le.classes_)
train.head()

array(['Alcohol', 'Depression', 'Drugs', 'Suicide'], dtype=object)

Unnamed: 0,ID,text,label
0,SUAVK39Z,i feel that it was better i dream happy,1
1,9JDAGUV3,why do i get hallucinations,2
2,419WR1LQ,i am stressed due to lack of financial support in school,1
3,6UY7DX6Q,why is life important,3
4,FYC0FTFB,how could i be helped to go through the depression,1


In [None]:
test.head()

Unnamed: 0,ID,text
0,02V56KMO,how to overcome bad feelings and emotions
1,03BMGTOK,i feel like giving up in life
2,03LZVFM6,i was so depressed feel like got no strength to continue
3,0EPULUM5,i feel so low especially since i had no one to talk to
4,0GM4C5GD,can i be successful when i am a drug addict


In [None]:
train.label.value_counts()

1    352
0    140
3     66
2     58
Name: label, dtype: int64

In [None]:
print(train['text'].apply(lambda x: len(x.split())).describe())

count    616.000000
mean       7.686688
std        4.032496
min        1.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       35.000000
Name: text, dtype: float64


In [None]:
# NOTE(review): this summarises CHARACTER counts (len(x)), whereas the train
# summary uses WORD counts (len(x.split())) — the two tables are not directly
# comparable; confirm which unit is intended.
print(test['text'].apply(lambda x: len(x)).describe())

count    309.000000
mean      40.320388
std       21.673288
min        3.000000
25%       27.000000
50%       35.000000
75%       49.000000
max      167.000000
Name: text, dtype: float64


In [None]:
# Model-ready frames: drop the ID column; test rows get a placeholder label of 0
# so they carry the same text/label columns as the training data.
train1 = train.drop(columns=['ID'])
test1 = test.drop(columns=['ID']).assign(label=0)
#---------------------------------------------
# Stratified 80/20 split on the label; only the 80% part is carried forward
# under the name `train1`, the remaining 20% stays in `X_test`.
X_train, X_test = train_test_split(
    train1,
    test_size=0.2,
    random_state=27,
    stratify=train1["label"],
)
train1 = X_train
test_set = test.drop(columns=['ID']).assign(label=0)

In [None]:
test_set

Unnamed: 0,text,label
0,how to overcome bad feelings and emotions,0
1,i feel like giving up in life,0
2,i was so depressed feel like got no strength to continue,0
3,i feel so low especially since i had no one to talk to,0
4,can i be successful when i am a drug addict,0
...,...,...
304,yes,0
305,my girlfriend dumped me,0
306,how can i go back to being my old self,0
307,is it true weed is medicinal,0


In [None]:
le.classes_

array(['Alcohol', 'Depression', 'Drugs', 'Suicide'], dtype=object)

In [None]:
le.classes_[0]

'Alcohol'

In [None]:
%%time
# 10-fold stratified cross-validation: a fresh roberta-base classifier is
# fine-tuned per fold. Validation log-loss values are collected in `score` and
# the per-fold test-set class probabilities in `y_pred_tot`.
NUM_LABELS = 4

# Built once: the settings are identical for every fold.
model_args = {
    'train_batch_size': 16,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'fp16': False,
    'do_lower_case': False,
    'num_train_epochs': 2,
    'max_seq_length': 40,
    'regression': False,
    'manual_seed': 2,
    "learning_rate": 7e-5,
    'weight_decay': 0,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "silent": True,
}

score = []
y_pred_tot = []

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
for train_index, test_index in fold.split(train1, train1['label']):
    train1_trn, train1_val = train1.iloc[train_index], train1.iloc[test_index]
    # Hand each model its own copy so no fold can see changes made to the dict.
    model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, num_labels=NUM_LABELS, args=dict(model_args))
    model.train_model(train1_trn)
    result, model_outputs, wrong_predictions = model.eval_model(train1_val)
    # Pass the full label set explicitly so probability columns are always
    # matched to classes 0..NUM_LABELS-1, even if a fold were missing a class.
    loss = log_loss(train1_val["label"], softmax(model_outputs, axis=1), labels=list(range(NUM_LABELS)))
    print("LogLoss:",loss)
    score.append(loss)
    # test_set carries placeholder labels; only the raw outputs are used here.
    tst_result, tst_model_outputs, tst_wrong_predictions = model.eval_model(test_set)
    y_pred_tot.append(softmax(tst_model_outputs, axis=1))
print("Mean LogLoss: ",np.mean(score))

LogLoss: 0.36159881539642813
LogLoss: 0.3418080673366785
LogLoss: 0.5002500688162993
LogLoss: 0.46751308739565467
LogLoss: 0.355798281197037
LogLoss: 0.4862322390687709
LogLoss: 0.3621242179972481
LogLoss: 0.5133432977813848
LogLoss: 0.5649810534882911
LogLoss: 0.3588794755312253
Mean LogLoss:  0.4312528604009017
CPU times: user 1min 52s, sys: 47.5 s, total: 2min 40s
Wall time: 3min 35s


# Slightly tuned variant (batch size 32, 4 epochs, learning rate 5e-5)

In [None]:
%%time
# 10-fold stratified cross-validation with the adjusted settings (batch size 32,
# 4 epochs, learning rate 5e-5). Running this cell replaces any earlier contents
# of `score` and `y_pred_tot`.
NUM_LABELS = 4

# Built once: the settings are identical for every fold.
model_args = {
    'train_batch_size': 32,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'fp16': False,
    'do_lower_case': False,
    'num_train_epochs': 4,
    'max_seq_length': 40,
    'regression': False,
    'manual_seed': 2,
    "learning_rate": 5e-5,
    'weight_decay': 0,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "silent": True,
}

score = []
y_pred_tot = []

fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
for train_index, test_index in fold.split(train1, train1['label']):
    train1_trn, train1_val = train1.iloc[train_index], train1.iloc[test_index]
    # Hand each model its own copy so no fold can see changes made to the dict.
    model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, num_labels=NUM_LABELS, args=dict(model_args))
    model.train_model(train1_trn)
    result, model_outputs, wrong_predictions = model.eval_model(train1_val)
    # Pass the full label set explicitly so probability columns are always
    # matched to classes 0..NUM_LABELS-1, even if a fold were missing a class.
    loss = log_loss(train1_val["label"], softmax(model_outputs, axis=1), labels=list(range(NUM_LABELS)))
    print("LogLoss:",loss)
    score.append(loss)
    # test_set carries placeholder labels; only the raw outputs are used here.
    tst_result, tst_model_outputs, tst_wrong_predictions = model.eval_model(test_set)
    y_pred_tot.append(softmax(tst_model_outputs, axis=1))
print("Mean LogLoss: ",np.mean(score))

LogLoss: 0.4005510895140469
LogLoss: 0.4275798309221864
LogLoss: 0.4178802389651537
LogLoss: 0.41579708352456896
LogLoss: 0.3315300497093371
LogLoss: 0.5237689775471784
LogLoss: 0.3572037878579327
LogLoss: 0.5521345305807737
LogLoss: 0.6527903166656591
LogLoss: 0.41458453678963136
Mean LogLoss:  0.4493820442076468
CPU times: user 2min 22s, sys: 1min 22s, total: 3min 45s
Wall time: 4min 45s


In [None]:
np.mean(y_pred_tot, 0).shape

(309, 4)

In [None]:
# Average the per-fold test probabilities and label the columns with the
# encoder's class names, then attach the test IDs.
fold_mean = np.mean(y_pred_tot, 0)
op = pd.DataFrame(fold_mean, columns=le.classes_).assign(ID=test["ID"])
# Column order written to the submission file.
op = op[["ID","Depression","Alcohol","Suicide","Drugs"]]
print(op.shape)
# NOTE(review): the file name says "20fold", but the CV cells in this notebook
# use n_splits=10, and `y_pred_tot` holds whichever CV cell ran last — confirm
# the name matches the run being saved.
op.to_csv('20fold_rbbase_2_7e5_16_40.csv',index=False)

(309, 5)


In [None]:
# Single roberta-base classifier (no cross-validation): batch size 16, 4 epochs,
# learning rate 7e-5, progress output enabled (silent=False).
single_model_args = dict(
    train_batch_size=16,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    fp16=False,
    do_lower_case=False,
    num_train_epochs=4,
    max_seq_length=40,
    regression=False,
    manual_seed=2,
    learning_rate=7e-5,
    weight_decay=0,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    silent=False,
)
model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, num_labels=4, args=single_model_args)

In [None]:
# NOTE(review): `train_set` is not assigned in any cell shown in this notebook
# (the split cell produces `train1` / `X_train`); the recorded output reports
# 500 examples. Define it explicitly before a fresh-kernel run.
model.train_model(train_set)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=16.0, style=ProgressStyle(descripti…

Running loss: 0.768580


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=16.0, style=ProgressStyle(descripti…

Running loss: 0.295655


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=16.0, style=ProgressStyle(descripti…

Running loss: 0.435329


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=16.0, style=ProgressStyle(descripti…

Running loss: 0.193241



In [None]:
# NOTE(review): `eval_set` is not assigned in any cell shown in this notebook
# (the split cell produces `X_test`); the recorded output reports 116 examples.
# Define it explicitly before a fresh-kernel run.
result, model_outputs, wrong_predictions = model.eval_model(eval_set)

HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=15.0, style=ProgressStyle(descri…




In [None]:

# Log-loss on `eval_set`, using the row-wise softmax of the raw model outputs
# as class probabilities.
log_loss(eval_set["label"],softmax(model_outputs,axis=1))

0.3954912152920111

In [None]:
#Holdoutset : 0.407839043064164    Submission: 0.4096560688239747
#Holdoutset,Best: 0.3954912152920111   Submission: 0.384567126810665

In [None]:
# Run the model over `test_set` to obtain raw outputs. `test_set` carries a
# placeholder label of 0 on every row, so `result` and `wrong_predictions`
# presumably carry no real signal here — only `model_outputs` is used below.
result, model_outputs, wrong_predictions = model.eval_model(test_set)

HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=39.0, style=ProgressStyle(descri…




In [None]:
# Convert the raw test outputs into per-class probabilities, one column per
# encoded class name, then attach the test IDs.
test_probs = softmax(model_outputs, axis=1)
op = pd.DataFrame(test_probs, columns=le.classes_).assign(ID=test["ID"])
# Column order used for the submission.
op = op[["ID","Depression","Alcohol","Suicide","Drugs"]]
op

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.716499,0.019466,0.256429,0.007606
1,03BMGTOK,0.984927,0.001353,0.012340,0.001380
2,03LZVFM6,0.991761,0.000975,0.005845,0.001418
3,0EPULUM5,0.989025,0.001158,0.008409,0.001408
4,0GM4C5GD,0.006381,0.433007,0.040308,0.520304
...,...,...,...,...,...
304,Z9A6ACLK,0.886585,0.012182,0.093875,0.007357
305,ZDUOIGKN,0.796393,0.014294,0.182710,0.006604
306,ZHQ60CCH,0.457861,0.335373,0.160336,0.046430
307,ZVIJMA4O,0.009315,0.038944,0.023493,0.928248


In [None]:
# Write the current `op` frame to CSV in the working directory, without the index.
op.to_csv("submitRobertaBase2.csv",index=False)