In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import string
import gensim
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

In [None]:
from tqdm.notebook import tqdm
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
import transformers
from transformers import BertForSequenceClassification
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import nltk
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from gensim import matutils, models
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Load the two CSV splits of the Stack Overflow question-quality dataset.
# NOTE(review): `train` is read from valid.csv and `test` from train.csv, so the variable
# names look swapped relative to the files. Confirm whether this is intentional
# (e.g. training on the smaller split) before relying on either name.
train=pd.read_csv('../input/60k-stack-overflow-questions-with-quality-rate/valid.csv')
test=pd.read_csv('../input/60k-stack-overflow-questions-with-quality-rate/train.csv')

In [None]:
# Number of missing values in each column of `test`.
test.isnull().sum()

In [None]:
# Preview the first rows of `train`.
train.head()

# EDA

In [None]:
# Encode the three quality classes in column Y as integer ids.
# NOTE: Series.map returns NaN for values that are not keys of the dict, so re-running
# this cell on the already-encoded column would replace every label with NaN.
targets = {'HQ': 0, 'LQ_EDIT': 1, 'LQ_CLOSE': 2}
train['Y'] = train['Y'].map(targets)

In [None]:
# (rows, columns) of `train` at this point.
train.shape

In [None]:
# Column labels of `train`.
train.columns

In [None]:
# Class balance of the encoded target column.
# The vector is passed as `x=` because recent seaborn releases accept only `data`
# positionally; a positional Series is no longer interpreted as the x variable.
sns.countplot(x=train['Y'])

In [None]:
# Combine title and body into a single text field (cleaning happens later in clean_data).
# A space separator keeps the last word of the title from fusing with the first word
# of the body, which the previous empty-string separator allowed.
train['text'] = train['Title'] + ' ' + train['Body']

In [None]:
# Keep only the model input and the label. The explicit .copy() makes `train` an
# independent frame, so the later column assignment and de-duplication do not raise
# pandas' SettingWithCopyWarning.
train = train[['text', 'Y']].copy()

In [None]:
# Preview the first four rows of the reduced frame.
train.head(4)

# Text cleaning

In [None]:
# Load NLTK's English stop-word list; clean_data() filters these words out of the text.
stop_word = stopwords.words('english')

In [None]:
def clean_data(data, stop_words=None):
    """Normalize one question into lowercase, space-separated alphabetic tokens.

    Processing order: lowercase, then remove HTML tags, URLs, @mentions,
    #hashtags and digit runs, then replace every remaining non-letter with a
    space, and finally drop stop words.

    The tag/URL/mention/hashtag rules run *before* punctuation is stripped
    because they depend on characters such as ``<``, ``>``, ``@`` and ``#``;
    stripping punctuation first would leave them nothing to match.

    Args:
        data: Raw text. Any non-string value (e.g. ``None`` or NaN) yields ``""``.
        stop_words: Optional iterable of words to drop. When omitted, the
            module-level ``stop_word`` list is used.

    Returns:
        The cleaned text as a single space-joined string.
    """
    if not isinstance(data, str):
        return ""

    if stop_words is None:
        stop_words = stop_word
    # Set membership is O(1); the NLTK list would be scanned for every token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    data = data.lower()
    # remove html tags (the raw-string prefix belongs outside the quotes)
    data = re.sub(r'<.*?>', ' ', data)
    # remove urls
    data = re.sub(r'http\S+', ' ', data)
    # remove mentions
    data = re.sub(r'@\w+', ' ', data)
    # remove hashtags
    data = re.sub(r'#\w+', ' ', data)
    # remove digits
    data = re.sub(r'\d+', ' ', data)
    # Replace anything that is not a letter or whitespace with a space, so that
    # tokens joined by punctuation (e.g. "foo.bar") stay separate words.
    data = re.sub(r'[^a-z\s]', ' ', data)
    # remove stop words
    return " ".join(word for word in data.split() if word not in stop_words)

In [None]:
# Run the cleaner over every question and preview the result.
cleaned_text = train['text'].apply(clean_data)
train['text'] = cleaned_text
train.head()

In [None]:
# Drop rows whose (text, Y) pair repeats. Rebinding the result instead of using
# inplace=True avoids mutating a derived frame in place and keeps the cell chainable.
train = train.drop_duplicates()
print(train.head(4))
print(train.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned questions for validation; the fixed random_state makes
# the split repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    train['text'],
    train['Y'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint
# (do_lower_case=True requests lower-casing of the input text).
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

In [None]:
# Histogram of whitespace-separated word counts per question in the training split.
seq_len = [len(question.split()) for question in xtrain]
word_counts = pd.Series(seq_len)
word_counts.hist(bins=30)

In [None]:
# Histogram of whitespace-separated word counts per question in the validation split.
seq_len = [len(question.split()) for question in xval]
word_counts = pd.Series(seq_len)
word_counts.hist(bins=30)

# Tokenizing the text and extracting input IDs and attention masks

In [None]:
# Tokenizer settings shared by both splits, so train and validation are encoded identically.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes the cut at `max_length` explicit instead of relying on the
# tokenizer's backward-compatibility fallback.
encode_kwargs = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=50,
    return_tensors='pt',
)

# The tokenizer expects a list of strings, so the pandas Series are converted first.
encoded_data_train = tokenizer.batch_encode_plus(list(xtrain), **encode_kwargs)

encoded_data_val = tokenizer.batch_encode_plus(list(xval), **encode_kwargs)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Pull the token ids and attention masks out of the encoded training data and convert
# the training labels to a tensor, so all three can be wrapped in a TensorDataset.
input_ids_train=encoded_data_train['input_ids']
attention_mask_train=encoded_data_train['attention_mask']
label_train=torch.tensor(ytrain.values)

In [None]:
# Same extraction for the validation split.
# Note the variable here is `attention_masks_val` (plural), unlike `attention_mask_train`.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
label_val = torch.tensor(yval.values)

In [None]:
# PyTorch TensorDataset instances: each item is an (input_ids, attention_mask, label) triple,
# in that order — the training loop indexes batches by these positions.
dataset_train = TensorDataset(input_ids_train,attention_mask_train, label_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val,label_val)

In [None]:

# Pretrained BERT with a fresh classification head.
# num_labels is derived from the label map (HQ / LQ_EDIT / LQ_CLOSE) so the head has
# exactly one output per class; the previous hard-coded 5 created two outputs that no
# label ever uses.
model = transformers.BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(targets),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# NOTE(review): both loaders built here are rebuilt by the following cell (which uses
# batch_size=512 for training), so these definitions are overwritten before use.
# Training loader: random order, 128 examples per batch.
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=128)

# Validation loader: dataset order, 128 examples per batch.
dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=128)

In [None]:
from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

# Training batches are drawn in random order, 512 examples at a time.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=512,
)

# Validation batches keep dataset order, 128 examples at a time.
dataloader_validation = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=128,
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no longer
# shipped by recent transformers releases. weight_decay=0.0 keeps the default of the
# old class (torch's own default is 0.01), so the optimization is unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 5

# Linear learning-rate decay with no warmup; the schedule is stepped once per training
# batch, so the total number of steps is batches-per-epoch times epochs.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of predicted scores against true labels.

    Args:
        preds: Per-class scores; the predicted class is the argmax along axis 1.
        labels: True class ids; flattened before scoring.

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
import random

# Seed every random number generator in use so that runs are repeatable.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use the GPU when one is present and fall back to the CPU otherwise; a hard-coded
# 'cuda' device makes model.to(device) fail on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Fine-tune the model for `epochs` passes over the training loader, reporting the
# running batch loss in the progress bar and the mean batch loss after each epoch.
model.to(device)
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Move the (input_ids, attention_mask, labels) tensors to the device once.
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss for this batch.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `len(batch)` is the number of tensors in the
        # tuple (3), not the batch size, so dividing by it under-reported the loss.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    tqdm.write(f'\nEpoch {epoch}')

    # Mean of the per-batch losses over the epoch.
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')