## Mounting Google Drive

In [None]:
# Attach Google Drive to the Colab filesystem so the dataset stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Importing Modules

In [None]:
try:
  %tensorflow_version 2.x  #gpu
except Exception:
  pass
  
import tensorflow as tf

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  #gpu`. This will be interpreted as: `2.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.


In [None]:
import os
import re
import pandas as pd

## Loading & Splitting The Data

In [None]:
# Load the Stack Overflow sample from the mounted Google Drive folder.
DATA_PATH = "/content/drive/MyDrive/BERT_canadian_AI/stack_overflow_10000.csv"
train = pd.read_csv(DATA_PATH)

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
train.shape

(10000, 7)

In [None]:
# Preview the first six rows to inspect the available columns and their contents.
train.head(6)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Id,Title,Body,Tags,single_label
0,536774,536774,31941610,Importing project from Eclipse to Android Stud...,When I import the whole project from eclipse t...,['android'],android
1,459695,459695,27882300,"How to set min x, min y, max y, and min y MPAn...",From\n\nhttps://github.com/PhilJay/MPAndroidCh...,['android'],android
2,212462,212462,14277730,How to delete a Draft from google play Develop...,I have a test draft on Google Play developer c...,['android'],android
3,582924,582924,34232590,Android have a style add properties instead of...,"Probably not a very clear question, but here's...",['android'],android
4,245908,245908,16224580,Google Endpoints generated libraries for JAVA ...,I have strange problem with generating process...,['android'],android
5,123527,123527,9139860,can't find com.android.future.usb.accessory.jar,I'am looking for this file to work with Androi...,['android'],android


In [None]:
# List the column names before deciding which ones to drop.
train.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Id', 'Title', 'Body', 'Tags',
       'single_label'],
      dtype='object')

In [None]:
# Keep only the question body and its label; the other columns are not used
# by the rest of the notebook.
# Reassign instead of mutating in place, and ignore already-missing columns,
# so re-running this cell is a no-op rather than a KeyError.
train = train.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'Id', 'Title', 'Tags'],
    errors='ignore',
)

In [None]:
# Confirm that only two columns remain after the drop.
train.shape

(10000, 2)

In [None]:
# Mapping from Stack Overflow tag name to the integer class id used for training.
label_dict = {
    'android': 0,
    'c#': 1,
    'c++': 2,
    'html': 3,
    'ios': 4,
    'java': 5,
    'javascript': 6,
    'jquery': 7,
    'php': 8,
    'python': 9,
}

In [None]:
# Encode the string tags in 'single_label' as integer class ids via label_dict.
encoded_labels = train['single_label'].map(label_dict)

# Series.map yields NaN for every value that is not a key of label_dict. That
# happens for unexpected tags, and also if this cell is run a second time
# (the column then already holds integers). Fail loudly instead of silently
# writing NaN labels into the frame.
unmapped_values = train.loc[encoded_labels.isna(), 'single_label'].unique()
if len(unmapped_values) > 0:
  raise ValueError(
      "single_label contains values missing from label_dict: {}".format(list(unmapped_values))
  )

train['single_label'] = encoded_labels

In [None]:
# Rename the two remaining columns to 'text' and 'labels'.
# NOTE(review): these are presumably the column names Simple Transformers
# expects for classification data — confirm against its documentation.
# Reassign rather than mutate in place so the cell produces a fresh frame.
train = train.rename(columns={'Body': 'text', 'single_label': 'labels'})

In [None]:
from sklearn.model_selection import train_test_split

# Optional: uncomment to train on a small random subset for a quick run.
# train = train.sample(frac = 0.02)

# Split the data into training and validation sets (80% / 20%), with a fixed
# seed so the split is reproducible.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 120
train, val = train_test_split(
    train,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Size of the training split as (rows, columns).
train.shape

(8000, 2)

In [None]:
# Size of the validation split as (rows, columns).
val.shape

(2000, 2)

In [None]:
# Preview the training split: shuffled rows with the 'text' and 'labels' columns.
train.head(5)

Unnamed: 0,text,labels
7432,I can pass in multiple ajax calls into a when ...,7
8257,I've recently decided to get interested in ser...,8
9995,I have a large binary file of ieee 32bit float...,9
6079,I'm using tooltips for a button and it's menu ...,6
4791,I am new to iphone development. I want to shar...,4


## Installing & Importing Simple Transformers

In [None]:
!pip install simpletransformers



## Creating A Classification Model

In [None]:
import torch
from simpletransformers.classification import ClassificationModel

# Create a RoBERTa-base sequence classifier.
# - num_labels follows label_dict so the number of output classes stays in sync
#   with the tag-to-id mapping (10 entries).
# - use_cuda is enabled only when PyTorch can see a CUDA device, so the same
#   cell runs on both GPU and CPU-only runtimes instead of always forcing CPU.
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=len(label_dict),
    use_cuda=torch.cuda.is_available(),
)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

## Training the Classifier

In [None]:
# Fine-tune the classifier on the training split (columns 'text' and 'labels').
# NOTE(review): the displayed tuple looks like (number of training steps,
# training loss) — confirm against the Simple Transformers documentation.
model.train_model(train)

  0%|          | 0/8000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1000 [00:00<?, ?it/s]

(1000, 1.0041803017295896)

## Evaluating The Classifier

In [None]:
# Evaluate the fine-tuned model on the validation split. The call returns three
# values, unpacked as: a dict of metrics (scores1), the raw per-example model
# outputs (model_outputs), and wrong_predictions (presumably the misclassified
# examples — confirm against the Simple Transformers documentation).
scores1, model_outputs, wrong_predictions = model.eval_model(val)

  0%|          | 0/2000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# Inspect the raw model outputs: a 2-D array with one row per validation example.
# The evaluation report takes the argmax across each row to get the predicted class id.
model_outputs

array([[-1.86529636,  0.04579132, -2.11077809, ...,  3.07998586,
        -0.98661494, -1.16007519],
       [-1.65221989, -0.53846371, -1.39697397, ...,  1.43221617,
        -0.36308545, -0.5008322 ],
       [-0.84825075, -1.50272834,  0.13756114, ..., -1.05420613,
        -0.36241573,  5.91984415],
       ...,
       [-1.12419724, -0.91993934, -1.96546006, ...,  5.66768551,
        -0.25085485, -0.78909576],
       [-0.99710053, -1.44577193,  0.08364326, ..., -1.04882705,
        -0.41716114,  5.91843033],
       [ 0.12693605,  5.0707674 , -1.11617589, ..., -0.50537366,
        -0.73202181, -2.02522254]])

In [None]:
import numpy as np

In [None]:
# Metrics returned by eval_model: evaluation loss and Matthews correlation coefficient ('mcc').
# NOTE(review): the stored mcc (~0.93) looks inconsistent with the 0.80 accuracy in the
# classification report; the saved outputs may come from different runs — re-run to confirm.
scores1

{'eval_loss': 0.20702565842881226, 'mcc': 0.9280285195386848}

#### Evaluation Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the validation split.
# The predicted class for each example is the index of its largest model output.
true_class_ids = val['labels'].values
predicted_class_ids = np.argmax(model_outputs, axis=1)
print(classification_report(true_class_ids, predicted_class_ids))

              precision    recall  f1-score   support

           0       0.91      0.74      0.82       213
           1       0.69      0.77      0.73       196
           2       0.81      0.89      0.85       202
           3       0.79      0.83      0.81       198
           4       0.85      0.81      0.83       220
           5       0.79      0.80      0.79       196
           6       0.74      0.64      0.68       202
           7       0.84      0.75      0.79       193
           8       0.76      0.90      0.82       191
           9       0.86      0.88      0.87       189

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000

