In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading the train and test files

In [None]:
# Load the AG News train and test splits, then report each frame's
# (rows, columns) shape on its own line.
train_df = pd.read_csv("/kaggle/input/ag-news-classification-dataset/train.csv")
test_df = pd.read_csv("/kaggle/input/ag-news-classification-dataset/test.csv")

print(train_df.shape, test_df.shape, sep="\n")

## Missing values analysis

In [None]:
# Print the DataFrame.info() summary for the training split; it is used here
# to check the columns for missing values.
train_df.info()

In [None]:
# Print the DataFrame.info() summary for the test split; it is used here
# to check the columns for missing values.
test_df.info()

### Great, nothing is missing!


### Target variable 

### There are 4 types of news 

### The class ids are numbered 1-4 where 
#### 1 represents World, 2 represents Sports, 3 represents Business and 4 represents Sci/Tech.

In [None]:
# Count how many training rows carry each value of the 'Class Index' target.
train_df['Class Index'].value_counts()

## Preparing training data

### Please note: this is a baseline using PyTorch and BERT, so we are not doing any text preprocessing

In [None]:
# Keep only the two columns used downstream: the article description as
# `text` and the class id as `label`.
df = pd.DataFrame({
    'text': train_df['Description'],
    'label': train_df['Class Index'],
})

### Since the labels run from 1 to N, we need to map them to 0 to N-1 so that we don't get "CUDA error: device-side assert triggered"

In [None]:
# Shift the 1..4 class ids down to 0..3. The shift is computed from the raw
# `train_df` column rather than from `df['label']` itself, so re-running this
# cell does not subtract 1 a second time.
df['label'] = train_df['Class Index'] - 1

# Fail early (instead of deep inside training) if any label is outside 0..3.
assert df['label'].between(0, 3).all(), "labels must be in the range 0..3"
df['label'].value_counts()

## Install simple transformers

In [None]:
pip install simpletransformers

## Configure the simple transformer for classifying the text

### select the bert model you want to train
### Hardware Accelerator 
### No of labels  - in our case it is 4 

In [None]:
from simpletransformers.classification import ClassificationModel

# Options passed through to simpletransformers via `args`.
model_args = {'reprocess_input_data': True, 'overwrite_output_dir': True}

# BERT classifier ('bert-base-cased') with four output labels, with CUDA enabled.
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=4,
    args=model_args,
    use_cuda=True,
)

### Let's begin training the model on a smaller dataset

In [None]:
# Train on a random 50 % subset of the prepared training frame.
# A fixed random_state selects the same rows on every "Restart & Run All",
# so results are comparable between runs.
small_df = df.sample(frac=0.5, random_state=42)

In [None]:
%%time
# Train the classifier on the sampled subset. The %%time cell magic must stay
# on the first line of the cell; it reports how long the cell took to run.
model.train_model(small_df)

### Let's prepare our evaluation dataset

In [None]:
# Evaluation frame built from the test split: description as `text`, and the
# class id shifted down by one as `label` so that labels start at 0.
eval_df = pd.DataFrame({
    'text': test_df['Description'],
    'label': test_df['Class Index'] - 1,
})
eval_df['label'].value_counts()

### Evaluate the model

In [None]:
# eval_model returns three values, unpacked here; `model_outputs` is consumed
# below to derive the predicted labels. (Presumably: a metrics summary, the raw
# per-row model scores, and the misclassified examples — confirm against the
# simpletransformers documentation.)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
# Show the evaluation result followed by the model outputs, one per line.
print(result, model_outputs, sep='\n')

In [None]:
# Ground-truth labels as a plain list, and the predicted label for each
# evaluation row (the index of the largest value in that row's model output).
true = eval_df['label'].tolist()
lst = [np.argmax(scores) for scores in model_outputs]
predicted = lst

In [None]:
# `import sklearn` alone does not guarantee that the `metrics` submodule is
# loaded; import it explicitly so `sklearn.metrics.*` works on a fresh kernel
# regardless of what other libraries happened to import beforehand.
import sklearn.metrics

# Per-class report; target_names are listed in label order 0..3.
print(sklearn.metrics.classification_report(
    true,
    predicted,
    target_names=['World', 'Sports', 'Business', 'Sci/Tech'],
))


### Since this is a balanced dataset  - Accuracy is a good measure of model performance

In [None]:
# Overall fraction of evaluation rows whose predicted label matches the truth.
sklearn.metrics.accuracy_score(y_true=true, y_pred=predicted)