In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install ktrain
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Simplest code for NLP - Sentiment analysis
We'll look at the most basic code for a **sentiment analysis**-style text classification problem (here: telling real news from fake news) in **5 easy steps**

For this we are going to rely on the **ktrain library** and **Google's famous NLP model: BERT**

**As this is a basic guide, please don't expect multiple analysis steps or graphs**

# Step 1: Import libraries, load the dataset and do some basic text cleaning

We'll clean the article text by replacing special characters, digits and punctuation with spaces

In [None]:
import pandas
import ktrain
import re
from ktrain import text

# Load the two CSV files of the dataset; each file holds exactly one class.
dataset_real = pandas.read_csv('../input/fake-and-real-news-dataset/True.csv')
dataset_fake = pandas.read_csv('../input/fake-and-real-news-dataset/Fake.csv')

# Label the rows before merging: 1 = rows from True.csv, 0 = rows from Fake.csv.
dataset_real['category'] = 1
dataset_fake['category'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row numbers of each source file.
dataset = pandas.concat([dataset_real, dataset_fake], ignore_index=True)

# Replace every *run* of non-letter characters with a single space. Doing it in
# one pass leaves no multi-space gaps behind (substituting '  ' -> ' ' once only
# merges pairs, so three or more spaces would stay as several spaces).
# NOTE(review): the training cell further down passes text_column='title', which
# this cleaning does not touch -- confirm whether 'title' should be cleaned as
# well, or whether 'text' is the intended model input.
dataset['text'] = dataset['text'].str.replace(r'[^a-zA-Z]+', ' ', regex=True)

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
dataset.info()

# Step 2: Preprocess data, split dataset for training and test

We can use any of the following ktrain functions, depending on where the data comes from:
* texts_from_df - to read data from dataframe
* texts_from_folder - to read data from a directory
* texts_from_csv - to read data from a csv file

**Arguments passed:**
* text_column - independent variable/input data which is to be processed
* label_columns - list of dependent variable/output result
* preprocess_mode - which model the data should be preprocessed for (one of three values: standard, bert or distilbert)
* maxlen - maximum number of tokens kept per document; longer inputs are truncated (be careful: larger values need much more memory and compute)

In [None]:
# Turn the dataframe into BERT-ready train/validation arrays plus the matching
# preprocessor object. No val_df is supplied, so ktrain carves the validation
# part out of `dataset` itself; a fixed random_state keeps that split identical
# on every "Restart & Run All".
(xtrain, ytrain), (xtest, ytest), preprocess = text.texts_from_df(
    dataset,
    text_column='title',
    label_columns=['category'],
    maxlen=128,
    preprocess_mode='bert',
    random_state=42,
)

# Step 3: Create a model

Create a model of your choice. Note that the `name` parameter passed here should match the `preprocess_mode` set above.

In [None]:
# Build the classifier; name='bert' matches the preprocess_mode used when the
# training arrays and the `preprocess` object were created.
model = text.text_classifier(
    name='bert',
    train_data=(xtrain, ytrain),
    preproc=preprocess,
)

# Step 4: Create a Learner instance

This step creates a learner instance, which will be trained in the next step.
The validation data is passed so that the model's predictions can be scored on held-out examples during training; the weights themselves are updated only from the training data.

In [None]:
# Wrap the model together with its training and validation arrays in a ktrain
# learner; batches of 32 samples are used.
learner = ktrain.get_learner(
    model=model,
    train_data=(xtrain, ytrain),
    val_data=(xtest, ytest),
    batch_size=32,
)

#  Step 5: Train the model

Now train the model by passing a learning rate and the number of epochs... it's that simple, and we are done.
I just didn't have the patience to wait until it completed, so I terminated it; I hope you got the gist though.

**Note: Since the BERT model is very powerful and comes pre-trained on a huge amount of data, you can get accuracy above 90% in a single epoch.**
**The only downside is that it takes a lot of time because the model is heavy; you can try its lighter version, distilbert.**

To test the predictions, you can use the `predict` method.

In [None]:
# Train for a single epoch (epochs=1) with a learning rate of 2e-5 via
# the learner's fit_onecycle method.
learner.fit_onecycle(
    lr=2e-5,
    epochs=1,
)