<a href="https://colab.research.google.com/github/saivarun08777/saivarun08777/blob/main/nltk_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import os
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer

In [None]:
# Choose a distribution strategy: connect to a TPU when one can be resolved,
# otherwise fall back to TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # ValueError is what this cell treats as "no TPU available".
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected. Previously
# this line lived inside the except branch, so nothing was printed on a TPU.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Folder holding the train/test CSVs; change this one constant to point the
# notebook at a different copy of the data.
# NOTE(review): no cell in this notebook mounts Google Drive, so presumably it
# is mounted manually before this cell runs — confirm.
DATA_DIR = '/content/drive/MyDrive/nltk'

train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# (rows, columns) of the training and test frames, shown side by side.
train_data.shape, test_data.shape

In [None]:
# DataFrame.info() prints its report and returns None, so call it as two
# statements instead of building a tuple that displays as (None, None).
train_data.info()
test_data.info()

In [None]:
# Summary statistics for the training set. The method has to be called;
# without the parentheses the cell only shows the bound-method repr.
train_data.describe()

In [None]:
# Summary statistics for the test set. The method has to be called;
# without the parentheses the cell only shows the bound-method repr.
test_data.describe()

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Number of missing values in each test column.
test_data.isnull().sum()

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Column names of the training set.
train_data.columns

In [None]:
# Column names of the test set.
test_data.columns

In [None]:
# Per-column dtypes of the training set.
train_data.dtypes

In [None]:
# Per-column dtypes of the test set.
test_data.dtypes

In [None]:
# Distinct values of the `lang_abv` column in the training set.
train_data['lang_abv'].unique()

In [None]:
# Distinct values of the `language` column in the training set.
train_data['language'].unique()

In [None]:
# Number of distinct label classes. The earlier `.unique().sum()` added the
# label values together, which is not a count of classes.
train_data['label'].nunique()

In [None]:
# Distinct label values present in the training set.
train_data['label'].unique()

In [None]:
# Distinct values of the `lang_abv` column in the test set.
test_data['lang_abv'].unique()

In [None]:
# Distinct values of the `language` column in the test set.
test_data['language'].unique()

In [None]:
# Drop the `lang_abv` column from both frames. errors='ignore' keeps the cell
# idempotent: because the result is assigned back to the same names, a second
# run would otherwise raise KeyError on the already-removed column.
train_data = train_data.drop(columns='lang_abv', errors='ignore')
test_data = test_data.drop(columns='lang_abv', errors='ignore')

In [None]:
# Preview the training set again, after the `lang_abv` column was dropped.
train_data.head()

In [None]:
# Preview the test set again, after the `lang_abv` column was dropped.
test_data.head()

In [None]:
# (rows, columns) of the training set at this point in the notebook.
train_data.shape

In [None]:
# (rows, columns) of the test set at this point in the notebook.
test_data.shape

In [None]:
# Set the figure size first: sns.set only affects figures created afterwards,
# so calling it after countplot left this chart at the previous size.
sns.set(rc={'figure.figsize':(15,15)})
# Pass the column as `x=`; newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=train_data['language'], palette='bright');

In [None]:
# Class balance of the training labels. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train_data['label'], palette='deep');

In [None]:
# Set the figure size first: sns.set only affects figures created afterwards,
# so calling it after countplot left this chart at the previous size.
sns.set(rc={'figure.figsize':(5,5)})
# Pass the column as `x=`; newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=test_data['language'], palette='bright');

In [None]:
# Share of each language in the training set, drawn as a pie chart.
labels, frequencies = np.unique(
    train_data['language'].values,
    return_counts=True,
)

plt.figure(figsize=(10, 10))
plt.pie(x=frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
# Share of each language in the test set, drawn as a pie chart.
labels, frequencies = np.unique(
    test_data['language'].values,
    return_counts=True,
)

plt.figure(figsize=(10, 10))
plt.pie(x=frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
pip install sentencepiece

In [None]:
pip install tokenizer

In [None]:
# Intentionally empty. This cell used to run `pip install backend tokenizer`:
# `backend` is not imported anywhere in this notebook, and `tokenizer` is
# already installed by an earlier cell, so nothing needs installing here.

In [None]:
# Intentionally empty. This cell used to run `pip install slow tokenizer`:
# `slow` is not imported anywhere in this notebook, and `tokenizer` is
# already installed by an earlier cell, so nothing needs installing here.

In [None]:
# Intentionally empty. This cell used to run `pip install fast tokenizer`:
# `fast` is not imported anywhere in this notebook, and `tokenizer` is
# already installed by an earlier cell, so nothing needs installing here.

In [None]:
# Intentionally empty. This cell used to run `pip install serialization`,
# a package that nothing in this notebook imports.

In [None]:
pip install tokenizers

In [None]:
# NOTE(review): this module is not used before the name `tokenizer` is rebound
# to a BertTokenizer instance in the next cell. Presumably the import can be
# deleted — confirm.
import tokenizer

In [None]:
# Checkpoint name shared by the tokenizer here and by build_model() later.
model_name = 'bert-base-multilingual-cased'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

In [None]:
def encode_sentence(s):
    """Tokenize one sentence and return its token ids, ending with [SEP].

    Uses the notebook-level ``tokenizer`` object.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
def bert_encode(premises, hypotheses, tokenizer):
    """Convert premise/hypothesis pairs into padded BERT input tensors.

    Each example is laid out as ``[CLS] hypothesis [SEP] premise [SEP]``
    (the hypothesis comes first), with segment id 0 for the [CLS] token and
    the hypothesis and segment id 1 for the premise.

    Args:
        premises: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings, aligned with ``premises``.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            It is used for every token, not only for the [CLS] id.

    Returns:
        dict with keys 'input_word_ids', 'input_mask' and 'input_type_ids',
        each a dense tensor padded with zeros to the longest example.

    Raises:
        ValueError: If ``premises`` and ``hypotheses`` differ in length.
    """
    if len(premises) != len(hypotheses):
        raise ValueError(
            'premises and hypotheses must have the same length, got '
            f'{len(premises)} and {len(hypotheses)}')

    def _encode(sentence):
        # Encode with the tokenizer passed to bert_encode, so the argument is
        # honoured instead of silently falling back to a global tokenizer.
        return tokenizer.convert_tokens_to_ids(
            [*tokenizer.tokenize(sentence), '[SEP]'])

    sentence1 = tf.ragged.constant([_encode(s) for s in hypotheses])
    sentence2 = tf.ragged.constant([_encode(s) for s in premises])

    # One [CLS] id per example, prepended to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Ones over the real tokens; to_tensor() pads the ragged rows with zeros,
    # which marks the padding positions in the mask.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs

In [None]:
def build_model(num_classes=3, learning_rate=1e-5):
    """Build and compile a BERT-based sequence classifier.

    The pretrained encoder named by the notebook-level ``model_name`` is
    loaded, and a softmax layer is placed on the hidden state at sequence
    position 0 (the [CLS] token).

    Args:
        num_classes: Number of output classes of the softmax layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` with three int32 inputs named
        'input_word_ids', 'input_mask' and 'input_type_ids'.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_type_ids")

    # Element 0 of the encoder output is the per-token hidden states.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the first position only.
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases reject.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Encode the training premise/hypothesis pairs into model-ready tensors.
train_input = bert_encode(
    premises=train_data['premise'].values,
    hypotheses=train_data['hypothesis'].values,
    tokenizer=tokenizer,
)

In [None]:
# NOTE(review): build_model is also defined in an earlier cell of this
# notebook; this later definition is the one in effect when the model is
# built. Consider keeping only one of the two.
def build_model(num_classes=3, learning_rate=1e-5):
    """Build and compile a BERT-based sequence classifier.

    The pretrained encoder named by the notebook-level ``model_name`` is
    loaded, and a softmax layer is placed on the hidden state at sequence
    position 0 (the [CLS] token).

    Args:
        num_classes: Number of output classes of the softmax layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` with three int32 inputs named
        'input_word_ids', 'input_mask' and 'input_type_ids'.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_type_ids")

    # Element 0 of the encoder output is the per-token hidden states.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the first position only.
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases reject.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Create the model inside the distribution strategy's scope.
with strategy.scope():
    model = build_model()

# Printing the layer summary does not need the strategy scope.
model.summary()

In [None]:
# Train for 5 epochs, holding out 20% of the training data for validation.
# The returned History object is kept so the per-epoch loss and accuracy can
# be inspected afterwards instead of being discarded.
history = model.fit(
    train_input,
    train_data.label.values,
    epochs=5,
    verbose=1,
    batch_size=16,
    validation_split=0.2,
)

In [None]:
# Encode the test premise/hypothesis pairs the same way as the training data.
test_input = bert_encode(
    premises=test_data['premise'].values,
    hypotheses=test_data['hypothesis'].values,
    tokenizer=tokenizer,
)

In [None]:
# Preview the test rows that are about to be scored.
test_data.head()

In [None]:
# Index of the highest-scoring class for every test example.
predictions = list(np.argmax(model.predict(test_input), axis=1))

In [None]:
# Start from a one-column copy holding the test ids, then attach the
# predicted class for each row.
pred_data = test_data[['id']].copy()
pred_data['prediction'] = predictions

In [None]:
# Preview the first ten id/prediction rows.
pred_data.head(10)

In [None]:
# Write the predictions to pred_data.csv in the working directory, without
# the DataFrame index column.
pred_data.to_csv("pred_data.csv", index = False)

In [None]:
# Distribution of predicted classes. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=pred_data['prediction'], palette='bright');