In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every Python warning for the rest of the session (no category or
# module restriction is given, so the filter matches all of them).
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()


# Load the labelled training tweets.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Keep an independent copy for BERT. A plain `bert_train = train` only creates a
# second name for the same DataFrame, so every in-place edit made to `train`
# for the TF-IDF models would also change the data intended for BERT.
bert_train = train.copy()

# Peek at ten random rows.
train.sample(10)

In [None]:
# Summary statistics of the training frame (pandas' default `describe()` output).
train.describe()

In [None]:
# Report, per column, the share of missing entries as a percentage (2 decimals).
for column in train.columns:
    missing_pct = round(train[column].isna().mean() * 100, 2)
    print(column + " column has: " + str(missing_pct) + "% Missing values")

In [None]:
# Drop the columns that the text models below do not use.
# Rebinding instead of `inplace=True` leaves any other reference to the loaded
# frame untouched, and `errors='ignore'` makes the cell safe to re-run
# (an in-place drop raises KeyError the second time it is executed).
train = train.drop(columns=['keyword', 'location'], errors='ignore')
train.head(1)

In [None]:
# Bar chart of how many tweets fall into each target class.
train['target'].value_counts().plot.bar()

In [None]:

from bs4 import BeautifulSoup


# Data Cleaning

In [None]:
# Text cleaning
import nltk
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
import re
from tqdm import tqdm

def text_cleaning(text):
    """Normalize a raw tweet for the bag-of-words (TF-IDF) models.

    Steps, in order: lower-case, strip markup, remove URLs and @mentions,
    replace non-word characters and underscores with spaces, tokenize,
    drop English stop words and lemmatize with WordNet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    # change the text into lower case.(Note: in case of social media text, it is good to leave them as it is!)
    text = text.lower()

    # removing xml tags from tweets
    text = BeautifulSoup(text, 'lxml').get_text()

    # removing URLs
    text = re.sub('https?://[A-Za-z0-9./]+', '', text)

    # removing words with "@"
    text = re.sub(r'@[A-Za-z0-9]+', '', text)

    # removing special characters
    text = re.sub(r"\W+|_", ' ', text)

    # tokenization into words
    tokens = word_tokenize(text)

    # Build the stop-word set once, outside the comprehension, so the corpus
    # list is not re-read and re-hashed for every single token.
    stop_words = set(stopwords.words('english'))

    # lemmatize the text using WordNet
    lm = WordNetLemmatizer()
    words = [lm.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(words)

In [None]:
# Clean every tweet (with a progress bar) and store the result back in the `text` column.
train['text'] = train['text'].progress_apply(text_cleaning)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF features over the cleaned tweets; no document-frequency pruning.
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=1)
X = tfidf.fit_transform(train['text'])
print(X.shape)


# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf=TfidfVectorizer(ngram_range=(1, 2),
#     max_df=1.0,
#     min_df=1,)

In [None]:
# Labels to predict.
y = train['target']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the TF-IDF rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Baseline: logistic regression with default settings, scored on the hold-out split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Shuffled 5-fold splitter, shared with the hyper-parameter searches below.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Fresh, unfitted estimator for cross-validation.
lr = LogisticRegression()

# Note: if `cv` is an integer, scikit-learn automatically picks KFold or StratifiedKFold.
cv_results = cross_val_score(lr, X, y, scoring='accuracy', cv=kfold)
cv_results.mean()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Search space: inverse regularisation strength and penalty type.
params= {'C': [0.001, 0.01, 0.1, 1, 10, 100, ] , 'penalty':['l1','l2']}

# The default 'lbfgs' solver rejects penalty='l1'; those fits fail and, because
# error_score=0, are silently recorded with a score of 0. 'liblinear' supports
# both 'l1' and 'l2', so every sampled candidate is actually trained and scored.
RS_lr = RandomizedSearchCV(
    LogisticRegression(solver='liblinear'),
    params,
    return_train_score=True,
    error_score=0,
    random_state=42,
    cv=kfold,
)
RS_lr.fit(X_train, y_train)

print(RS_lr.best_estimator_)
print(RS_lr.best_params_)
print(RS_lr.best_score_)

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over the same `params` grid.
# 'liblinear' is used because the default 'lbfgs' solver cannot fit
# penalty='l1'; with it, the 'l1' half of the grid fails to fit and can never
# be selected.
GS_lr = GridSearchCV(LogisticRegression(solver='liblinear'), params, cv=kfold)
GS_lr.fit(X_train, y_train)

print(GS_lr.best_estimator_)
print(GS_lr.best_params_)
print(GS_lr.best_score_)

# Support Vector Machine: SVC

In [None]:
%%time 
from sklearn.svm import SVC
# Baseline: support vector classifier with default settings.
model = SVC()
model.fit(X_train, y_train)

# Accuracy on the hold-out split.
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
%%time 
# SVM search space: regularisation strength, kernel, and kernel coefficient.
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

# Randomly sample candidates from the space, scored with the shared K-fold splitter.
RS_svm = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    return_train_score=True,
    error_score=0,
    random_state=42,
    cv=kfold,
)
RS_svm.fit(X_train, y_train)

print(RS_svm.best_estimator_)
print(RS_svm.best_params_)
print(RS_svm.best_score_)

In [None]:
%%time 
# Exhaustive search over the SVM `params` grid with the shared K-fold splitter.
GS_svm = GridSearchCV(estimator=model, param_grid=params, cv=kfold)
GS_svm.fit(X_train, y_train)

print(GS_svm.best_estimator_)
print(GS_svm.best_params_)
print(GS_svm.best_score_)

# Tree based model: RandomForestClassifier

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
# Baseline: random forest with default settings and a fixed seed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Accuracy on the hold-out split.
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# %%time

# params={'bootstrap': [True, False],
#  'max_depth': [10, 20, ],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [100,200, 250, 300]}


# RS_rf= RandomizedSearchCV(model, params, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)  # it will try 100 different combinations and all the cores available
# RS_rf.fit(X_train, y_train)

# print(RS_rf.best_estimator_)
# print(RS_rf.best_params_)
# print(RS_rf.best_score_)

#  BERT(w/ Huggingface)

In [None]:
import transformers
import tensorflow as tf
from tqdm import tqdm

In [None]:
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Load the pretrained "bert-base-uncased" checkpoint into the TF sequence-classification class.
# This rebinds `model`, which earlier cells used for the SVC and random forest.
# NOTE(review): no label count is passed, so the library default applies —
# presumably 2, matching the binary target; confirm.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:

def text_cleaning(text):
    """Clean a tweet for the BERT pipeline, keeping the original casing.

    This redefinition replaces the earlier `text_cleaning` in the notebook
    namespace. It strips markup, removes URLs and @mentions, replaces non-word
    characters and underscores with spaces, tokenizes, drops English stop
    words and lemmatizes with WordNet. Unlike the earlier version it does not
    lower-case the text.

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    # change the text into lower case.(Note: in case of social media text, it is good to leave them as it is!)
    # text=text.lower()

    # removing xml tags from tweets
    text = BeautifulSoup(text, 'lxml').get_text()

    # removing URLs
    text = re.sub('https?://[A-Za-z0-9./]+', '', text)

    # removing words with "@"
    text = re.sub(r'@[A-Za-z0-9]+', '', text)

    # removing special characters
    text = re.sub(r"\W+|_", ' ', text)

    # tokenization into words
    tokens = word_tokenize(text)

    # Build the stop-word set once, outside the comprehension, so the corpus
    # list is not re-read and re-hashed for every single token.
    stop_words = set(stopwords.words('english'))

    # lemmatize the text using WordNet
    lm = WordNetLemmatizer()
    words = [lm.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(words)
# Reload the raw tweets for BERT instead of reusing `train`: by this point
# `train['text']` has already been lower-cased, stop-word filtered and
# lemmatized for TF-IDF, so cleaning it again would feed BERT doubly-processed
# text and make the case-preserving cleaner above pointless.
bert_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
bert_train['text'] = bert_train['text'].progress_apply(text_cleaning)

In [None]:
# Split the cleaned tweets and labels for BERT: 25% validation, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    bert_train['text'],
    bert_train['target'],
    test_size=0.25,
    random_state=42,
)

In [None]:
def convert_data_to_examples(X_train, X_test, y_train, y_test):
    """Wrap the train and validation splits in `InputExample` objects.

    Each text is looked up by its index label and paired with the label stored
    under the same index in the matching target series.

    Returns
    -------
    tuple of (list, list)
        Training examples and validation examples, in index order.
    """
    train_InputExamples = [
        InputExample(guid=None, text_a=X_train[idx], label=y_train[idx])
        for idx in X_train.index
    ]
    validation_InputExamples = [
        InputExample(guid=None, text_a=X_test[idx], label=y_test[idx])
        for idx in X_test.index
    ]
    return train_InputExamples, validation_InputExamples

In [None]:
%%time
# Build the example lists for both splits.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a `tf.data.Dataset`.

    Parameters
    ----------
    examples : iterable
        Items providing `text_a` (the text to encode) and `label`.
    tokenizer
        Tokenizer whose `encode_plus` returns `input_ids`, `token_type_ids`
        and `attention_mask`.
    max_length : int, default 128
        Every sequence is truncated or padded to exactly this many tokens.

    Returns
    -------
    tf.data.Dataset
        Yields `(features, label)` pairs, where `features` is a dict with the
        int32 vectors `input_ids`, `attention_mask` and `token_type_ids`, and
        `label` is an int64 scalar.
    """
    features = []  # InputFeatures, one per example, converted to a dataset below

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,      # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; `padding="max_length"`
            # is the supported way to pad every sequence to `max_length`.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Emit one (inputs, label) pair per stored feature.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


In [None]:
# Training pipeline: shuffle with a 100-element buffer, batch by 32, then repeat twice.
train_data = (
    convert_examples_to_tf_dataset(train_InputExamples, tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation pipeline: batched only, order preserved.
validation_data = convert_examples_to_tf_dataset(validation_InputExamples, tokenizer).batch(32)


In [None]:
# Fine-tuning setup: Adam with gradient-norm clipping, and a sparse categorical
# cross-entropy loss applied to raw logits (from_logits=True) with integer labels.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# NOTE(review): `train_data` is built with `.repeat(2)`, so each of the 2 epochs
# below iterates the training set twice — confirm this is intended.
model.fit(train_data, epochs=2, validation_data=validation_data)

In [None]:
# Load and clean the competition test set with the same cleaner used for training.
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test['text'] = test['text'].apply(text_cleaning)
test_ = test['text'].tolist()

# Run inference in mini-batches. Sending the whole test set through BERT in a
# single forward pass makes activation memory grow with the number of tweets
# and can exhaust accelerator memory.
batch_size = 32
batch_logits = []
for start in tqdm(range(0, len(test_), batch_size)):
    tf_batch = tokenizer(
        test_[start:start + batch_size],
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors='tf',
    )
    # Element 0 of the model output holds the classification logits.
    batch_logits.append(model(tf_batch)[0])

# Keep `tf_outputs[0]` as the full logits tensor, which the following cell reads.
tf_outputs = (tf.concat(batch_logits, axis=0),)

In [None]:
# Convert logits to class probabilities, then take the most probable class per tweet.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
label = tf.argmax(tf_predictions, axis=1).numpy()

In [None]:
# labels=[0,1]
# for i in range(len(test_)):
#     print(test_[i], ": \n", labels[label[i]])

In [None]:
# Assemble the submission (tweet id + predicted class) and write it to the Kaggle output directory.
submission = pd.DataFrame({'id': test['id'], 'target': label})
submission.to_csv('/kaggle/working/submission.csv', index=False)