## Import Section

In [None]:
import os
import re
import numpy as np 
import pandas as pd 

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.feature_extraction.text import (CountVectorizer, 
                                             TfidfVectorizer, 
                                             TfidfTransformer)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

## Load training and test data sets

In [None]:
# Read the test split from the Kaggle input directory.
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Read the training split and preview its first rows.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_df.head()

In [None]:
# Check the shape (rows, columns) of the training and test data sets
print("Shape of training data:", train_df.shape)
print("Shape of test data:", test_df.shape)

In [None]:
# Count the null values in each column of the training data
train_df.isnull().sum()

In [None]:
# Count the null values in each column of the test data
test_df.isnull().sum()

### Drop location column as it contains a lot of null values

In [None]:
# Remove the 'location' column from both frames in place, so the training and
# test data keep the same set of feature columns.
train_df.drop('location', axis = 1, inplace = True)
test_df.drop('location', axis = 1, inplace = True)

In [None]:
# Display the first ten tweets from the training data set
print(train_df['text'][:10].values)

### Clean the text by
- Removing hash tags
- Removing punctuation marks
- Lower-casing all the text

In [None]:
# substitute url with a placeholder in the text messages
# Raw string so that regex escapes such as "\(" are passed to the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def url_holder(text):
    """
    This function will receive text message as input and will
    substitute every url with the placeholder 'url'.
    
    Args: 
        text: text message 
        
    Returns:
        text: the message with each url replaced by the literal string 'url'
    """
    # Replace each match in a single pass. Replacing the matches one by one
    # with str.replace would corrupt a longer url that starts with another
    # detected url (e.g. "http://a.co" and "http://a.co/x").
    return re.sub(url_regex, 'url', text)

### Add some new features to the data

In [None]:
def add_features(df):
    """
    This function will create additional features to improve the performance
    of the model. Features such as length of the message, number of words, 
    number of non stopwords and average word length in each message will be
    created by this method.
    
    The frame is modified in place: the 'text' column is overwritten with the
    url-substituted text, and the columns 'processed_text', 'length',
    'num_words', 'non_stopwords' and 'avg_word_len' are added.
    
    Args: 
        df: original dataframe; must contain a 'text' column of strings
        
    Returns:
        df: dataframe with new added features (the same object as the input)
    """
    # create a set of stopwords
    all_stopwords = set(stopwords.words('english'))
    
    # build the lemmatizer once instead of once per token
    lemmatizer = WordNetLemmatizer()
    
    # substitute url with the placeholder in the text message
    # (operate on the frame that was passed in, not on a global frame)
    df['text'] = df['text'].apply(url_holder)
    
    # lowering and removing punctuation
    df['processed_text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
    
    # apply lemmatization
    df['processed_text'] = df['processed_text'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(token) for token in x.split()))
    
    # get length of the message (in characters)
    df['length'] = df['processed_text'].apply(len)
    
    # get number of words in each message
    df['num_words'] = df['processed_text'].apply(lambda x: len(x.split()))
    
    # lengths of the non stopword tokens of each message; both statistics
    # below use the full stopword list, before the negations are taken out
    word_lengths = df['processed_text'].apply(
        lambda x: [len(t) for t in x.split() if t not in all_stopwords])
    
    # get the number of non stopwords in each message
    df['non_stopwords'] = word_lengths.apply(len)
    
    # get the average word length (0 when the message has no non stopwords)
    df['avg_word_len'] = word_lengths.apply(
        lambda lengths: np.mean(lengths) if lengths else 0)
    
    # update stop words (didn't want to remove negation)
    # NOTE(review): punctuation was stripped above, so the apostrophe forms
    # listed here cannot match any token; only 'nor', 'not' and 'no' appear
    # to have an effect.
    removable_stopwords = all_stopwords.difference(
        ["aren't", 'nor', 'not', 'no', "isn't", "couldn't", "hasn't", "hadn't", "haven't",
         "didn't", "doesn't", "wouldn't", "can't"])
    
    # remove stop words from processed text message
    df['processed_text'] = df['processed_text'].apply(
        lambda x: ' '.join(token for token in x.split() if token not in removable_stopwords))
        
    # filter the words with length > 2
    df['processed_text'] = df['processed_text'].apply(
        lambda x: ' '.join(token for token in x.split() if len(token) > 2))
    
    return df

In [None]:
# Add the engineered feature columns to the training data and preview the result.
train_df = add_features(train_df)
train_df.head()

In [None]:
# display the first ten processed text messages
print(train_df['processed_text'][:10].values)

In [None]:
# fill null values in the keyword column with the placeholder string '0'
# (a constant marker for "no keyword", not the most frequent keyword)
train_df['keyword'] = train_df['keyword'].fillna('0')
test_df['keyword'] = test_df['keyword'].fillna('0')

### Split data into train and validation sets

In [None]:
# Features: every column except 'target', with the first remaining column
# dropped by position (presumably the 'id' column - verify against the CSV).
# Labels: the 'target' column. The split keeps 20% for validation, is
# stratified on the target and uses a fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(train_df.drop('target', axis = 1).iloc[:, 1:],
                                                  train_df['target'].values,
                                                  test_size = 0.2, 
                                                  stratify = train_df['target'].values, 
                                                  random_state = 42)

# print the shape of the training and validation sets
print(f'x_train shape: {x_train.shape}\ny_train shape: {y_train.shape}')
print(f'x_val shape: {x_val.shape}\ny_val shape: {y_val.shape}')

In [None]:
class TextColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one text column from the input data.

    The column is looked up as ``X[key]``; for a pandas data frame this is a
    one-dimensional selection of the column.

    Args:
        key: name of the column to select
    """
    def __init__(self, key):
        # column name used by transform()
        self.key = key

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the column of X named by ``key``."""
        selected = X[self.key]
        return selected
    
    

class NumColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one numeric column from the input data.

    The column is looked up with a one-element list, ``X[[key]]``; for a
    pandas data frame this keeps the result two-dimensional (a single-column
    frame).

    Args:
        key: name of the column to select
    """
    def __init__(self, key):
        # column name used by transform()
        self.key = key

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the single-column selection of X named by ``key``."""
        columns = [self.key]
        return X[columns]

### Scikit Learn Pipeline

In [None]:
# create separate pipelines to process individual features
# Each pipeline first selects one column of the input frame and then applies
# the transformer for that column: StandardScaler for the numeric columns,
# a text vectorizer for the text columns.

# pipeline to process num_words column
num_words = Pipeline([
    ('selector', NumColumnSelector(key = 'num_words')),
    ('scaler', StandardScaler())
])

# pipeline to process non_stopwords column
num_non_stopwords = Pipeline([
    ('selector', NumColumnSelector(key = 'non_stopwords')),
    ('scaler', StandardScaler())
])

# pipeline to process avg_word_len column
avg_word_length = Pipeline([
    ('selector', NumColumnSelector(key = 'avg_word_len')),
    ('scaler', StandardScaler())
])

# pipeline to process processed_text column
# (step is named 'selector' like in the other pipelines; it was misspelled)
message_processing = Pipeline([
    ('selector', TextColumnSelector(key = 'processed_text')),
    ('tfidf', TfidfVectorizer())
])


# pipeline to process length column
length = Pipeline([
    ('selector', NumColumnSelector(key = 'length')),
    ('scaler', StandardScaler())
])


# pipeline to process keyword column
counter = Pipeline([
    ('selector', TextColumnSelector(key = 'keyword')),
    ('counter', CountVectorizer())
])

### Feature Union to combine data processing

In [None]:
# combine the outputs of all per-column pipelines into one feature matrix
# using feature union
feature_union = FeatureUnion([
    ('num_words', num_words),
    ('num_non_stopwords', num_non_stopwords),
    ('avg_word_length', avg_word_length),
    ('message_processing', message_processing),
    ('length', length),
    ('counter', counter)
])


# create final pipeline to train the classifier:
# feature extraction followed by a random forest with default settings
final_pipeline = Pipeline([
    ('feature_union', feature_union),
    ('clf', RandomForestClassifier())
])

# fit the pipeline on training data
final_pipeline.fit(x_train, y_train)

### Evaluate performance on validation data

In [None]:
# predict the validation set with the untuned pipeline and report its accuracy
y_pred = final_pipeline.predict(x_val)
print(f'Accuracy on validation data: {accuracy_score(y_val, y_pred)}')

### Hyperparameter tuning

In [None]:
# list the parameter names of the final pipeline; these are the keys that
# can be used in the grid-search parameter dictionary
final_pipeline.get_params().keys()

In [None]:
# prepare dictionary of parameters
# Keys address nested steps with scikit-learn's "<step>__<parameter>" naming.
# 'auto' is intentionally not listed for max_features: for classifiers it was
# an alias of 'sqrt' (a duplicated grid point) and newer scikit-learn
# releases reject it.
parameters = {'feature_union__message_processing__tfidf__max_df': [0.5, 0.75, 1.0],
              'feature_union__message_processing__tfidf__ngram_range': [(1, 1), (1, 2)],
              'feature_union__message_processing__tfidf__use_idf': [True, False],
              'clf__n_estimators': [200, 400],
              'clf__max_features': ['sqrt', 'log2'],
             }


# create GridSearchCV object (5-fold cross validation, all available cores)
grid_cv = GridSearchCV(final_pipeline, parameters, cv = 5, n_jobs = -1)

# Fit and tune the model
grid_cv.fit(x_train, y_train)

In [None]:
# display the parameter combination selected by the grid search
grid_cv.best_params_

In [None]:
# grid_cv was created with the default refit=True, so grid_cv.fit() has
# already refitted the best parameter combination on all of x_train; no
# separate refit step is needed (accessing grid_cv.refit only reads a flag).

# calculate accuracy on validation data
y_pred = grid_cv.predict(x_val)
print(f'Accuracy on validation data: {accuracy_score(y_val, y_pred)}')

### Prepare test data for making predictions

In [None]:
# Add the same engineered feature columns to the test data and preview the result.
test_df = add_features(test_df)
test_df.head()

In [None]:
# test features: every column of the test frame except the first one
test_features = test_df.iloc[:, 1:]
test_preds = grid_cv.predict(test_features)

# build the submission table: one row per test id with its predicted target
submission = pd.DataFrame({'id': test_df['id'], 'target': test_preds})
submission.head()

In [None]:
# write the submission to a csv file, leaving out the row index
submission.to_csv('pipeline.csv', index = False)