In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## NLP Text Classification Model

This notebook builds a supervised machine learning model that predicts which tweets are about real disasters and which ones aren’t.

The steps involved in developing the text classification model are:

1. Importing libraries
2. Loading the dataset and performing Exploratory Data Analysis (EDA)
3. Text pre-processing
4. Splitting the training dataset
5. Extracting vectors from text (vectorization) using Bag-of-Words (with TF-IDF) and Word2Vec
6. ML model algorithms
7. Testing the ML models on the test dataset



### Import packages

In [None]:
#for data visualization
import seaborn as sns
import matplotlib.pyplot as plt

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

In [None]:
## Read the competition train and test CSV files into DataFrames
df_train, df_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
]
# Report (rows, columns) of the training frame, then preview its first rows
print(df_train.shape)
df_train.head(n=3)

In [None]:
# Preview the first three rows of the test frame
df_test.head(n=3)

### EDA

In [None]:
## Number of missing values in each column of the training frame
df_train.isnull().sum()

In [None]:
# Class distribution: number of training tweets per `target` value
x = df_train['target'].value_counts()
print(x)
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally
# (the original `sns.barplot(x.index, x)` raises a TypeError there).
sns.barplot(x=x.index, y=x.values)

In [None]:
## Words per tweet: number of whitespace-separated tokens in the raw text
df_train['word_count'] = [len(str(tweet).split()) for tweet in df_train['text']]

# Boolean masks for the two classes (target 1 = disaster, 0 = non-disaster)
is_disaster = df_train['target'] == 1
is_non_disaster = df_train['target'] == 0
print("Number of words in Disaster tweets:", df_train.loc[is_disaster, 'word_count'].mean())
print("Number of words in Non-Disaster tweets:", df_train.loc[is_non_disaster, 'word_count'].mean())

# Side-by-side histograms of the word count, one panel per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

df_train_words = df_train.loc[is_disaster, 'word_count']
ax1.hist(df_train_words, color='red')
ax1.set_title('Disaster tweets')

df_train_words = df_train.loc[is_non_disaster, 'word_count']
ax2.hist(df_train_words, color='green')
ax2.set_title('Non-disaster tweets')

fig.suptitle('Words per tweet')
plt.show()

In [None]:
## Characters per tweet: length of the raw text
df_train['char_count'] = df_train['text'].map(lambda tweet: len(str(tweet)))

disaster_rows = df_train[df_train['target'] == 1]
non_disaster_rows = df_train[df_train['target'] == 0]
print("Number of characters in Disaster tweets:", disaster_rows['char_count'].mean())
print("Number of characters in Non-Disaster tweets:", non_disaster_rows['char_count'].mean())

# Plot character count per tweet, one histogram per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

df_train_words = disaster_rows['char_count']
ax1.hist(df_train_words, color='red')
ax1.set_title('Disaster tweets')

df_train_words = non_disaster_rows['char_count']
ax2.hist(df_train_words, color='green')
ax2.set_title('Non-disaster tweets')

fig.suptitle('Character count per tweet')
plt.show()

In [None]:
## Distinct whitespace-separated tokens per tweet
df_train['unique_word_count'] = df_train['text'].map(
    lambda tweet: len(set(str(tweet).split()))
)
# Mean number of distinct tokens for each class
print("Number of unique words in Disaster tweets:",
      df_train.loc[df_train['target'] == 1, 'unique_word_count'].mean())
print("Number of unique words in Non-Disaster tweets:",
      df_train.loc[df_train['target'] == 0, 'unique_word_count'].mean())

### Text Pre-Processing

In [None]:
## Simple text cleaning: lowercase, drop HTML tags, punctuation, other special
## characters and digits, then collapse whitespace.
## NOTE: URLs are not removed as a unit (their punctuation is stripped and the
## fragments stay as words) and no typo/slang correction is performed.
text = "   This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs     .  "

# Patterns are compiled once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')                                # <br>, <a href=...>, ...
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))     # ASCII punctuation
_NON_WORD_RE = re.compile(r'[^\w\s]')                              # anything left that is not word/space
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')

def preprocess(text):
    """Normalize a raw string for the downstream token-based steps.

    Steps, in order: lowercase; remove HTML tags; replace ASCII punctuation
    with a space; delete any remaining character that is neither a word
    character nor whitespace; replace digits with a space; collapse runs of
    whitespace to a single space and strip both ends.

    Compared with the previous version the result no longer carries a
    leading/trailing space (left behind when digits at either end were
    blanked out), and the bracketed-number substitution was dropped because
    it could never match once punctuation had already been replaced.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lowercase text with single spaces between tokens.
    """
    text = text.lower()
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCT_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Collapse last, so blanks introduced by the steps above are merged too
    return _WHITESPACE_RE.sub(' ', text).strip()

text=preprocess(text)
print(text)  #text is a string

In [None]:
## LEXICON-BASED Text Processing

#1. STOP-WORD Removal
# Build the English stop-word set once. The original evaluated
# stopwords.words('english') inside the comprehension condition, i.e. once per
# token, and tested membership against a list (linear scan).
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stopword(string):
    """Remove English stop-words from a whitespace-tokenized string.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    return ' '.join(token for token in string.split() if token not in _ENGLISH_STOPWORDS)

text=stopword(text)
print(text)

#2. STEMMING
 
# Initialize the stemmer
# One shared English Snowball stemmer instance, reused by every call below
snow = SnowballStemmer('english')

def stemming(string):
    """Tokenize `string`, stem each token, and join the stems with single spaces."""
    stems = map(snow.stem, word_tokenize(string))
    return " ".join(stems)

# Demo: stem the running example string
text = stemming(text)
print(text)

#3. LEMMATIZATION
# Initialize the lemmatizer once at module level; lemmatizer() below reuses this instance
wl = WordNetLemmatizer()
 
# Helper function to map NLTK part-of-speech tags to WordNet POS constants
def get_wordnet_pos(tag):
    """Return the WordNet POS constant selected by the first letter of `tag`.

    Tags starting with 'J' map to ADJ, 'V' to VERB, 'N' to NOUN and 'R' to
    ADV; every other tag (including an empty one) falls back to NOUN.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_prefix.get(tag[:1], wordnet.NOUN)

# Tokenize the sentence, POS-tag the tokens, then lemmatize each token
def lemmatizer(string):
    """Lemmatize every token of `string` using its tagged part of speech.

    Returns the lemmas joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # (token, tag) pairs
    lemmas = [
        wl.lemmatize(token, get_wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
    ]
    return " ".join(lemmas)

# Demo: lemmatize the running example string
text = lemmatizer(text)
print(text)

In [None]:
def finalpreprocess(string):
    """Run the full cleaning pipeline: preprocess -> stopword -> lemmatizer."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

# Store the cleaned version of every training tweet in a new column
df_train['clean_text'] = df_train['text'].apply(finalpreprocess)
df_train.head()

#### Word2Vec model
Bag-of-Words (BoW) and Word Embedding (with Word2Vec) are two well-known methods for converting text data to numerical data.

In [None]:
# create Word2vec model

#convert preprocessed sentence to tokenized sentence
# Tokenize every cleaned tweet: Word2Vec is trained on lists of tokens
df_train['clean_text_tok'] = [nltk.word_tokenize(i) for i in df_train['clean_text']]

# min_count=1 keeps every word that occurs at least once in the corpus
# (words with a total frequency below min_count are ignored).
# seed + workers=1 make the embedding reproducible between runs; full
# determinism additionally requires a fixed PYTHONHASHSEED (see gensim docs).
# NOTE(review): the embedding is trained on all of df_train, including rows
# that are later held out for validation — confirm this is acceptable.
model = Word2Vec(df_train['clean_text_tok'], min_count=1, seed=42, workers=1)

# Lookup table: word -> its embedding vector
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))

#for converting sentence to vectors/numbers from word vectors result by Word2Vec
class MeanEmbeddingVectorizer(object):
    """Turn tokenized sentences into fixed-length vectors by averaging word vectors.

    Parameters
    ----------
    word2vec : dict
        Mapping from token to its embedding vector. The vector length is
        read from the first entry; all vectors are assumed to share it.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Length of one embedding vector; 0 for an empty mapping, so that
        # construction does not fail with StopIteration.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, X, y=None):
        """Do nothing and return self (there is nothing to learn).

        `y` is optional so the object can be used as `fit(X)` as well as
        `fit(X, y)`.
        """
        return self

    def transform(self, X):
        """Average the known word vectors of each tokenized sentence.

        Parameters
        ----------
        X : iterable of iterables of str
            One token sequence per sentence.

        Returns
        -------
        numpy.ndarray
            Array of shape (n_sentences, dim). A sentence with no token in
            the vocabulary (or no tokens at all) becomes a row of zeros.
        """
        zero_vector = np.zeros(self.dim)
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        if not rows:
            # Keep the result 2-D even when there are no sentences
            return np.zeros((0, self.dim))
        return np.array(rows)

### Split the data

In [None]:
## Hold out 20% of the labelled training tweets for validation.
## (X_test / y_test are this held-out split, not the competition test set.)

# random_state pins the shuffle so the split — and every metric computed from
# it — is identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df_train["clean_text"],
    df_train["target"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# The Word2Vec-based vectorizer needs token lists, so tokenize both splits
X_train_tok = list(map(nltk.word_tokenize, X_train))
X_test_tok = list(map(nltk.word_tokenize, X_test))

### Vectorization using Bag-of-Words (with Tf-Idf ) and Word2Vec

In [None]:
# --- TF-IDF -----------------------------------------------------------------
# Works on the plain (non-tokenized) cleaned strings.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Learn vocabulary and IDF weights on the training split only ...
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the held-out split (transform, no fit)
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_test)

# --- Word2Vec ---------------------------------------------------------------
# Mean of the word vectors per sentence; operates on the tokenized splits.
modelw = MeanEmbeddingVectorizer(w2v)
X_train_vectors_w2v, X_val_vectors_w2v = [
    modelw.transform(token_lists) for token_lists in (X_train_tok, X_test_tok)
]

### ML Models Development

In [None]:
## Logistic Regression on the TF-IDF features

# L2-penalised logistic regression (liblinear solver, C=1) fitted on the training vectors
lr_tfidf = LogisticRegression(penalty='l2', C=1, solver='liblinear')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

# Predicted labels and second-column probabilities for the held-out split
y_predict = lr_tfidf.predict(X_val_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:, 1]

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# ROC curve points, then the area under that curve
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_prob)
roc_auc = auc(x=fpr, y=tpr)
print('Area under the curve (AUC) :', roc_auc)

In [None]:
## Multinomial Naive Bayes on the TF-IDF features

# fit() returns the estimator itself, so construction and fitting can be chained
nb_tfidf = MultinomialNB().fit(X_train_vectors_tfidf, y_train)

# Score the held-out split: hard labels plus the probability in column 1
validation_features = X_val_vectors_tfidf
y_predict = nb_tfidf.predict(validation_features)
y_prob = nb_tfidf.predict_proba(validation_features)[:, 1]

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# Area under the ROC curve computed from the predicted probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
## Logistic Regression on the mean Word2Vec sentence vectors
lr_w2v = LogisticRegression(C=10, penalty='l2', solver='liblinear')
lr_w2v.fit(X_train_vectors_w2v, y_train)

# Evaluate on the held-out split
y_prob = lr_w2v.predict_proba(X_val_vectors_w2v)[:, 1]  # probability in column 1
y_predict = lr_w2v.predict(X_val_vectors_w2v)           # hard class labels

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# ROC curve and the area under it
roc_points = roc_curve(y_test, y_prob)
fpr, tpr, thresholds = roc_points
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

### Testing Model on Test Dataset

In [None]:
## Predict on the competition test set with the Naive Bayes (tf-idf) model

# Clean the test tweets with the same pipeline used for the training text
df_test['clean_text'] = df_test['text'].apply(finalpreprocess)
# Use a dedicated name instead of rebinding X_test: X_test is the validation
# split paired with y_test, and overwriting it would break any earlier cell
# that is re-run after this one.
X_submission = df_test['clean_text']
# Reuse the vectorizer fitted on the training split (transform only, no re-fit)
X_vector = tfidf_vectorizer.transform(X_submission)

## Use the trained model on X_vector
y_predict = nb_tfidf.predict(X_vector)
y_prob = nb_tfidf.predict_proba(X_vector)[:, 1]
df_test['predict_prob'] = y_prob
df_test['target'] = y_predict
print(df_test.head())

final = df_test[['id', 'target']].reset_index(drop=True)
# index=False: without it pandas writes the row index as an extra unnamed
# first column, so the file would not contain only `id` and `target`.
final.to_csv('NLP_submission.csv', index=False)