## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

## Get to know the data

In [None]:
# Read the competition's training split and preview its first rows
df = pd.read_csv(
    '../input/nlp-getting-started/train.csv'
)
df.head(n=5)

In [None]:
# Inspect the distinct values of the keyword column
# (the location column is inspected in the following cell)

df.loc[:, 'keyword'].unique()

In [None]:
# Distinct values of the location column
df.loc[:, 'location'].unique()

Since the keywords can be found in the tweet itself and location isn't really important for our text classification model, we will remove these two columns along with the id.

In [None]:
# Drop the columns the model will not use. `errors='ignore'` keeps this cell
# safe to re-run: without it, a second execution raises KeyError because the
# columns have already been removed from `df`.
df = df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Separate the text feature from the target,
# then split both into train and validation sets

X = df['text']
y = df['target']

# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts; stratifying on the target keeps the
# class proportions the same in the train and validation subsets.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

X_train.head()

## The NLP Pipeline

In [None]:
# Build the shared helpers that the tweet-cleaning function relies on

ps = PorterStemmer()  # reduces a word to its stem
en_stop = set(stopwords.words('english'))  # English stopwords, stored as a set
tokenizer = RegexpTokenizer(r"\w+")  # tokens are runs of word characters (letters, digits, underscore)

In [None]:
def getStemmedTweet(tweet):
    """
        Clean a single tweet string for the classifier.

        The tweet is lower-cased and tokenized with the module-level `tokenizer`;
        tokens found in `en_stop` are dropped, each remaining token is stemmed with
        `ps`, and the stems are joined back into one space-separated string.
    """
    words = tokenizer.tokenize(tweet.lower())

    # Filter out stopwords and stem the surviving tokens in a single pass
    stems = (ps.stem(word) for word in words if word not in en_stop)

    return ' '.join(stems)

In [None]:
# Let's check out the results of the function on one training tweet.
# X_train keeps the original row labels in shuffled order, so a label lookup
# such as X_train[34] raises KeyError whenever row 34 ended up in the
# validation split; .iloc selects by position and is always valid here.
rand_num = 34
sample_tweet = X_train.iloc[rand_num]
print("Review ===> ", sample_tweet)
print("Preprocessed Review ===>", getStemmedTweet(sample_tweet))

As we can see, the preprocessed tweet is much shorter and conveys the same meaning as the original tweet.

In [None]:
# Run the cleaning function over every tweet of the train and validation splits
X_cleaned, Xval_cleaned = (
    split.apply(getStemmedTweet) for split in (X_train, X_val)
)

## Define the classifier

In [None]:
## First of all though, we'll need to convert our data into count vectors to be able
## to work with the Multinomial Naive Bayes model

cv = CountVectorizer()

# Keep the matrices sparse: CountVectorizer already returns a SciPy sparse
# matrix and MultinomialNB accepts it directly, so calling .toarray() would
# only allocate a huge, mostly-zero dense array.
# The vocabulary is learned from the training tweets only and reused as-is
# for the validation tweets.
X_vec = cv.fit_transform(X_cleaned)
Xval_vec = cv.transform(Xval_cleaned)

print(X_vec.shape)
print(Xval_vec.shape)

In [None]:
# Fit a Multinomial Naive Bayes model on the training count vectors
# (fit() returns the estimator itself, so it can be chained onto the constructor)

mnb = MultinomialNB().fit(X_vec, y_train)
mnb

In [None]:
## Score the fitted classifier on the held-out validation set

mnb.score(X=Xval_vec, y=y_val)

## PREDICTION TIME!

In [None]:
# Load the test dataset; the raw frame gets its own name so that `test`
# is no longer rebound to three different kinds of object in one cell
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

# Keep the ids for the submission file
test_ids = test_df['id']

# Apply the preprocessing pipeline to the main 'text' column
test = test_df['text'].apply(getStemmedTweet)

# Create the count vectors with the vocabulary fitted on the training tweets.
# The result stays sparse: MultinomialNB predicts on sparse input directly,
# so a dense .toarray() copy is unnecessary.
test_vec = cv.transform(test)

predictions = mnb.predict(test_vec)

In [None]:
# Wrap the ids and the predicted labels as Series
ids = pd.Series(test_ids)
predictions = pd.Series(predictions)

# Place them side by side; the mapping keys become the column names
pred_df = pd.concat({'id': ids, 'target': predictions}, axis=1)

# Write the submission file without the index column
pred_df.to_csv('submission.csv', index=False)