# Import Libraries

In [1]:
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# NLTK resources used below
nltk.download('stopwords')
nltk.download('wordnet')  # corpus required by WordNetLemmatizer.lemmatize
stop = stopwords.words('english')
# Porter stemmer
porter = PorterStemmer()
# Snowball stemmer
snowball = SnowballStemmer('english')
# Wordnet lemmatizer
lemmatizer = WordNetLemmatizer()

# NOTE: this silences every warning for the rest of the notebook
warnings.filterwarnings('ignore')

# Explore textual data

In [1]:
# Location of the tweets CSV, given relative to the notebook's working directory
tweets_data = "../input/disaster-tweets/tweets.csv"

In [1]:
# Load the tweets CSV into a DataFrame and preview its first five rows
tweets = pd.read_csv(filepath_or_buffer=tweets_data)
tweets.head(n=5)

# Clean up Data 

In [1]:
def preprocessor(text):
    """Normalise one raw tweet into lowercase, space-separated tokens.

    HTML-like tags are removed first. Emoticons such as ``:)``, ``;-(`` or
    ``=D`` are collected before punctuation is stripped and are re-appended
    (without the ``-`` nose) after the cleaned words.

    Parameters
    ----------
    text : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The lowercased words separated by single spaces, followed by the
        emoticons found in the input, with no leading or trailing whitespace.
        Applying the function to its own output returns the same string.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Collapse every run of non-word characters into one space.
    words = re.sub(r'[\W]+', ' ', text.lower()).strip()
    faces = ' '.join(emoticons).replace('-', '')
    # Join with an explicit separator so an emoticon is never glued onto the
    # last word (e.g. "world:)") when the text ends in a word character.
    return ' '.join(part for part in (words, faces) if part)

In [1]:
# Clean every tweet with `preprocessor`; this overwrites the raw `text` column
tweets['text'] = tweets['text'].map(preprocessor)

In [1]:
# Features are the cleaned tweet texts, labels are the `target` column.
X = tweets['text']
y = tweets['target']
# Hold out 33% of the tweets for the final evaluation; the fixed seed keeps
# the split reproducible. `train_test_split` is imported in the imports cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Vectorization

## Count Vectorizer

In [1]:
# Bag-of-words counts over the cleaned tweets; min_df=5 drops terms that occur
# in fewer than five tweets.
bag_of_words_vectorizer = CountVectorizer(min_df=5)
bow_vectors = bag_of_words_vectorizer.fit_transform(raw_documents=tweets['text'])

In [1]:
#bag_of_words_vectorizer.vocabulary_

In [1]:
# For bag of words
# Project the document-term counts onto two principal components and colour
# each tweet by its target label.
pca = PCA(n_components=2)
# `.toarray()` yields a plain ndarray; `.todense()` returns the legacy
# np.matrix type, which recent scikit-learn releases refuse as input.
x_pca = pca.fit_transform(bow_vectors.toarray())
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=tweets['target'],cmap='rainbow')
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('Bag-of-words vectors projected with PCA');

## TF-IDF Vectorizer

In [1]:
# TF-IDF weights over the cleaned tweets; min_df=5 drops terms that occur in
# fewer than five tweets.
TFIDF_vectorizer = TfidfVectorizer(min_df=5)
tfidf_vectors = TFIDF_vectorizer.fit_transform(raw_documents=tweets['text'])

In [1]:
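# Uncomment to list the learned vocabulary terms (get_feature_names() was removed from scikit-learn):
# TFIDF_vectorizer.get_feature_names_out()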

In [1]:
# For TFIDF vectors
# Project the TF-IDF matrix onto two principal components and colour each
# tweet by its target label.
pca_tfidf = PCA(n_components=2)
# `.toarray()` yields a plain ndarray; `.todense()` returns the legacy
# np.matrix type, which recent scikit-learn releases refuse as input.
x_pca = pca_tfidf.fit_transform(tfidf_vectors.toarray())
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=tweets['target'],cmap='rainbow')
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('TF-IDF vectors projected with PCA');

# Pipeline (TFIDF + LR)

In [1]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and run each token through ``porter.stem``."""
    return list(map(porter.stem, text.split()))

def tokenizer_snowball(text):
    """Split ``text`` on whitespace and run each token through ``snowball.stem``."""
    return list(map(snowball.stem, text.split()))

def tokenizer_wordnet_lemmatizer(text):
    """Split ``text`` on whitespace and pass each token to ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, text.split()))

# Base vectorizer for the pipeline: no accent stripping, no lowercasing and no
# custom preprocessor callable. This rebinds the name used for the
# visualisation vectorizer earlier in the notebook.
TFIDF_vectorizer = TfidfVectorizer(
    preprocessor=None,
    lowercase=False,
    strip_accents=None,
)

In [1]:
# Settings common to both search spaces: unigram+bigram features, with and
# without English stop-word removal, each tokenizer variant, and both penalty
# types at three values of C.
_shared_grid = {
    'vect__ngram_range': [(1, 2)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter, tokenizer_snowball, tokenizer_wordnet_lemmatizer],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}

param_grid = [
    # 1) the vectorizer's own IDF weighting and normalisation settings
    dict(_shared_grid),
    # 2) IDF re-weighting disabled and no vector normalisation
    {**_shared_grid, 'vect__use_idf': [False], 'vect__norm': [None]},
]

# TF-IDF features feed a logistic-regression classifier. The search grid tries
# both 'l1' and 'l2' penalties, but the default 'lbfgs' solver does not support
# 'l1', so every l1 candidate would fail to fit (and the warning about it is
# suppressed). 'liblinear' supports both penalties.
lr_tfidf = Pipeline([('vect', TFIDF_vectorizer),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

# 5-fold cross-validated grid search over `param_grid`, scored by accuracy;
# n_jobs=-1 requests all available cores and verbose=1 prints progress.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [1]:
# Run the search on the training split only: 96 parameter combinations
# (2 grids x 48 settings) x 5 folds, so this is the slow cell of the notebook.
gs_lr_tfidf.fit(X_train, y_train)

In [1]:
# Show the parameter combination the grid search selected as best
gs_lr_tfidf.best_params_

In [1]:
# Show the best mean cross-validated score (the search was run with scoring='accuracy')
gs_lr_tfidf.best_score_

In [1]:
# Score the best pipeline found by the grid search (TFIDF vectorizer +
# LogisticRegression) on the held-out test split, which the search never saw.
clf = gs_lr_tfidf.best_estimator_
clf.score(X_test, y_test)

# Test the Pipeline

In [1]:
# `tweets['text']` was already cleaned by `preprocessor` when the column was
# overwritten earlier, so the text goes to the pipeline as-is; cleaning it a
# second time could yield tokens that differ from those seen during training.
sample_text = tweets['text'][0]
print(sample_text)
print(clf.predict([sample_text]))
print('True target: ', tweets['target'][0])