In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Part of the code is taken from https://www.kaggle.com/mohitsital/0-80777-simplest-model-naive-bayes/data.

In [None]:
# Read the competition's train and test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Remove the row limit so DataFrames are displayed in full.
pd.options.display.max_rows = None

Here is a function for transforming the tweet to a more accessible format.

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    """Normalise a raw tweet into a space-joined string of stemmed tokens.

    The tweet is stripped of a leading retweet marker, hyperlinks and '#'
    signs, tokenised with ``TweetTokenizer`` (lower-cased, @handles removed,
    elongated words shortened), filtered of English stop words and
    punctuation, and finally Porter-stemmed.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    # Drop a leading old-style retweet marker ("RT ").
    text = re.sub(r'^RT[\s]+', '', tweet)
    # Remove only the link itself. The previous pattern was a non-raw string
    # with an invalid '\/' escape, and its greedy '.*' also deleted every
    # word that followed the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    # Keep the hashtag word, drop only the '#' sign.
    text = re.sub(r'#', '', text)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # Load the stop-word list once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    return " ".join(
        stemmer.stem(token)
        for token in tokenizer.tokenize(text)
        if token not in stop_words and token not in string.punctuation
    )

In [None]:
# Quick check of process_tweet on a short string containing a URL-encoded space ('%20').
print(process_tweet('forest%20fire'))

The following functions clean the keyword and location fields. I surveyed the values in these fields to decide which normalisations were needed (e.g. mapping 'United States' to 'US').

In [None]:
def process_keyword(keyword):
    """Clean a keyword field into a space-joined string of stemmed words.

    The keyword is lower-cased and split on the URL-encoded space ('%20');
    English stop words and punctuation are dropped and the remaining words
    are Porter-stemmed.

    Args:
        keyword: The raw keyword string.

    Returns:
        The stemmed words joined by single spaces (may be an empty string).
    """
    # Load the stop-word list once per call instead of once per word.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return " ".join(
        stemmer.stem(word)
        for word in keyword.lower().split('%20')
        if word not in stop_words and word not in string.punctuation
    )

In [None]:
# Quick check of process_keyword on a single-word keyword.
print(process_keyword('evacuation'))

In [None]:
def process_location(location):
    """Normalise a free-text location into lower-case, space-separated tokens.

    Well-known place names are first collapsed to a short form (for example
    'United States' -> 'US'), commas are removed, and the remaining
    whitespace-separated tokens are lower-cased.

    Args:
        location: The raw location string.

    Returns:
        The normalised tokens joined by single spaces (may be an empty string).
    """
    # Long-form name -> short-hand used as a single token.
    abbreviations = {'United States': 'US', 'New York': 'NYC', 'Los Angeles': 'LA', 'D.C.': 'DC',
                     'United Kingdom': 'UK', 'USA': 'US', 'Planet': '', 'California': 'CA',
                     'New York City': 'NYC', 'Texas': 'TX', 'San Diego': 'SanDiego', 'South Africa': 'SouthAfrica',
                     'Tennessee': 'TN', 'New Jersey': 'NJ'}
    # Apply the longest names first: otherwise 'New York' is rewritten before
    # 'New York City' gets a chance to match, leaving a stray "city" token.
    for name in sorted(abbreviations, key=len, reverse=True):
        location = location.replace(name, abbreviations[name])
    return " ".join(token.lower() for token in location.replace(',', '').split())

In [None]:
# Quick check of process_location on a "city, state" style value.
print(process_location('Los Angeles, CA'))

We fill the missing values with the placeholder string 'nan' and treat that placeholder as an ordinary keyword or location, since quite a lot of rows have missing values.

In [None]:
# Replace every missing value in both tables with the placeholder string 'nan'.
train, test = train.fillna('nan'), test.fillna('nan')

In [None]:
# Clean the three text columns of the training table column-wise.
# Series.map applies the cleaner to every value without per-cell .loc writes,
# which is much faster and does not assume the index is exactly 0..n-1.
train['keyword'] = train['keyword'].map(process_keyword)
train['location'] = train['location'].map(process_location)
train['text'] = train['text'].map(process_tweet)

In [None]:
# Clean the three text columns of the test table column-wise.
# Series.map applies the cleaner to every value without per-cell .loc writes,
# which is much faster and does not assume the index is exactly 0..n-1.
test['keyword'] = test['keyword'].map(process_keyword)
test['location'] = test['location'].map(process_location)
test['text'] = test['text'].map(process_tweet)

We first try TF-IDF to vectorise the data.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
tf_idf_vect = TfidfVectorizer(ngram_range=(1,3)) # one,two and three gram vectorization
# A fixed seed makes the hold-out split (and every score computed on it)
# reproducible across kernel restarts.
df_train, df_val = train_test_split(train, random_state=42)
# Fit the vocabulary on the training part only, then reuse it for validation.
freqs_train = tf_idf_vect.fit_transform(df_train['text'].values)
freqs_val = tf_idf_vect.transform(df_val['text'].values)

In [None]:
# Keyword features: fit on the training part, reuse the vocabulary for validation.
tf_idf_key = TfidfVectorizer()
key_train = tf_idf_key.fit_transform(df_train['keyword'].to_numpy())
key_val = tf_idf_key.transform(df_val['keyword'].to_numpy())

# Location features: same procedure with an independent vectoriser.
tf_idf_loc = TfidfVectorizer()
loc_train = tf_idf_loc.fit_transform(df_train['location'].to_numpy())
loc_val = tf_idf_loc.transform(df_val['location'].to_numpy())

In [None]:
# Show the sparse TF-IDF matrix built from the training tweets.
print(freqs_train)

Now we build a naive Bayes classifier on each indicator: text, keyword and location.

In [None]:
from sklearn.naive_bayes import MultinomialNB
# One multinomial naive Bayes model per feature matrix, all trained on the same labels.
y_train = df_train['target'].to_numpy()
nb_text, nb_key, nb_loc = (
    MultinomialNB().fit(features, y_train)
    for features in (freqs_train, key_train, loc_train)
)

In [None]:
from sklearn.metrics import classification_report
def printreport(exp, pred):
    """Print a confusion table followed by scikit-learn's classification report.

    Args:
        exp: The expected (true) labels.
        pred: The predicted labels.
    """
    confusion = pd.crosstab(exp, pred, rownames=['Actual'], colnames=['Predicted'])
    report = classification_report(exp, pred)
    print(confusion)
    print('\n \n')
    print(report)

Essentially, we compute the class log-probabilities from each classifier and combine them as a weighted sum. A grid search over the weights finds the best mix of the three naive Bayes classifiers.

In [None]:
from sklearn.metrics import f1_score
y_val = df_val['target'].values
text_weighting_arr = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
key_weighting_arr = [0, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5, .55, .6, .65, .7, .75, .8]

# The per-model validation log-probabilities do not depend on the weights,
# so compute them once instead of once per grid cell.
text_log_proba = nb_text.predict_log_proba(freqs_val)
key_log_proba = nb_key.predict_log_proba(key_val)
loc_log_proba = nb_loc.predict_log_proba(loc_val)

# result[i, j] holds the validation F1 for text weight i and keyword weight j;
# the location model gets the remaining weight. Cells whose text and keyword
# weights sum to more than 1 are skipped and stay at 0.
result = np.zeros((len(text_weighting_arr), len(key_weighting_arr)))
for i, text_weight in enumerate(text_weighting_arr):
    for j, key_weight in enumerate(key_weighting_arr):
        if (text_weight + key_weight <= 1):
            predicted_proba = text_weight * text_log_proba + key_weight * key_log_proba + (1-text_weight-key_weight) * loc_log_proba
            predicted = np.argmax(predicted_proba, axis = 1)
            result[i,j] = f1_score(y_val, predicted)

# Report the best cell of the grid so the chosen weights can be read off directly.
best_i, best_j = np.unravel_index(np.argmax(result), result.shape)
print(f'best F1 = {result[best_i, best_j]:.4f} '
      f'(text weight {text_weighting_arr[best_i]}, keyword weight {key_weighting_arr[best_j]})')

In [None]:
# Weighted sum of the three models' validation log-probabilities
# (text .6, keyword .25, location .15); the larger column is the predicted class.
text_part = .6 * nb_text.predict_log_proba(freqs_val)
key_part = .25 * nb_key.predict_log_proba(key_val)
loc_part = .15 * nb_loc.predict_log_proba(loc_val)
predicted_proba = text_part + key_part + loc_part
predicted = predicted_proba.argmax(axis=1)
y_val = df_val['target']
# Uncomment to inspect the ensemble on the validation split:
#printreport(y_val, predicted)
#print(f1_score(y_val, predicted))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw n-gram counts (1- to 3-grams) as an alternative to TF-IDF weighting.
count_vect = CountVectorizer(ngram_range=(1,3))
# A fixed seed makes the hold-out split (and the report computed on it)
# reproducible across kernel restarts.
df_train_2, df_val_2 = train_test_split(train, random_state=42)
# Fit the vocabulary on the training part only, then reuse it for validation.
freqs_train_2 = count_vect.fit_transform(df_train_2['text'].values)
freqs_val_2 = count_vect.transform(df_val_2['text'].values)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes model on the count features.
y_train_2 = df_train_2['target'].to_numpy()
nb_2 = MultinomialNB()
nb_2.fit(freqs_train_2, y_train_2)

In [None]:
# Score the count-based model on its hold-out split.
y_val_2 = df_val_2['target'].to_numpy()
predicted_2 = nb_2.predict(freqs_val_2)
printreport(y_val_2, predicted_2)

Now we train the Naive Bayes model on the whole training set.

In [None]:
# Tweet text: 1- to 3-gram TF-IDF fitted on the full training set, then applied to test.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,3))
freqs_train = tf_idf_vect.fit_transform(train['text'].to_numpy())
freqs_test = tf_idf_vect.transform(test['text'].to_numpy())

# Keyword: default (unigram) TF-IDF.
tf_idf_key = TfidfVectorizer()
key_train = tf_idf_key.fit_transform(train['keyword'].to_numpy())
key_test = tf_idf_key.transform(test['keyword'].to_numpy())

# Location: default (unigram) TF-IDF.
tf_idf_loc = TfidfVectorizer()
loc_train = tf_idf_loc.fit_transform(train['location'].to_numpy())
loc_test = tf_idf_loc.transform(test['location'].to_numpy())

In [None]:
# Refit one naive Bayes model per feature matrix, now on every labelled tweet.
y_train = train['target'].to_numpy()
nb_text, nb_key, nb_loc = (
    MultinomialNB().fit(features, y_train)
    for features in (freqs_train, key_train, loc_train)
)

In [None]:
# Weighted three-model ensemble (text .6, keyword .25, location .15), left disabled:
#predicted_proba = .6 * nb_text.predict_log_proba(freqs_test) + .25 * nb_key.predict_log_proba(key_test) + .15 * nb_loc.predict_log_proba(loc_test)
#predicted = np.argmax(predicted_proba, axis = 1)
# The submission is produced by the text-only classifier.
predicted = nb_text.predict(freqs_test)

In [None]:
# Assemble the submission table: one row per test tweet with its predicted label.
submission_df = pd.DataFrame({'id': test['id'], 'target': predicted})
print(submission_df.head())

In [None]:
# Write the submission to the working directory; index=False keeps the DataFrame index out of the file.
submission_df.to_csv('submission.csv',index=False)