In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Basic notebook showing the steps to use the Real or not? NLP with disaster tweets from Kaggle

In this notebook we will cover the introductory steps required to complete the competition and submit your results. 


Steps to be covered: 

* Import libraries
* Import the data set and do some basic EDA
    * Visualise keywords
* Implement text preprocessing
    * StopWords
    * Stemming
    * Lemmatization
    * Bag of words
* Create a Bag of words model
* Train a Naive Bayes Model 
* Submit results

### Import the libraries required:

You may need to install some of them, such as `nltk` and `regex`, which you can do using the following commands:
For NLTK:  `conda install -c anaconda nltk` 
For Regex: `conda install -c conda-forge regex`

Also make sure you have the `scikit-learn`, `Matplotlib`, `pandas`, `NumPy` & `seaborn` libraries installed.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB

### Import the dataset

Using `pd.read_csv()` we will import the training dataset that was downloaded and look at the first five rows using the `.head()` function.

In [None]:
# Load the competition training set and preview its first five rows.
train_csv_path = '../input/nlp-getting-started/train.csv'
trainDf = pd.read_csv(train_csv_path)
trainDf.head()

In [None]:
# Here we use the .info() function to show some basic information about the dataset, such as the column data types, non-null value counts and the number of columns.

trainDf.info()

In [None]:
# Count how many rows carry each distinct value of the `target` column.
trainDf['target'].value_counts()

In [None]:
# Bar chart of the number of tweets in each target class.
# The column is named through the `x` keyword: in current seaborn releases the
# first positional argument of countplot is `data`, not the vector to count,
# so passing the Series positionally no longer does what was intended.
sns.countplot(x='target', data=trainDf)

In [None]:
# Share of each target class, expressed as a percentage of all rows.
target_counts = trainDf['target'].value_counts()
target_percentages = (target_counts / len(trainDf)) * 100
print('The percentages of each target is:\n', target_percentages)

In [None]:
# Number of missing values in each column of the training frame.
trainDf.isnull().sum()

So above we can see some basic information regarding the dataset, starting with an overview of the data and then moving on to the `value_counts` for each of the target values, which we displayed on a plot using `seaborn`. We then calculated the percentage of rows for each target value and finally checked for any missing values. Moving forward we shall begin by filling the missing values...

### Missing Data

Here we are going to fill the missing data in the `keyword` & `location` columns. 

For `keyword` we shall fill with: no_keyword

For `location` we shall fill with: no_location

Both operations will use the `.fillna()` function.

In [None]:
# Replace missing keyword/location entries with explicit placeholder strings.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a selected column: that chained in-place form
# is deprecated, and under pandas Copy-on-Write it modifies a temporary object
# and leaves `trainDf` unchanged.
trainDf['keyword'] = trainDf['keyword'].fillna('no_keyword')
trainDf['location'] = trainDf['location'].fillna('no_location')

The filled values are written back to `trainDf`, so the change persists for the rest of the notebook.

Check the missing values again:

***NOTE: Above we used `.isnull()` whereas below we use `.isna()`; both perform the same operation.***

In [None]:
# Re-count the missing values per column after filling.
trainDf.isna().sum()

### Plotting the keywords & Locations

Here we are going to look at the different keywords in a visual format using a plot.

In [None]:
# Bar chart with one bar for every distinct keyword value.
trainDf['keyword'].value_counts().plot(kind='bar')

As you can see above, trying to plot all the keywords on one plot does not work, so we will use a selection instead.

In [None]:
# Bar chart of the ten most frequent keywords.
keyword_counts = trainDf['keyword'].value_counts()
keyword_counts.head(10).plot.bar()

Much better: we can now see the top 10 keywords from the data.

#### Location plot

Now we shall use the same method to plot the locations

In [None]:
# Bar chart of the ten most frequent location values (placeholder included).
location_counts = trainDf['location'].value_counts()
location_counts.iloc[:10].plot.bar()

Not the easiest to visualise. Since we know `no_location` is going to be the most frequent value, we won't plot it. 

In [None]:
# Bar chart of the ten most frequent real locations.
# The placeholder is removed by label rather than by skipping the first row,
# so the plot stays correct even if `no_location` is not the most frequent
# value or is absent (`errors='ignore'` makes the drop a no-op in that case).
real_location_counts = trainDf['location'].value_counts().drop('no_location', errors='ignore')
real_location_counts.head(10).plot(kind='bar')

That's a lot better. We can now see the top 10 locations, excluding `no_location`.

### Text preprocessing

This is quite a large topic on its own, so first let's define some key terms:

* Stopwords: 
    > A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

* Stemming: 
    > Stemming is the process of reducing a word to its word stem (the base form to which suffixes and prefixes attach) or to the root of the word, known as a lemma. A stemming algorithm reduces the words “chocolates”, “chocolatey”, “choco” to the root word, “chocolate” and “retrieval”, “retrieved”, “retrieves” reduce to the stem “retrieve”.

* Lemmatization
    > Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word. Difference between stemming and lemmatization is that lemmatization gives proper meaningful dictionary words.

* Bag of words
    > The bag-of-words model is a simplifying representation used in natural language processing and information retrieval (IR). In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity.

### Importing stop words from NLTK

In [None]:
# Fetch the NLTK stopword corpus, then show the first ten English stopwords.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
print(english_stopwords[:10])

### Tf-IDF(Term Frequency and Inverse Document Frequency)


TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents.

* Term Frequency (TF): is a scoring of the frequency of the word in the current document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. The term frequency is often divided by the document length to normalize.

     `TF = number of times term t appears in the document / total number of terms in the document`
* Inverse Document Frequency (IDF): is a scoring of how rare the word is across documents. The rarer the term, the higher its IDF score.

     `IDF=log(total no of documents/no of documents with term t in it)`

Note that the model built below uses plain word counts (`CountVectorizer`, i.e. a bag of words) rather than TF-IDF weights. What we are going to do next is:
* Create an empty list called `sents` to store the sentences
* loop through the dataset and:
    * replace all non-alphabeticals with spaces
    * convert all words to lower case
    * initiate the `PorterStemmer`
    * select all the words apart from the stopwords 
    * applying stemming using the `PorterStemmer.stem` function
* join all the tweets and append to sents 

In [None]:
# Clean every training tweet and collect the results in `sents`.
#
# The stemmer, the stopword list/set and the compiled pattern do not depend on
# the tweet being processed, so they are built once here instead of once per
# tweet (and, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)  # set membership tests are O(1)
non_letters = re.compile('[^a-zA-Z]')

sents = []
# Iterating over the column directly avoids label-based `trainDf['text'][i]`
# lookups, which only work while the index is the default 0..n-1 range.
for text in trainDf['text']:
    # Replace every non-alphabetic character with a space, lower-case, tokenize.
    words = non_letters.sub(' ', text).lower().split()
    # Drop stopwords, stem what remains, and re-join into a single string.
    tweets = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    sents.append(tweets)

In [None]:
# Preview the first five preprocessed training tweets.
sents[:5]

In [None]:
# Bag-of-words encoding of the training tweets, limited to 1000 features.
cv = CountVectorizer(max_features=1000)
train_counts_sparse = cv.fit_transform(sents)
X = train_counts_sparse.toarray()  # dense array of per-tweet word counts
X[:10]

In [None]:
# Labels are selected by column name rather than by position (`iloc[:, -1]`),
# so adding or reordering columns in `trainDf` cannot silently change what the
# model is trained to predict.
y = trainDf['target'].values

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)

In [None]:
# Score the held-out split: the confusion matrix is printed and the accuracy
# is shown as the cell's output.
prediction = classifier.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=prediction)
print(cm)
accuracy_score(y_true=Y_test, y_pred=prediction)

In [None]:
# Load the competition test set and preview its first five rows.
test_csv_path = '../input/nlp-getting-started/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Replace missing keyword/location entries in the test frame with placeholders.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a selected column: that chained in-place form
# is deprecated, and under pandas Copy-on-Write it modifies a temporary object
# and leaves `test` unchanged.
test['keyword'] = test['keyword'].fillna('no_keyword')
test['location'] = test['location'].fillna('no_location')

In [None]:
# Number of missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Clean every test tweet and collect the results in `sentsT`, using the same
# steps as for the training tweets: strip non-letters, lower-case, tokenize,
# drop English stopwords, apply Porter stemming, and re-join.
#
# The stemmer, the stopword list/set and the compiled pattern are independent
# of the tweet being processed, so they are built once rather than once per
# tweet (and, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
stopword_lookup = set(all_stopwords)  # set membership tests are O(1)
letters_only = re.compile('[^a-zA-Z]')

sentsT = []
# Iterating over the column directly avoids label-based `test['text'][i]`
# lookups, which only work while the index is the default 0..n-1 range.
for raw_text in test['text']:
    tokens = letters_only.sub(' ', raw_text).lower().split()
    tweets = ' '.join(ps.stem(token) for token in tokens if token not in stopword_lookup)
    sentsT.append(tweets)

In [None]:
# Preview the first five preprocessed test tweets.
sentsT[:5]

In [None]:
# Encode the test tweets with the vectorizer already fitted on the training
# tweets (`cv`). Fitting a second CountVectorizer on the test text would learn
# a different vocabulary and column order, so feature j of the test matrix
# would not refer to the same word as feature j the classifier was trained on,
# making its predictions meaningless.
cv1 = cv  # alias kept so the name `cv1` remains available
x_test = cv1.transform(sentsT).toarray()
x_test[:10]

In [None]:
# Inspect the last column of the test frame.
# The values are stored under their own name instead of `Y_test`, so the
# validation labels produced by the train/validation split are not overwritten
# (re-running the evaluation cell would otherwise compare predictions against
# this column).
# NOTE(review): the competition test file presumably has no `target` column,
# in which case this last column is the tweet text, not labels — confirm.
test_last_column = test.iloc[:, -1].values
print(test_last_column)

In [None]:
# Predict a class for every row of the encoded test matrix and print the result.
predictions=classifier.predict(x_test)
print(predictions)

In [None]:
# Load the submission template, set its `target` column to the model's
# predictions, and preview the first five rows.
submission_template_path = '../input/nlp-getting-started/sample_submission.csv'
sample_submission = pd.read_csv(submission_template_path).assign(target=predictions)
sample_submission.head()

In [None]:
# Write the predictions to the working directory without the index column.
# The file is named `submission.csv`, the name conventionally expected for
# Kaggle notebook submissions (NOTE(review): confirm against the competition rules).
sample_submission.to_csv('submission.csv', index=False)