In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading the dataset 

In [None]:
# Load the competition training set into a DataFrame and preview its first rows.
TRAIN_CSV = '/kaggle/input/nlp-getting-started/train.csv'
train = pd.read_csv(TRAIN_CSV)
train.head()

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
train.shape

# Descriptive statistics using info() and describe()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
train.info()

`info()` lists the data type and the non-null count of each feature, from which the number of missing values can be read off.

# Target feature 

In [None]:
# Count how many rows carry each distinct value of the target column.
train['target'].value_counts()

In [None]:
import seaborn as sns
# Pass the column through the `x` keyword: seaborn 0.12+ accepts only `data`
# as a positional argument, so a bare positional Series is no longer treated
# as the variable to count along the x axis.
sns.countplot(x=train['target'])

In [None]:
# Share of each target value, expressed as a percentage of all rows.
target_counts = train['target'].value_counts()
n_rows = train.shape[0]
(target_counts / n_rows) * 100

As shown above, we looked at the count of each target value (0 and 1), visualised it with a seaborn countplot, and finally checked each value's share as a percentage.

# Missing Values 

In [None]:
# Number of missing entries in each column.
null_mask = train.isnull()
null_mask.sum()

As you can see, there are missing values in the keyword and location features. We now fill those missing values with 'no_keyword' and 'no_location'.

In [None]:
# Replace missing keyword/location entries with explicit placeholder strings.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: the in-place form operates
# on an intermediate object, emits a chained-assignment warning in pandas 2.x
# and leaves `train` unchanged when Copy-on-Write is enabled.
train['keyword'] = train['keyword'].fillna('no_keyword')
train['location'] = train['location'].fillna('no_location')

In [None]:
# Total number of missing entries across the whole frame.
missing_per_column = train.isnull().sum()
missing_per_column.sum()

No missing values left.

# Visualising keyword and location features.

In [None]:
# The ten most frequent location values and their counts.
location_counts = train['location'].value_counts()
location_counts.head(10)

In [None]:
# Bar chart of the twenty most frequent location values.
top_locations = train['location'].value_counts().head(20)
top_locations.plot(kind='bar')

In [None]:
# The ten most frequent keyword values and their counts.
keyword_counts = train['keyword'].value_counts()
keyword_counts.head(10)

In [None]:
# Bar chart of the twenty most frequent keyword values.
top_keywords = train['keyword'].value_counts().head(20)
top_keywords.plot(kind='bar')

# Text Preprocessing

# Important terms and definitions

# Stop words:
A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. We can check the list of English stop words as shown below.

In [None]:
# Fetch NLTK's stop-word corpus and print its English stop-word list.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
print(english_stopwords)

# Stemming:
Stemming is the process of reducing a word to its word stem by stripping affixes (suffixes and prefixes). The resulting stem is not always a valid dictionary word. For example, a stemming algorithm reduces the words “chocolates”, “chocolatey”, “choco” to the root word “chocolate”, and “retrieval”, “retrieved”, “retrieves” reduce to the stem “retrieve”.

# lemmatization:
Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming, but it brings context to the words, so it maps the related forms of a word to one base form (the lemma).
The difference between stemming and lemmatization is that lemmatization gives proper, meaningful dictionary words.

# Bag of words:
                            The bag-of-words model is a simplifying representation used in natural language processing and information retrieval (IR). In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity.
  
 Let’s take an example to understand this concept in depth.

“It was the best of times”
“It was the worst of times”
“It was the age of wisdom”
“It was the age of foolishness”

We treat each sentence as a separate document and we make a list of all words from all the four documents excluding the punctuation. We get,

‘It’, ‘was’, ‘the’, ‘best’, ‘of’, ‘times’, ‘worst’, ‘age’, ‘wisdom’, ‘foolishness’

The next step is to create vectors. Vectors convert text into numbers that can be used by the machine learning algorithm.

We take the first document — “It was the best of times” and we check the frequency of words from the 10 unique words.
“it” = 1
“was” = 1
“the” = 1
“best” = 1
“of” = 1
“times” = 1
“worst” = 0
“age” = 0
“wisdom” = 0
“foolishness” = 0

The vectors for all four documents will be:
“It was the best of times” = [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
“It was the worst of times” = [1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
“It was the age of wisdom” = [1, 1, 1, 0, 1, 0, 0, 1, 1, 0]
“It was the age of foolishness” = [1, 1, 1, 0, 1, 0, 0, 1, 0, 1]

# Tf-IDF(Term Frequency and Inverse Document Frequency):
                 TF-IDF weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus.
1.     Term Frequency (TF): is a scoring of the frequency of the word in the current document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. The term frequency is often divided by the document length to normalize.
                
                TF = (number of times term t appears in the document) / (total number of terms in the document)

2. Inverse Document Frequency (IDF): is a scoring of how rare the word is across documents. The rarer the term, the higher its IDF score.
                    
                IDF=log(total no of documents/no of documents with term t in it)
                
                
                Final score=TF*IDF.
                  

# Implementation:
Here I use stemming and the bag-of-words model. Importing the necessary libraries for text preprocessing:
1. re - used for regular expressions.
2. nltk - Natural Language Toolkit, one of the best libraries for NLP.
3. stopwords.
4. PorterStemmer (for stemming).

In [None]:
import re
from nltk.stem.porter import PorterStemmer

In [None]:
# Build the cleaned corpus: one stemmed, stop-word-free string per tweet.
#
# The stemmer and the stop-word lookup do not change between tweets, so they
# are created once up front instead of once per row (and, for the set, once
# per word) as a loop-invariant hoist. The produced strings are unchanged.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)  # set gives constant-time membership tests

corpus = []
# Iterate over the values directly so the loop does not rely on the frame
# having a default 0..n-1 index.
for text in train['text']:
    # Keep letters only, then lowercase and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words and reduce every remaining word to its stem.
    stems = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stems))

# Code Explanation:
1. First, create a list called corpus to store all the cleaned sentences.
2. Loop through all the sentences in the text feature and perform the following steps.
3. Replace non-alphabetic characters with a space.
4. Convert everything into lowercase.
5. Select all the words apart from stop words and apply stemming.
6. Finally, join the stemmed words of each review back into one string and append it to corpus.

The corpus is as follows:

In [None]:
# Preview the first five cleaned entries of the corpus.
corpus[:5]

# Creating bag of words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: keep at most 1000 vocabulary terms and count their
# occurrences in every corpus entry.
# NOTE(review): the vocabulary is learned from the whole corpus, i.e. before
# the train/test split performed further down.
cv = CountVectorizer(max_features=1000)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()
X[:10]

# Splitting data into training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Select the labels by column name rather than by position (`iloc[:, -1]`), so
# the split keeps working if columns are added or reordered; this matches how
# the rest of the notebook refers to the target.
y = train['target'].values

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=0)

#  Naive Bayes model 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training word counts.
classifier = MultinomialNB()
classifier.fit(X=X_train, y=Y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out rows, print the confusion matrix and display the accuracy.
prediction = classifier.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=prediction)
print(cm)
accuracy_score(y_true=Y_test, y_pred=prediction)

# Please upvote if you like it. Any suggestions or corrections are welcome in the comments. Thank you.