In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud as wc
from nltk.corpus import stopwords
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from pandas import get_dummies
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import warnings
import sklearn
import string
import scipy
import numpy
import nltk
import json
import sys
import csv
import os

# Report the versions of the core libraries so the environment can be reproduced.
version_lines = (
    ('matplotlib: {}', matplotlib.__version__),
    ('sklearn: {}', sklearn.__version__),
    ('scipy: {}', scipy.__version__),
    ('seaborn: {}', sns.__version__),
    ('pandas: {}', pd.__version__),
    ('numpy: {}', np.__version__),
    ('Python: {}', sys.version),
)
for template, version in version_lines:
    print(template.format(version))

In [None]:
# Global plotting/display configuration for the notebook.
# Seaborn defaults: white style, notebook context, deep palette.
sns.set(style='white', context='notebook', palette='deep')
# Default figure size stored in matplotlib's rcParams.
pylab.rcParams['figure.figsize'] = 12,8
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Switch matplotlib to the 'ggplot' style sheet ...
mpl.style.use('ggplot')
# ... then re-apply seaborn's 'white' style on top of it.
sns.set_style('white')
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

<a id="55"></a> <br>
## 5-5 NLTK
In this kernel we use the NLTK library, so before we begin the next step we will first introduce it.
The Natural Language Toolkit (NLTK) is one of the leading platforms for working with human language data and Python, the module NLTK is used for natural language processing. NLTK is literally an acronym for Natural Language Toolkit. With it you can tokenize words and sentences.
NLTK is a library of Python that can mine (scrap and upload data) and analyse very large amounts of textual data using computational methods.
<img src='https://arts.unimelb.edu.au/__data/assets/image/0005/2735348/nltk.jpg' width=300 height=300>

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Break a single sentence into word-level tokens and show them.
data = "All work and no play makes jack a dull boy, all work and no play"
word_tokens = word_tokenize(data)
print(word_tokens)

<a id="551"></a> <br>
Every token in the output above is a word except the comma: special characters are treated as separate tokens.

## 5-5-1 Tokenizing sentences
The same principle can be applied to sentences: simply change the function call to `sent_tokenize()`.
We have added two sentences to the variable data:

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Split a two-sentence text into its sentences and show them.
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
sentence_list = sent_tokenize(data)
print(sentence_list)

<a id="552"></a> <br>
## 5-5-2 NLTK and arrays
If you wish to you can store the words and sentences in arrays

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."

# Keep both tokenizations (sentences and words) in lists for later use.
phrases, words = sent_tokenize(data), word_tokenize(data)

# Show the sentences first, then the words.
for token_list in (phrases, words):
    print(token_list)

<a id="553"></a> <br>
## 5-5-3 NLTK stop words
Stop words are basically a set of commonly used words in any language, not just English. The reason why stop words are critical to many applications is that, if we remove the words that are very commonly used in a given language, we can focus on the important words instead.[12]

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)

# Keep only the tokens that are not stop words. The comparison is done on the
# lowercased token because the stop-word list is lowercase; a case-sensitive
# test would let capitalised stop words such as "All" slip through.
wordsFiltered = [w for w in words if w.lower() not in stopWords]

print(wordsFiltered)

A module has been imported:



In [None]:
from nltk.corpus import stopwords

We get a set of English stop words using the line:



In [None]:
stopWords = set(stopwords.words('english'))

The returned set `stopWords` contained 153 stop words on my computer (the exact number depends on the NLTK version).
You can view the length or contents of this set with the lines:

In [None]:
print(len(stopWords))
print(stopWords)

We create a new list called `wordsFiltered` which contains all words that are not stop words.
To create it we iterate over the list of words and keep a word only if it is not in the `stopWords` set.

In [None]:
# Rebuild the filtered list from scratch rather than appending to the list an
# earlier cell already filled, so re-running this cell never duplicates entries.
# Tokens are lowercased for the membership test because the stop-word list is
# lowercase (otherwise capitalised stop words such as "All" are kept).
wordsFiltered = [w for w in words if w.lower() not in stopWords]

<a id="554"></a> <br>
## 5-5-4 NLTK – stemming
Start by defining some words:

In [None]:
words = ["game","gaming","gamed","games"]

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

And stem the words in the list using:

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

words = ["game","gaming","gamed","games"]
ps = PorterStemmer()

# Reduce every word to its stem and print one stem per line.
for stem in map(ps.stem, words):
    print(stem)

<a id="555"></a> <br>
## 5-5-5  NLTK speech tagging
The module NLTK can automatically tag speech.
Given a sentence or paragraph, it can label words such as verbs, nouns and so on.

NLTK – speech tagging example
The example below automatically tags words with a corresponding class.

In [None]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

document = 'Whether you\'re new to programming or an experienced developer, it\'s easy to learn and use Python.'
sentences = nltk.sent_tokenize(document)

# Tokenize each sentence, then print its (token, part-of-speech tag) pairs.
for sentence in sentences:
    sentence_tokens = nltk.word_tokenize(sentence)
    print(nltk.pos_tag(sentence_tokens))

We can filter this data based on the type of word:

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

document = 'Today the Netherlands celebrates King\'s Day. To honor this tradition, the Dutch embassy in San Francisco invited me to'
sentences = nltk.sent_tokenize(document)

# Flatten the per-sentence (token, tag) pairs into a single list.
data = []
for sent in sentences:
    data.extend(nltk.pos_tag(nltk.word_tokenize(sent)))

# Print only the pairs whose tag contains 'NNP'.
for word in filter(lambda pair: 'NNP' in pair[1], data):
    print(word)

In [None]:
# Re-applies the plotting/display configuration (same statements as the setup cell near the top).
# Seaborn defaults: white style, notebook context, deep palette.
sns.set(style='white', context='notebook', palette='deep')
# Default figure size stored in matplotlib's rcParams.
pylab.rcParams['figure.figsize'] = 12,8
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Switch matplotlib to the 'ggplot' style sheet, then re-apply seaborn's 'white' style.
mpl.style.use('ggplot')
sns.set_style('white')
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

<a id="556"></a> <br>
## 5-5-6 Natural Language Processing – prediction
We can use natural language processing to make predictions. Example: Given a product review, a computer can predict if its positive or negative based on the text. In this article you will learn how to make a prediction program based on natural language processing.

<a id="55561"></a> <br>
### 5-5-6-1 NLP prediction example
Given a name, the classifier will predict whether it is male or female.

To create our analysis program, we have several steps:

1. Data preparation
1. Feature extraction
1. Training
1. Prediction

**Data preparation**

The first step is to prepare data. We use the names set included with nltk.

In [None]:
from nltk.corpus import names

# Build the labelled dataset: every entry of each corpus file paired with its
# gender label, male entries first.
male_pairs = [(name, 'male') for name in names.words('male.txt')]
female_pairs = [(name, 'female') for name in names.words('female.txt')]
names = male_pairs + female_pairs

This dataset is simply a collection of tuples. To give you an idea of what the dataset looks like:

In [None]:
# Illustrative sample of the (name, label) tuples in the dataset.
# Only the second list is displayed, since a cell shows just its last expression.
[(u'Aaron', 'male'), (u'Abbey', 'male'), (u'Abbie', 'male')]
[(u'Zorana', 'female'), (u'Zorina', 'female'), (u'Zorine', 'female')]

You can define your own dataset if you wish; it is simply a list of tuples.

**Feature extraction**

Based on the dataset, we prepare our feature. The feature we will use is the last letter of a name.
We define a feature set using:

`featuresets = [(gender_features(n), g) for (n,g) in names]`

and the features (last letters) are extracted using:

In [None]:
def gender_features(word):
    """Build the feature dict for a name: its last letter.

    Args:
        word: The name to extract the feature from.

    Returns:
        dict: ``{'last_letter': <final character>}``. The value is an empty
        string when ``word`` is empty.
    """
    # Slicing (rather than indexing) keeps empty input from raising IndexError.
    return {'last_letter': word[-1:]}

**Training and prediction**

We train and predict using:

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
# Imported under an alias so the corpus reader is not shadowed by the `names`
# list built below.
from nltk.corpus import names as names_corpus


def gender_features(word):
    """Return the feature dict for a name: its last letter ('' for empty input)."""
    # Slicing (rather than indexing) keeps empty input from raising IndexError.
    return {'last_letter': word[-1:]}


# Load data: pair every corpus entry with its gender label, male entries first.
names = ([(name, 'male') for name in names_corpus.words('male.txt')] +
         [(name, 'female') for name in names_corpus.words('female.txt')])

# Train on the whole labelled set (no hold-out split is made in this cell).
featuresets = [(gender_features(n), g) for (n, g) in names]
train_set = featuresets
classifier = NaiveBayesClassifier.train(train_set)

# Show the dataset size and a small sample instead of dumping every tuple.
print(len(names))
print(names[:5])

In [None]:
# Show only the first few (features, label) pairs; printing the whole list
# floods the notebook output with one entry per name.
print(featuresets[:10])

In [None]:
print(classifier.classify(gender_features('Frank')))

In [None]:
print(classifier.classify(gender_features('Frankie')))

If you want to give the name during runtime, change the last line to:


In [None]:
# Predict, you can change name
name = 'Sarah'
print(classifier.classify(gender_features(name)))

<a id="6"></a> <br>
## 6- EDA
 In this section, you'll learn how to use graphical and numerical techniques to begin uncovering the structure of your data. 
 
* Which variables suggest interesting relationships?
* Which observations are unusual?
* Analysis of the features!

By the end of the section, you'll be able to answer these questions and more, while generating graphics that are both insightful and beautiful.  then We will review analytical and statistical operations:

1. Data Collection
1. Visualization
1. Data Cleaning
1. Data Preprocessing
<img src="http://s9.picofile.com/file/8338476134/EDA.png" width=400 height=400>

 ###### [Go to top](#top)

<a id="61"></a> <br>
## 6-1 Data Collection
**Data collection** is the process of gathering and measuring data, information or any variables of interest in a standardized and established manner that enables the collector to answer or test hypothesis and evaluate outcomes of the particular collection.[techopedia]

I start data collection by loading the training and testing datasets into **Pandas DataFrames**.
###### [Go to top](#top)

In [None]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

**<< Note 1 >>**

* Each **row** is an observation (also known as : sample, example, instance, record).
* Each **column** is a feature (also known as: Predictor, attribute, Independent Variable, input, regressor, Covariate).
###### [Go to top](#top)

In [None]:
train.sample(1) 

In [None]:
train.head()

In [None]:
test.sample(1) 

Or you can use other commands to explore the dataset, such as:

In [None]:
train.tail(1)

<a id="611"></a> <br>
## 6-1-1 Features
Features can be from following types:
* numeric
* categorical
* ordinal
* datetime
* coordinates

Find the type of features in the **Quora dataset**?!

For getting some information about the dataset you can use **info()** command.

In [None]:
train.info()

In [None]:
test.info()

<a id="612"></a> <br>
## 6-1-2 Explorer Dataset
1- Dimensions of the dataset.

2- Peek at the data itself.

3- Statistical summary of all attributes.

4- Breakdown of the data by the class variable.

Don’t worry, each look at the data is **one command**. These are useful commands that you can use again and again on future projects.
###### [Go to top](#top)

In [None]:
# shape for train and test
print('Shape of train:',train.shape)
print('Shape of test:',test.shape)

In [None]:
#columns*rows
train.size

After loading the data via **pandas**, we should check what its content and description look like, via the following:

In [None]:
type(train)

In [None]:
type(test)

In [None]:
train.describe()

To pop up 5 random rows from the data set, we can use **sample(5)**  function and find the type of features.

In [None]:
train.sample(5) 

<a id="62"></a> <br>
## 6-2 Data Cleaning
When dealing with real-world data, dirty data is the norm rather than the exception. We continuously need to predict correct values, impute missing ones, and find links between various data artefacts such as schemas and records. We need to stop treating data cleaning as a piecemeal exercise (resolving different types of errors in isolation), and instead leverage all signals and resources (such as constraints, available statistics, and dictionaries) to accurately predict corrective actions.

The primary goal of data cleaning is to detect and remove errors and **anomalies** to increase the value of data in analytics and decision making. While it has been the focus of many researchers for several years, individual problems have been addressed separately. These include missing value imputation, outliers detection, transformations, integrity constraints violations detection and repair, consistent query answering, deduplication, and many other related problems such as profiling and constraints mining.[4]
###### [Go to top](#top)

How many NA elements in every column!!

Good news, it is Zero!

To check out how many null info are on the dataset, we can use **isnull().sum()**.

In [None]:
train.isnull().sum()

But if we had any, we could just use **dropna()** (be careful: sometimes you should not do this!)

In [None]:
# remove rows that have NA's
print('Before Droping',train.shape)
train = train.dropna()
print('After Droping',train.shape)


We can get a quick idea of how many instances (rows) and how many attributes (columns) the data contains with the shape property.

To print the dataset **columns**, we can use the `columns` attribute.

In [None]:
train.columns

You can see the unique values of the target with the command below:

In [None]:
train_target = train['target'].values

np.unique(train_target)

YES, quora problem is a **binary classification**! :)

To check the first 5 rows of the data set, we can use head(5).

In [None]:
train.head(5) 

Or to check out last 5 row of the data set, we use tail() function.

In [None]:
train.tail() 

To give a **statistical summary** about the dataset, we can use **describe()**


In [None]:
train.describe() 

As you can see, the statistical information that this command gives us is not suitable for this type of data
**describe() is more useful for numerical data sets**

<a id="63"></a> <br>
## 6-3 Data Preprocessing
**Data preprocessing** refers to the transformations applied to our data before feeding it to the algorithm.
 
Data Preprocessing is a technique that is used to convert the raw data into a clean data set. In other words, whenever the data is gathered from different sources it is collected in raw format which is not feasible for the analysis.
there are plenty of steps for data preprocessing and we just listed some of them in general(Not just for Quora) :
1. removing Target column (id)
1. Sampling (without replacement)
1. Making part of iris unbalanced and balancing (with undersampling and SMOTE)
1. Introducing missing values and treating them (replacing by average values)
1. Noise filtering
1. Data discretization
1. Normalization and standardization
1. PCA analysis
1. Feature selection (filter, embedded, wrapper)
1. Etc.

What methods of preprocessing can we run on  Quora?! 
###### [Go to top](#top)

**<< Note 2 >>**
in pandas's data frame you can perform some query such as "where"

In [None]:
train.where(train ['target']==1).count()

As you can see below, it is easy to perform a query on the dataframe in Python:

In [None]:
# Boolean-mask query: keep the rows whose target is greater than 1.
# NOTE(review): this notebook describes the target as binary, so this presumably
# returns no rows — confirm whether `== 1` was intended.
train[train['target']>1]

Some examples of insincere questions:

In [None]:
train[train['target']==1].head(5)

<a id="631"></a> <br>
## 6-3-1 Is data set imbalance?


In [None]:
train_target.mean()

A large part of the data is unbalanced, but **how can we  solve it?**

In [None]:
train["target"].value_counts()
# data is imbalance

**Imbalanced dataset** is relevant primarily in the context of supervised machine learning involving two or more classes. 

**Imbalance** means that the number of data points available for different classes is different:
If there are two classes, then balanced data would mean 50% points for each of the class. For most machine learning techniques, little imbalance is not a problem. So, if there are 60% points for one class and 40% for the other class, it should not cause any significant performance degradation. Only when the class imbalance is high, e.g. 90% points for one class and 10% for the other, standard optimization criteria or performance measures may not be as effective and would need modification.


<img src='https://www.datascience.com/hs-fs/hubfs/imbdata.png?t=1542328336307&width=487&name=imbdata.png'>
[Image source](http://api.ning.com/files/vvHEZw33BGqEUW8aBYm4epYJWOfSeUBPVQAsgz7aWaNe0pmDBsjgggBxsyq*8VU1FdBshuTDdL2-bp2ALs0E-0kpCV5kVdwu/imbdata.png)

A typical example of imbalanced data is encountered in e-mail classification problem where emails are classified into ham or spam. The number of spam emails is usually lower than the number of relevant (ham) emails. So, using the original distribution of two classes leads to imbalanced dataset.

Using accuracy as a performance measure for highly imbalanced datasets is not a good idea. For example, if 90% of the points belong to the true class in a binary classification problem, a default prediction of true for all data points leads to a classifier which is 90% accurate, even though the classifier has not learnt anything about the classification problem at hand![9]

<a id="632"></a> <br>
## 6-3-2 Exploring the question

In [None]:
# Print the first five questions, numbered from 1.
question = train['question_text']
i = 0
for i, q in enumerate(question[:5], start=1):
    print('sample '+str(i)+':' ,q)

In [None]:
text_withnumber = train['question_text']
# Strip digit characters from each question individually.
# Iterating the Series directly yields whole questions, so a per-item
# isdigit() test only drops questions made up solely of digits and then glues
# all the remaining questions into one huge string; applying the filter per
# character inside each question is what actually removes the numbers.
result = text_withnumber.apply(
    lambda text: ''.join(ch for ch in str(text) if not ch.isdigit())
)

<a id="632"></a> <br>
## 6-3-2 Some Feature Engineering

[NLTK](https://www.nltk.org/) is one of the leading platforms for working with human language data and Python, the module NLTK is used for natural language processing. NLTK is literally an acronym for Natural Language Toolkit.

We get a set of **English stop** words using the line

In [None]:
#from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))

The returned set `eng_stopwords` contains **179 stop words** on my computer (the exact number depends on the NLTK version).
You can view the length or contents of this array with the lines:

In [None]:
print(len(eng_stopwords))
print(eng_stopwords)

The metafeatures that we'll create, based on the EDAs by SRK ([sudalairajkumar](http://www.kaggle.com/sudalairajkumar/simple-feature-engg-notebook-spooky-author)) and [tunguz](https://www.kaggle.com/tunguz/just-some-simple-eda), are:
1. Number of words in the text
1. Number of unique words in the text
1. Number of characters in the text
1. Number of stopwords
1. Number of punctuations
1. Number of upper case words
1. Number of title case words
1. Average length of the words

###### [Go to top](#top)

Number of words in the text 

In [None]:
# Number of whitespace-separated words in each question, for both frames.
for frame in (train, test):
    frame["num_words"] = frame["question_text"].apply(lambda x: len(str(x).split()))

print('maximum of num_words in train',train["num_words"].max())
print('min of num_words in train',train["num_words"].min())
print("maximum of  num_words in test",test["num_words"].max())
# This value comes from the test frame, so it is labelled as such.
print('min of num_words in test',test["num_words"].min())

Number of unique words in the text

In [None]:
# Number of distinct whitespace-separated words in each question, for both frames.
for frame in (train, test):
    frame["num_unique_words"] = frame["question_text"].apply(lambda x: len(set(str(x).split())))

print('maximum of num_unique_words in train',train["num_unique_words"].max())
print('mean of num_unique_words in train',train["num_unique_words"].mean())
print("maximum of num_unique_words in test",test["num_unique_words"].max())
# This value comes from the test frame, so it is labelled as such.
print('mean of num_unique_words in test',test["num_unique_words"].mean())

Number of characters in the text 

In [None]:
# Character count of each question, added to both frames.
for frame in (train, test):
    question_column = frame["question_text"]
    frame["num_chars"] = question_column.apply(lambda text: len(str(text)))

print('maximum of num_chars in train',train["num_chars"].max())
print("maximum of num_chars in test",test["num_chars"].max())

Number of stopwords in the text

In [None]:
# Count, per question, the lowercased words that appear in eng_stopwords.
for frame in (train, test):
    frame["num_stopwords"] = frame["question_text"].apply(
        lambda text: sum(1 for token in str(text).lower().split() if token in eng_stopwords)
    )

print('maximum of num_stopwords in train',train["num_stopwords"].max())
print("maximum of num_stopwords in test",test["num_stopwords"].max())

Number of punctuations in the text

In [None]:
# Count, per question, the characters that belong to string.punctuation.
for frame in (train, test):
    frame["num_punctuations"] = frame['question_text'].apply(
        lambda text: sum(1 for char in str(text) if char in string.punctuation)
    )

print('maximum of num_punctuations in train',train["num_punctuations"].max())
print("maximum of num_punctuations in test",test["num_punctuations"].max())

Number of upper case words in the text

In [None]:
# Count, per question, the words for which str.isupper() is true.
for frame in (train, test):
    frame["num_words_upper"] = frame["question_text"].apply(
        lambda text: sum(1 for token in str(text).split() if token.isupper())
    )

print('maximum of num_words_upper in train',train["num_words_upper"].max())
print("maximum of num_words_upper in test",test["num_words_upper"].max())

Number of title case words in the text

In [None]:
# Count, per question, the words for which str.istitle() is true.
for frame in (train, test):
    frame["num_words_title"] = frame["question_text"].apply(
        lambda text: sum(1 for token in str(text).split() if token.istitle())
    )

print('maximum of num_words_title in train',train["num_words_title"].max())
print("maximum of num_words_title in test",test["num_words_title"].max())

 Average length of the words in the text 

In [None]:
# Average word length of each question (mean over its whitespace-separated words).
for frame in (train, test):
    frame["mean_word_len"] = frame["question_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# The printed statistic is the maximum of the feature, so the labels say so
# (matching the wording used for the other features).
print('maximum of mean_word_len in train',train["mean_word_len"].max())
print("maximum of mean_word_len in test",test["mean_word_len"].max())

We have now added some new features to the train and test datasets; let's print the columns again:

In [None]:
print(train.columns)
train.head(1)

**<< Note >>**
>**Preprocessing and generation pipelines depend on a model type**

## What is Tokenizer?
Tokenizing raw text data is an important pre-processing step for many NLP methods. As explained on **wikipedia**, tokenization is “the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens.” In the context of actually working through an NLP analysis, this usually translates to converting a string like "My favorite color is blue" to a list or array like ["My", "favorite", "color", "is", "blue"].[11]

In [None]:
import nltk
mystring = "I love Kaggle"
mystring2 = "I'd love to participate in kaggle competitions."
nltk.word_tokenize(mystring)

In [None]:
nltk.word_tokenize(mystring2)

<a id="64"></a> <br>
## 6-4 Data Visualization
**Data visualization**  is the presentation of data in a pictorial or graphical format. It enables decision makers to see analytics presented visually, so they can grasp difficult concepts or identify new patterns.

> * Two** important rules** for Data visualization:
>     1. Do not put too little information
>     1. Do not put too much information

###### [Go to top](#top)

<a id="641"></a> <br>
## 6-4-1 CountPlot

In [None]:
# Count plot of the rows per target value (target on the x-axis, also used as hue),
# with thick bar outlines coloured from seaborn's "dark" palette.
ax=sns.countplot(x='target',hue="target", data=train  ,linewidth=5,edgecolor=sns.color_palette("dark", 3))
# The trailing semicolon suppresses the text repr of the returned object in the cell output.
plt.title('Is data set imbalance?');

In [None]:
# The same class counts drawn with the target on the y-axis (also used as hue).
ax = sns.countplot(y="target", hue="target", data=train)
# The trailing semicolon suppresses the text repr of the returned object in the cell output.
plt.title('Is data set imbalance?');

<a id="642"></a> <br>
## 6-4-2  Pie Plot

In [None]:

# Pie chart of how often each target value occurs; the second wedge is offset
# from the centre and each wedge is annotated with its percentage.
target_counts = train['target'].value_counts()
ax = target_counts.plot.pie(explode=[0,0.1], autopct='%1.1f%%', shadow=True)
ax.set_title('target')
ax.set_ylabel('')
plt.show()

<a id="646"></a> <br>
## 6-4-6 WordCloud