In [None]:
#importing libraries

import spacy
import pandas as pd
import random

In [None]:
# Load the small English pipeline by its full package name; the 'en' shortcut
# link is no longer supported in spaCy v3.
nlp = spacy.load('en_core_web_sm')

### Run the following command in a terminal to download the English spaCy model.

#python -m spacy download en_core_web_sm

In [None]:
# Show the class of the object returned by spacy.load.
type(nlp)

## Loading the dataset

In [None]:
# Read the tweets CSV into a DataFrame.
tweets_csv_path = "../input/5000-justdoit-tweets-dataset/justdoit_tweets_2018_09_07_2.csv"
tweets = pd.read_csv(tweets_csv_path)

In [None]:
# Preview the first rows to see which columns are available.
tweets.head()

In [None]:
# (rows, columns) of the loaded DataFrame.
tweets.shape

#### For our NLP task, we are only interested in the `tweet_full_text` column.

In [None]:
# First 20 entries of the tweet text column.
tweets['tweet_full_text'].head(20)

### Random 20 tweets

In [None]:
#random tweets
# Seed the stdlib RNG so the same 20 tweets are drawn on every run.
random.seed(1024)

# Sample row positions from range(len(tweets)) so every row (including the
# first) is eligible and the dataset size is not hard-coded; .iloc makes the
# lookup positional rather than label-based.
sampled_positions = random.sample(range(len(tweets)), 20)
random_tweets = tweets['tweet_full_text'].iloc[sampled_positions]
random_tweets

### Combining the tweets into a single string, because the spaCy `nlp` pipeline only accepts `str` input

In [None]:
#combined text

# Join the actual tweet strings. str(random_tweets) would instead return the
# Series' printed representation: index labels, text truncated with "..." and
# the trailing "Name: ..., dtype: object" footer, all of which would be fed to
# the NLP pipeline as if they were tweet content.
# dropna() keeps missing tweets from being turned into the literal text "nan".
combined_text = "\n".join(random_tweets.dropna().astype(str))
combined_text

In [None]:
# Run the loaded pipeline over the combined text; the cells below all work on this `doc`.
doc = nlp(combined_text)
doc

In [None]:
# Show the class of the object returned by calling nlp on a string.
type(doc)

### Naive approach for splitting

In [None]:
# Naive tokenization: str.split() with no arguments splits on runs of whitespace.
doc.text.split()

#### The naive approach just breaks the text into parts at the splitting criterion (whitespace here), so punctuation stays attached to the neighbouring words.

## Tokenization using spaCy

In [None]:
# Verbatim text of every token spaCy produced for the document.
[tok.orth_ for tok in doc]

#### Attributes with an underscore suffix in spaCy (e.g. `orth_`) return strings, whereas the same attributes without the underscore (e.g. `orth`) return integer IDs.

In [None]:
# Pair each token's text (orth_) with its integer ID (orth), skipping
# punctuation, whitespace and stop-word tokens.
[
    (tok.orth_, tok.orth)
    for tok in doc
    if not (tok.is_punct or tok.is_space or tok.is_stop)
]

#### Tokens after removing punctuation, whitespace and stop words

In [None]:
# Keep only the text of tokens that are not punctuation, whitespace or stop words.
extracted_tokens = [
    tok.orth_
    for tok in doc
    if not (tok.is_punct or tok.is_space or tok.is_stop)
]
extracted_tokens

In [None]:
# Drop anything that is not purely alphabetic (numbers, handles, URLs, emoji, ...).
only_word_tokens = list(filter(str.isalpha, extracted_tokens))
only_word_tokens

### Tokenization based on sentences

In [None]:
# doc.sents is iterated lazily; list() materialises the sentence spans for display.
list(doc.sents)

## Lemmatization

In [None]:
# Lemma (base form) of every token in the document.
[token.lemma_ for token in doc]

## POS Tagging

In [None]:
# One (token, fine-grained tag, coarse-grained POS) triple per token.
pos_tag = [
    (token, token.tag_, token.pos_)
    for token in doc
]
pos_tag

In [None]:
# Triples whose fine-grained tag is 'POS'.
[(token, tag, pos) for token, tag, pos in pos_tag if tag == 'POS']

#### 'POS' is the fine-grained tag for a possessive ending (e.g. "'s"). If the list above is empty, the sampled tweets contain no possessive endings, so we cannot exploit owner/possession information.

In [None]:
# Triples whose coarse-grained POS is 'PART'.
[(token, tag, pos) for token, tag, pos in pos_tag if pos == 'PART']

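#### The 'PART' tag marks particles: function words such as the infinitive marker "to", the negation "not"/"n't" and the possessive "'s", which attach to another word rather than standing alone.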

### Noun chunks in the data

In [None]:
# Materialise the noun chunks of the document (these are chunks, not single nouns).
nouns = list(doc.noun_chunks)
nouns

### Syntactic dependency between tokens

In [None]:
# Each token paired with its syntactic dependency label.
[(tok, tok.dep_) for tok in doc]

## Named Entity Recognition

In [None]:
# All named entities found in the document.
list(doc.ents)

### Entities along with their labels

In [None]:
# Each entity with its label as text (label_) and as an integer ID (label).
[(ent, ent.label_, ent.label) for ent in doc.ents]

### Visualizing named entities along with labels

In [None]:
# Render the document inline in the notebook with named entities highlighted and labelled.
spacy.displacy.render(doc, style='ent', jupyter=True)

### Dependency Parser Visualization

In [None]:
# Render the dependency parse of the document inline in the notebook.
spacy.displacy.render(doc, style='dep', jupyter=True)