# Introduction
Greetings! This is my first public kernel. In it, I will demonstrate a few NLP steps and a few plots, such as a word cloud and a word-frequency plot.

In [None]:
#general imports
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import seaborn as sns # plotting
import matplotlib.pyplot as plt # plotting
%matplotlib inline
import os # accessing directory structure

#NLP processing imports
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import re
import spacy

###Vader Sentiment
#To install vaderSentiment
!pip install vaderSentiment 
from vaderSentiment import vaderSentiment
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

####Lemmatization
from nltk.stem import WordNetLemmatizer
# Lemmatize with POS Tag
from nltk.corpus import wordnet

In [None]:
# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
data = pd.read_csv("/kaggle/input/twcs.csv")

In [None]:
data.shape

In [None]:
# Keep the first 10,000 rows. .iloc is positional and end-exclusive, whereas
# .loc[:10000] is label-based and end-inclusive (10,001 rows on the default
# integer index produced by read_csv). .copy() makes the subset an independent
# frame so the column assignments made later cannot raise SettingWithCopyWarning.
data = data.iloc[:10000].copy()

In [None]:
data.shape

In [None]:
data.head()

In [None]:
# None removes the column-width limit so the tweet text is displayed in full.
# The older -1 sentinel is deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
data.head(10)

In [None]:
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
#DataTypes
data.dtypes

In [None]:
data["text"] = data["text"].astype(str)

# TEXT CLEANING

### Remove URLs and @mentions

In [None]:
def remove_urls(text):
    """Strip http(s) URLs, www. links and @mentions from text.

    Each match runs up to the next whitespace character and is replaced by an
    empty string; the whitespace around it is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+|@[^\s]+', r'', text)

In [None]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed
data["textclean"] = data["text"].apply(remove_urls)

In [None]:
data.head()

# Find the 100 most frequently used words in the text

In [None]:
top_N = 100  # number of most frequent words to keep

# Lower-case every cleaned tweet and join them into one space-separated string
lowered = data['textclean'].str.lower()
a = lowered.str.cat(sep=' ')

# Replace every run of non-letter characters (punctuation, digits, whitespace)
# with a single space; the result is still one string, not a list of words
b = re.compile('[^A-Za-z]+').sub(' ', a)

In [None]:
#remove all the stopwords from the text
# Combine the stop-word lists of the stop_words package and of NLTK
# (duplicates are kept; the list is only used for membership tests)
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]

In [None]:
word_tokens = word_tokenize(b) # Tokenization

# Filter out stop words in a single pass. The original built the same list
# twice (a comprehension whose result was discarded, then an explicit loop).
# A set gives constant-time membership tests instead of scanning the combined
# stop-word list for every token.
stop_word_set = set(stop_words)
filtered_sentence = [w for w in word_tokens if w not in stop_word_set]

In [None]:
# Remove characters which have length less than 2  
without_single_chr = [word for word in filtered_sentence if len(word) > 2]

# Remove numbers
cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]

*Lemmatization is the process of converting a word to its base form. The difference between stemming and lemmatization is that lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.*

# Lemmatization

#### I am using Wordnet Lemmatizer with appropriate POS tag. 
#### Function to map word with its POS tag

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing a single word.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J -> adjective, V -> verb, R -> adverb.
    Every other tag, including N, falls back to noun.
    """
    initial = nltk.pos_tag([word])[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize each distinct word only once. get_wordnet_pos tags a word in
# isolation, so the lemma depends on the word alone and repeated tokens can
# reuse the cached result instead of being POS-tagged again.
lemma_by_word = {w: lemmatizer.lemmatize(w, get_wordnet_pos(w))
                 for w in set(cleaned_data_title)}
lemmatized_output = [lemma_by_word[w] for w in cleaned_data_title]

# Drop any token that consists solely of numeric characters
lemmatized_output = [word for word in lemmatized_output if not word.isnumeric()]

# Frequency distribution

In [None]:
# Count how often each lemma occurs, then tabulate the top_N most common ones
word_dist = nltk.FreqDist(lemmatized_output)
most_frequent = word_dist.most_common(top_N)
top100_words = pd.DataFrame(most_frequent, columns=['Word', 'Frequency'])

In [None]:
# Horizontal bar chart of the ten most frequent words
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
top10_words = top100_words.head(10)
ax = sns.barplot(data=top10_words, x="Frequency", y="Word")

# WordCloud

In [None]:
def wc(data,bgcolor,title):
    """Draw a word cloud of the given tokens.

    Parameters
    ----------
    data : iterable of str
        Tokens to plot; they are joined with spaces before being handed to
        WordCloud.generate.
    bgcolor : str
        Background colour of the cloud.
    title : str
        Title shown above the image (previously accepted but never used).
    """
    plt.figure(figsize = (80,80))
    # Named `cloud` rather than `wc` so the local does not shadow this function
    cloud = WordCloud(background_color = bgcolor, max_words = 100,  max_font_size = 50)
    cloud.generate(' '.join(data))
    plt.imshow(cloud)
    plt.title(title)
    plt.axis('off')

In [None]:
wc(lemmatized_output,'black','Common Words' )

# VADER + TEXTBLOB Sentiment Analysis

In [None]:
# One shared analyser instance, reused for every call to sentiment()
sent_analyser = SentimentIntensityAnalyzer()
def sentiment(text):
    """Return the "compound" entry of VADER's polarity scores for text."""
    scores = sent_analyser.polarity_scores(text)
    return scores["compound"]

In [None]:
data["Polarity"] = data["textclean"].apply(sentiment)

In [None]:
data.head()

In [None]:
data.dtypes

In [None]:
def senti(data):
    """Turn a row's 'Polarity' value into a sentiment label.

    Returns "Positive" for a polarity of 0.05 or more, "Negative" for -0.05
    or less, and "Neutral" for anything in between.
    """
    score = data['Polarity']
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"

In [None]:
data['Sentiment'] = data.apply(senti, axis=1)

In [None]:
# Bar chart of how many tweets fall into each sentiment class
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
sentiment_colours = {"Neutral": "blue", "Positive": "Green", "Negative": "Red"}
ax = sns.countplot(data=data, x="Sentiment", palette=sentiment_colours)

# ASPECT MINING/ OPINION MINING

In [None]:
#import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def pos(text):
    """Return the text of every token that spaCy tags as a noun.

    The text is run through the loaded `nlp` pipeline and tokens whose
    coarse part-of-speech tag is "NOUN" are collected in document order.
    """
    nouns = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            nouns.append(token.text)
    return nouns

In [None]:
data["Aspects"] = data["textclean"].apply(pos)

In [None]:
data.head()

There is scope for improvement. I will keep learning and update this kernel accordingly.

**P.S This is my first Kaggle Kernel and I am fairly new to python programming as well, hence my non usage of list comprehensions and functions might be evident. I highly encourage everyone to fork my code and add your own twists to increase the accuracy of both aspect extractions and sentiment analysis.**