# NLP Sentiment Analysis

In [85]:
import pandas as pd
import numpy as np
import re
from nltk import word_tokenize, sent_tokenize, wordpunct_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords

Download the `nltk` resources `punkt`, `wordnet` and `stopwords`

In [88]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanjiv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [91]:
csv_file = r"C:\Users\sanjiv\Documents\Datasets\NLP\NLP_Case_Study\Data.csv"
tweet_data = pd.read_csv(csv_file)

In [92]:
tweet_data.head()

Unnamed: 0,tweet_id,sentiment,name,text,tweet_created,tweet_location,user_timezone
0,1,neutral,cairdin,What @dhepburn said.,24/02/15 11:35,,Eastern Time (US & Canada)
1,2,positive,jnardino,plus you've added commercials to the experienc...,24/02/15 11:15,,Pacific Time (US & Canada)
2,3,neutral,yvonnalynn,I didn't today... Must mean I need to take ano...,24/02/15 11:15,Lets Play,Central Time (US & Canada)
3,4,negative,jnardino,"it's really aggressive to blast obnoxious ""ent...",24/02/15 11:15,,Pacific Time (US & Canada)
4,5,negative,jnardino,and it's a really big bad thing about it,24/02/15 11:14,,Pacific Time (US & Canada)


### Tokenize and clean the tweets

Pre-process the tweets in the following order 
- sentence tokens using `sent_tokenize()`
- clean the tokens formed by using `re` library
- token the cleaned token into word token using `word_tokenize()`

In [93]:
def sent_tokenize_tweets(tweet_column):
    sent_tweet_column = []
    
    for line in tweet_column:
        _token = sent_tokenize(line)
        if _token != '':
            sent_tweet_column.append(_token)
    
    return sent_tweet_column 
    

In [94]:
def clean_tweets(tweets):
    
    cleaned_tweets = []
    
    for _tweet in tweets:
        # remove html tags, if any
        new_tweet = re.sub('[<._*?>]+', '', _tweet[0])
        
        # remove URLs
        new_tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', new_tweet)
        
        # remove Twitter handles
        new_tweet = re.sub("(@[A-Za-z0-9_]+)","", new_tweet)
        
        # remove punctuation 
        new_tweet = re.sub(r'[^\w\s]', '', new_tweet)
        
        # remove numbers
        new_tweet = re.sub(r'\d+','',new_tweet)
        
        # change to lower case
        new_tweet = new_tweet.lower()
        
        if new_tweet !='':
            cleaned_tweets.append(new_tweet)
            
    return cleaned_tweets        

In [95]:
def word_tokenize_tweets(cleaned_tweets):
    word_tweet_column=[]
    
    for tweet in cleaned_tweets:
        word_token = word_tokenize(tweet)
        
        if word_token != '':
            word_tweet_column.append(word_token)
            
    return word_tweet_column
    

In [96]:
a = sent_tokenize_tweets(tweet_data['text'])

In [97]:
b = clean_tweets(a)

In [98]:
c = word_tokenize_tweets(b)

### Stemming 
Normalize the tweet data

In [99]:
snowball = SnowballStemmer(language='english')
porter = PorterStemmer()

In [100]:
def stem_tweets(processed_words):
    stemmed_words = []
    stop_words = set(stopwords.words('english'))
    for words in processed_words:
        #stemmed_words.append(snowball.stem(word))
        print(words)
        print([snowball.stem(each_word) for each_word in words if not each_word in stop_words])
        #print('------------------------------------------')
        #print([porter.stem(each_word) for each_word in words])
        print('==========================================')
        

In [None]:
d = stem_tweets(c)

['what', 'said']
['said']
['plus', 'youve', 'added', 'commercials', 'to', 'the', 'experience', 'tacky']
['plus', 'youv', 'ad', 'commerci', 'experi', 'tacki']
['i', 'didnt', 'today', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip']
['didnt', 'today', 'must', 'mean', 'need', 'take', 'anoth', 'trip']
['its', 'really', 'aggressive', 'to', 'blast', 'obnoxious', 'entertainment', 'in', 'your', 'guests', 'faces', 'amp', 'they', 'have', 'little', 'recourse']
['realli', 'aggress', 'blast', 'obnoxi', 'entertain', 'guest', 'face', 'amp', 'littl', 'recours']
['and', 'its', 'a', 'really', 'big', 'bad', 'thing', 'about', 'it']
['realli', 'big', 'bad', 'thing']
['seriously', 'would', 'pay', 'a', 'flight', 'for', 'seats', 'that', 'didnt', 'have', 'this', 'playing']
['serious', 'would', 'pay', 'flight', 'seat', 'didnt', 'play']
['yes', 'nearly', 'every', 'time', 'i', 'fly', 'vx', 'this', 'ar', 'worm', 'won', 'go', 'away']
['yes', 'near', 'everi', 'time', 'fli', 'vx', 'ar', 'worm', 'go', 'aw

In [41]:
tweet_data.head()

Unnamed: 0,tweet_id,sentiment,name,text,tweet_created,tweet_location,user_timezone,sent_token
0,1,neutral,cairdin,What @dhepburn said.,24/02/15 11:35,,Eastern Time (US & Canada),[What @dhepburn said.]
1,2,positive,jnardino,plus you've added commercials to the experienc...,24/02/15 11:15,,Pacific Time (US & Canada),[plus you've added commercials to the experien...
2,3,neutral,yvonnalynn,I didn't today... Must mean I need to take ano...,24/02/15 11:15,Lets Play,Central Time (US & Canada),[I didn't today... Must mean I need to take an...
3,4,negative,jnardino,"it's really aggressive to blast obnoxious ""ent...",24/02/15 11:15,,Pacific Time (US & Canada),"[it's really aggressive to blast obnoxious ""en..."
4,5,negative,jnardino,and it's a really big bad thing about it,24/02/15 11:14,,Pacific Time (US & Canada),[and it's a really big bad thing about it]
