In [9]:
#!kaggle competitions download -c tweet-sentiment-extraction

Downloading tweet-sentiment-extraction.zip to /Users/reejungkim/Documents/Git/Sentimental analysis
100%|██████████████████████████████████████| 1.39M/1.39M [00:01<00:00, 1.20MB/s]
100%|██████████████████████████████████████| 1.39M/1.39M [00:01<00:00, 1.20MB/s]


In [101]:
import os
import re

import emoji
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
# Star import kept last, as in the original cell, so name resolution is unchanged.
from nltk.sentiment.util import *


In [4]:
# Directory holding the extracted competition CSV files. Set the
# TWEET_DATA_DIR environment variable to run the notebook on another machine;
# the default is the location the notebook was originally written against.
DATA_DIR = os.environ.get(
    "TWEET_DATA_DIR",
    "/Users/reejungkim/Documents/Git/Sentimental analysis/tweet-sentiment-extraction",
)

train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [5]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


# Data exploration

## Checking for null values

In [10]:
# Number of missing values in each column of the training set.
train.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [13]:
# Show the training rows whose `text` is missing.
train.loc[train['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment
314,fdb77c3752,,,neutral


## Drop rows with null values

In [15]:
# Drop every row that contains a null, modifying `train` in place.
# The null check earlier in this notebook reported one such row (index 314).
train.dropna(inplace=True)

## Count of unique values

In [16]:
# Distinct labels present in the `sentiment` column.
train.sentiment.unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [17]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

((27480, 4), (3534, 3))

In [45]:
# Number of tweets per sentiment label; margins=True appends the "All" total row.
train.pivot_table(index='sentiment', values='textID', 
                  aggfunc='count', margins=True)


Unnamed: 0_level_0,textID
sentiment,Unnamed: 1_level_1
negative,7781
neutral,11117
positive,8582
All,27480


In [46]:
# Fraction of tweets carrying each sentiment label.
n_labelled = train['sentiment'].count()
train.pivot_table(
    index='sentiment',
    values='textID',
    aggfunc=lambda ids: ids.count() / n_labelled,
)


Unnamed: 0_level_0,textID
sentiment,Unnamed: 1_level_1
negative,0.283151
neutral,0.404549
positive,0.3123


# Feature engineering

## URL 

In [97]:
def find_url(string):
    """Return the http/https URL(s) found in ``string``, or ``None``.

    Parameters
    ----------
    string : str
        Text to scan. Any non-string value (e.g. ``None`` or a float NaN
        from a DataFrame column) is treated as containing no URL.

    Returns
    -------
    str or None
        The matched URL. When several URLs are present they are concatenated
        without a separator. ``None`` when nothing matches.
    """
    # Missing values in a text column arrive as float NaN / None; re.findall
    # would raise TypeError on them.
    if not isinstance(string, str):
        return None

    # Raw string so that `\(` and `\)` reach the regex engine as written
    # instead of being parsed as (invalid) string escapes.
    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        string,
    )
    if urls:
        return "".join(urls)
    return None

In [98]:
# New `url` column: the URL text found in each tweet, or None when there is none.
train['url'] = train['text'].apply(find_url)

In [99]:
# Tweets in which a URL was found.
train.loc[train['url'].notna()]

Unnamed: 0,textID,text,selected_text,sentiment,url
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,http://www.dothebouncy.com/smf
17,919fa93391,i`ve been sick for the past few days and thus...,sick,negative,http://tinyurl.com/mnf4kw
35,4f5267ad70,"Thats it, its the end. Tears for Fears vs Eric...","Thats it, its the end. Tears for Fears",neutral,http://bit.ly/2Hpbg4
50,a3ae670885,Then you should check out http://twittersucks...,Then you should check out http://twittersucks....,neutral,http://twittersucks.com
57,6086b1f016,will be back later. http://plurk.com/p/rp3k7,will be back later.,neutral,http://plurk.com/p/rp3k7
...,...,...,...,...,...
27374,b49385ebb7,"says Finally, Im home. http://plurk.com/p/rr121","says Finally, Im home.",neutral,http://plurk.com/p/rr121
27384,f94af8cb85,This is a much better tool than some I have co...,This is a much better tool,positive,http://www.tweepular.com
27386,e149ebd3a1,#vwll2009 Would one of the VWLLers want to add...,ch appreciat,positive,http://bit.ly/BF5sh
27463,a38bf809b0,LIKE DREW SAID 'GIVE TC A CHANCE' WE WILL MIS...,MISS,negative,http://bit.ly/r6RfC


## Emojis

In [143]:
# Sample string mixing emojis and plain text; reused by the next cell.
ex = "😍 lol 😆 😂 "
# A bare expression that is not the last one in the cell is not displayed.
ex

# Replace each emoji with its ":name:" text form.
emoji.demojize(ex)

':smiling_face_with_heart-eyes: lol :grinning_squinting_face: :face_with_tears_of_joy: '

In [144]:
# Capture the text between each pair of colons in the demojized string,
# i.e. the emoji names. (Originally annotated as the form used on macOS.)
re.findall(':(.*?):', emoji.demojize(ex) )
# Variant with escaped colons, originally annotated as the form used on Windows.
# NOTE(review): escaping ':' should not change what the pattern matches -- confirm
# whether the platform distinction is actually needed.
#re.findall( r'\:(.*?)\:', emoji.demojize(ex) )

['smiling_face_with_heart-eyes',
 'grinning_squinting_face',
 'face_with_tears_of_joy']

In [168]:
# Display the package's emoji -> ":name:" mapping.
# NOTE(review): this prints the entire dictionary, so the output is very long.
emoji.UNICODE_EMOJI

{'🥇': ':1st_place_medal:',
 '🥈': ':2nd_place_medal:',
 '🥉': ':3rd_place_medal:',
 '🆎': ':AB_button_(blood_type):',
 '🏧': ':ATM_sign:',
 '🅰': ':A_button_(blood_type):',
 '🇦🇫': ':Afghanistan:',
 '🇦🇱': ':Albania:',
 '🇩🇿': ':Algeria:',
 '🇦🇸': ':American_Samoa:',
 '🇦🇩': ':Andorra:',
 '🇦🇴': ':Angola:',
 '🇦🇮': ':Anguilla:',
 '🇦🇶': ':Antarctica:',
 '🇦🇬': ':Antigua_&_Barbuda:',
 '♒': ':Aquarius:',
 '🇦🇷': ':Argentina:',
 '♈': ':Aries:',
 '🇦🇲': ':Armenia:',
 '🇦🇼': ':Aruba:',
 '🇦🇨': ':Ascension_Island:',
 '🇦🇺': ':Australia:',
 '🇦🇹': ':Austria:',
 '🇦🇿': ':Azerbaijan:',
 '🔙': ':BACK_arrow:',
 '🅱': ':B_button_(blood_type):',
 '🇧🇸': ':Bahamas:',
 '🇧🇭': ':Bahrain:',
 '🇧🇩': ':Bangladesh:',
 '🇧🇧': ':Barbados:',
 '🇧🇾': ':Belarus:',
 '🇧🇪': ':Belgium:',
 '🇧🇿': ':Belize:',
 '🇧🇯': ':Benin:',
 '🇧🇲': ':Bermuda:',
 '🇧🇹': ':Bhutan:',
 '🇧🇴': ':Bolivia:',
 '🇧🇦': ':Bosnia_&_Herzegovina:',
 '🇧🇼': ':Botswana:',
 '🇧🇻': ':Bouvet_Island:',
 '🇧🇷': ':Brazil:',
 '🇮🇴': ':British_Indian_Ocean_Territory:',
 '🇻🇬': ':British_Vir

In [177]:
def extract_emojis(s):
    """Return every emoji found in ``s`` concatenated in order, or ``None``.

    Parameters
    ----------
    s : str
        Text to scan. Any non-string value (e.g. ``None`` or a float NaN
        from a DataFrame column) is treated as containing no emoji.

    Returns
    -------
    str or None
        The emojis of ``s`` joined without a separator, or ``None`` when
        there are none.

    Notes
    -----
    Emojis such as flags are made of several code points, so a
    character-by-character lookup cannot match them. The sequence-aware
    helpers of the ``emoji`` package are used when the installed version
    provides them; the per-character lookup is kept only as a fallback.
    """
    # Iterating over a non-string (NaN, None) would raise TypeError.
    if not isinstance(s, str):
        return None

    if hasattr(emoji, "emoji_list"):
        # Newer releases: a list of dicts, one per emoji, in order of appearance.
        found = "".join(match["emoji"] for match in emoji.emoji_list(s))
    elif hasattr(emoji, "get_emoji_regexp"):
        # Older releases: a compiled pattern matching whole emoji sequences.
        found = "".join(emoji.get_emoji_regexp().findall(s))
    else:
        # Legacy behaviour: single-code-point emojis only.
        found = "".join(c for c in s if c in emoji.UNICODE_EMOJI)

    return found or None

In [188]:
# Sanity check: a string made only of emojis.
extract_emojis( "😘😘")

'😘😘'

In [190]:
# Sanity check: no emojis, so the function returns None and the cell shows no output.
extract_emojis('text without emojis')

In [191]:
# New `emoji` column: the emojis found in each tweet, or None when there are none.
train['emoji'] = train['text'].apply(extract_emojis)

In [192]:
# Tweets in which at least one emoji was found.
train.loc[train['emoji'].notna()]

Unnamed: 0,textID,text,selected_text,sentiment,url,emoji
