In [1]:
import os
import re
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
os.getcwd()

'C:\\Users\\praveen\\DS_Notebooks\\nltk'

In [4]:
data = pd.read_csv("train.csv", sep=',')

In [5]:
data.head()

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...


In [6]:
len(data)

86

In [7]:
data.Category.value_counts()

positive    41
neutral     34
negative    11
Name: Category, dtype: int64

In [8]:
tweets = data[data.Tweet != 'Not Available']
tweets = pd.DataFrame([tweets['Tweet']]).T

In [9]:
len(tweets)

80

In [10]:
tweets.head()

Unnamed: 0,Tweet
1,IOS 9 App Transport Security. Mm need to check...
2,"Mar if you have an iOS device, you should down..."
3,@jimmie_vanagon my phone does not run on lates...
4,Not sure how to start your publication on iOS?...
5,"Two Dollar Tuesday is here with Forklift 2, Qu..."


In [11]:
tweets['Tweet'] = tweets['Tweet'].str.replace('http\S+|www.\S+', '', case=False)

In [12]:
tweets['Tweet'][:30]

1     IOS 9 App Transport Security. Mm need to check...
2     Mar if you have an iOS device, you should down...
3     @jimmie_vanagon my phone does not run on lates...
4     Not sure how to start your publication on iOS?...
5     Two Dollar Tuesday is here with Forklift 2, Qu...
6     If you're not already signed up to test my iOS...
7     YouTube Gaming Officially Launches On Web, And...
8     YouTube Gaming Launches Tomorrow with iOS and ...
9     @astrill Yashan from BBC @bbcchinese the VPN a...
10    Parkrun app for iOS downloaded Where have you ...
11    Today @YouTubeGaming launches, with apps for i...
12    Got a project you want to work on? Need help w...
13    Met with iOS Developer today. We may have a "g...
14    #CrossSkyHigh is going IOS #saturday. For now ...
15    What's the best way to get audio recordings fr...
16    Five Great Free Apps and Games for iOS - Augus...
17    @clayrussell @Nostradamion looks interesting b...
18    See news through the eyes of real people &

In [13]:
mytext = "this is my @string circle"
re.sub(r'\@\S*', '', mytext)

'this is my  circle'

In [14]:
tweets['Tweet'] = tweets['Tweet'].str.replace(r'\@\S*', '', case=False)

In [15]:
tweets[:30]

Unnamed: 0,Tweet
1,IOS 9 App Transport Security. Mm need to check...
2,"Mar if you have an iOS device, you should down..."
3,my phone does not run on latest IOS which may...
4,Not sure how to start your publication on iOS?...
5,"Two Dollar Tuesday is here with Forklift 2, Qu..."
6,If you're not already signed up to test my iOS...
7,"YouTube Gaming Officially Launches On Web, And..."
8,YouTube Gaming Launches Tomorrow with iOS and ...
9,Yashan from BBC the VPN access on IOS may be...
10,Parkrun app for iOS downloaded Where have you ...


In [16]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
for word in stop_words:
    tweets['Tweet'] = tweets['Tweet'].replace(to_replace=r'\b%s\b'%word, value="", regex=True)

In [19]:
tweets[:30]

Unnamed: 0,Tweet
1,IOS 9 App Transport Security. Mm need check ...
2,"Mar iOS device, download app :"
3,phone run latest IOS may account proble...
4,Not sure start publication iOS? We' live ...
5,"Two Dollar Tuesday Forklift 2, QuickKey iO..."
6,"If already signed test iOS game, nows ch..."
7,"YouTube Gaming Officially Launches On Web, And..."
8,YouTube Gaming Launches Tomorrow iOS Android...
9,Yashan BBC VPN access IOS may limited C...
10,Parkrun app iOS downloaded Where ?Great a...


In [20]:
train = []
sia = SentimentIntensityAnalyzer()
sia

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x2645cb4cfd0>

In [21]:
tweets = list(tweets['Tweet'])

In [22]:
for tweet in tweets:
    print(tweet)
    s = sia.polarity_scores(tweet)
    for v in s:
        print('{0}: {1}, '.format(v, s[v]), end='\n')
        if v == 'compound' and s[v] > 0:
            train.append((tweet, "pos"))
        elif v == 'compound' and s[v] <= 0:
            train.append((tweet, "neg"))
    

IOS 9 App Transport Security. Mm need  check   3rd party network pod supports  
neg: 0.0, 
neu: 0.542, 
pos: 0.458, 
compound: 0.765, 
Mar     iOS device,   download  app : 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
compound: 0.0, 
  phone   run  latest IOS  may account  problem   day .. time   replaced
neg: 0.213, 
neu: 0.787, 
pos: 0.0, 
compound: -0.4019, 
Not sure   start  publication  iOS? We'  live helping  ask  anything sessions today  Friday 
neg: 0.129, 
neu: 0.725, 
pos: 0.145, 
compound: 0.0613, 
Two Dollar Tuesday    Forklift 2, QuickKey  iOS  Suite  Pages   $1.99 today:   
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
compound: 0.0, 
If   already signed   test  iOS game, nows  chance! 
neg: 0.0, 
neu: 0.753, 
pos: 0.247, 
compound: 0.3164, 
YouTube Gaming Officially Launches On Web, Android, iOS On August 26: YouTube  finally going  r...  #webseries
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
compound: 0.0, 
YouTube Gaming Launches Tomorrow  iOS  Android Apps  Go Head--Head  Twitch  #ios #game
neg: 0.0, 


In [23]:
len(train)

80

In [24]:
train

[('IOS 9 App Transport Security. Mm need  check   3rd party network pod supports  ',
  'pos'),
 ('Mar     iOS device,   download  app : ', 'neg'),
 ('  phone   run  latest IOS  may account  problem   day .. time   replaced',
  'neg'),
 ("Not sure   start  publication  iOS? We'  live helping  ask  anything sessions today  Friday ",
  'pos'),
 ('Two Dollar Tuesday    Forklift 2, QuickKey  iOS  Suite  Pages   $1.99 today:   ',
  'neg'),
 ('If   already signed   test  iOS game, nows  chance! ', 'pos'),
 ('YouTube Gaming Officially Launches On Web, Android, iOS On August 26: YouTube  finally going  r...  #webseries',
  'neg'),
 ('YouTube Gaming Launches Tomorrow  iOS  Android Apps  Go Head--Head  Twitch  #ios #game',
  'neg'),
 (" Yashan  BBC   VPN access  IOS may  limited  China' military parade? Any chance    chat  ?",
  'pos'),
 ("Parkrun app  iOS downloaded Where     ?Great app easier access  info &amp; ready  Saturday' run ",
  'pos'),
 ('Today  launches,  apps  iOS  Android devices   

In [25]:
dictionary = set(word.lower() for tweet in train for word in word_tokenize(tweet[0]))
dictionary

{'!',
 '#',
 '$',
 '&',
 "'",
 "''",
 "'pope",
 '(',
 ')',
 '+',
 ',',
 '-',
 '--',
 '.',
 '..',
 '...',
 '/',
 '0,12,0/0,12,1',
 '0.12.1',
 '1.99',
 '1/3',
 '10',
 '11th',
 '12',
 '12est',
 '1st',
 '2',
 '2.0',
 '200',
 '21',
 '219',
 '23rd',
 '24',
 '25th',
 '26',
 '27th',
 '28',
 '29',
 '2nd',
 '34th',
 '3pm',
 '3rd',
 '4',
 '4th',
 '5-yearly',
 '5.1.1',
 '6',
 '6.1.6',
 '600',
 '7-8p',
 '8',
 '8-',
 '8/9',
 '8pm',
 '8th',
 '9',
 '9/23',
 '9th',
 ':',
 ';',
 '?',
 '``',
 'a',
 'absentia',
 'access',
 'account',
 'actually',
 'add',
 'address',
 'addresses',
 'aimed',
 'albums',
 'ale',
 'along',
 'already',
 'always',
 'amp',
 'and',
 'android',
 'anniversary',
 'annnounced',
 'annonced',
 'announce',
 'announced',
 'annoying',
 'annulling',
 'annulment',
 'annulments',
 'another',
 'any',
 'anyone',
 'anything',
 'app',
 'appear',
 'applause',
 'apple',
 'appreciate',
 'apps',
 'are',
 'arrives',
 'as',
 'ask',
 'asked',
 'attendant',
 'audience',
 'audio',
 'august',
 'avails',
 '

In [26]:
t = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in train]

In [27]:
t

[({'!': False,
   '#': False,
   '$': False,
   '&': False,
   "'": False,
   "''": False,
   "'pope": False,
   '(': False,
   ')': False,
   '+': False,
   ',': False,
   '-': False,
   '--': False,
   '.': True,
   '..': False,
   '...': False,
   '/': False,
   '0,12,0/0,12,1': False,
   '0.12.1': False,
   '1.99': False,
   '1/3': False,
   '10': False,
   '11th': False,
   '12': False,
   '12est': False,
   '1st': False,
   '2': False,
   '2.0': False,
   '200': False,
   '21': False,
   '219': False,
   '23rd': False,
   '24': False,
   '25th': False,
   '26': False,
   '27th': False,
   '28': False,
   '29': False,
   '2nd': False,
   '34th': False,
   '3pm': False,
   '3rd': True,
   '4': False,
   '4th': False,
   '5-yearly': False,
   '5.1.1': False,
   '6': False,
   '6.1.6': False,
   '600': False,
   '7-8p': False,
   '8': False,
   '8-': False,
   '8/9': False,
   '8pm': False,
   '8th': False,
   '9': True,
   '9/23': False,
   '9th': False,
   ':': False,
   ';': False

In [28]:
classifier = nltk.NaiveBayesClassifier.train(t)

In [29]:
test_data = "Why should I tweet against someone?"

In [30]:
test_data_features = {word.lower(): (word in word_tokenize(test_data.lower())) for word in dictionary}

In [31]:
test_data_features

{'!': False,
 '#': False,
 '$': False,
 '&': False,
 "'": False,
 "''": False,
 "'pope": False,
 '(': False,
 ')': False,
 '+': False,
 ',': False,
 '-': False,
 '--': False,
 '.': False,
 '..': False,
 '...': False,
 '/': False,
 '0,12,0/0,12,1': False,
 '0.12.1': False,
 '1.99': False,
 '1/3': False,
 '10': False,
 '11th': False,
 '12': False,
 '12est': False,
 '1st': False,
 '2': False,
 '2.0': False,
 '200': False,
 '21': False,
 '219': False,
 '23rd': False,
 '24': False,
 '25th': False,
 '26': False,
 '27th': False,
 '28': False,
 '29': False,
 '2nd': False,
 '34th': False,
 '3pm': False,
 '3rd': False,
 '4': False,
 '4th': False,
 '5-yearly': False,
 '5.1.1': False,
 '6': False,
 '6.1.6': False,
 '600': False,
 '7-8p': False,
 '8': False,
 '8-': False,
 '8/9': False,
 '8pm': False,
 '8th': False,
 '9': False,
 '9/23': False,
 '9th': False,
 ':': False,
 ';': False,
 '?': True,
 '``': False,
 'a': False,
 'absentia': False,
 'access': False,
 'account': False,
 'actually': False,

In [32]:
classifier.classify(test_data_features)

'pos'

In [33]:
test_data2 = "It's a beautiful day!"

In [34]:
test_data_features2 = {word.lower(): (word in word_tokenize(test_data2.lower())) for word in dictionary}

In [35]:
test_data_features2

{'!': True,
 '#': False,
 '$': False,
 '&': False,
 "'": False,
 "''": False,
 "'pope": False,
 '(': False,
 ')': False,
 '+': False,
 ',': False,
 '-': False,
 '--': False,
 '.': False,
 '..': False,
 '...': False,
 '/': False,
 '0,12,0/0,12,1': False,
 '0.12.1': False,
 '1.99': False,
 '1/3': False,
 '10': False,
 '11th': False,
 '12': False,
 '12est': False,
 '1st': False,
 '2': False,
 '2.0': False,
 '200': False,
 '21': False,
 '219': False,
 '23rd': False,
 '24': False,
 '25th': False,
 '26': False,
 '27th': False,
 '28': False,
 '29': False,
 '2nd': False,
 '34th': False,
 '3pm': False,
 '3rd': False,
 '4': False,
 '4th': False,
 '5-yearly': False,
 '5.1.1': False,
 '6': False,
 '6.1.6': False,
 '600': False,
 '7-8p': False,
 '8': False,
 '8-': False,
 '8/9': False,
 '8pm': False,
 '8th': False,
 '9': False,
 '9/23': False,
 '9th': False,
 ':': False,
 ';': False,
 '?': False,
 '``': False,
 'a': True,
 'absentia': False,
 'access': False,
 'account': False,
 'actually': False,


In [36]:
classifier.classify(test_data_features2)

'neg'