In [3]:
import nltk
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\P10506243\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [16]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)


In [5]:
if __name__ == "__main__":

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]
    print(train_data)


[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [6]:
positive_dataset

[({'#followfriday': True,
   'top': True,
   'engage': True,
   'member': True,
   'community': True,
   'week': True,
   ':)': True},
  'Positive'),
 ({'hey': True,
   'james': True,
   'odd': True,
   ':/': True,
   'please': True,
   'call': True,
   'contact': True,
   'centre': True,
   '02392441234': True,
   'able': True,
   'assist': True,
   ':)': True,
   'many': True,
   'thanks': True},
  'Positive'),
 ({'listen': True,
   'last': True,
   'night': True,
   ':)': True,
   'bleed': True,
   'amazing': True,
   'track': True,
   'scotland': True},
  'Positive'),
 ({'congrats': True, ':)': True}, 'Positive'),
 ({'yeaaaah': True,
   'yippppy': True,
   'accnt': True,
   'verify': True,
   'rqst': True,
   'succeed': True,
   'get': True,
   'blue': True,
   'tick': True,
   'mark': True,
   'fb': True,
   'profile': True,
   ':)': True,
   '15': True,
   'day': True},
  'Positive'),
 ({'one': True,
   'irresistible': True,
   ':)': True,
   '#flipkartfashionfriday': True},
  'P

In [7]:
negative_dataset

[({'hopeless': True, 'tmr': True, ':(': True}, 'Negative'),
 ({'everything': True,
   'kid': True,
   'section': True,
   'ikea': True,
   'cute': True,
   'shame': True,
   "i'm": True,
   'nearly': True,
   '19': True,
   '2': True,
   'month': True,
   ':(': True},
  'Negative'),
 ({'heart': True, 'slide': True, 'waste': True, 'basket': True, ':(': True},
  'Negative'),
 ({'“': True,
   'hate': True,
   'japanese': True,
   'call': True,
   'ban': True,
   ':(': True,
   '”': True},
  'Negative'),
 ({'dang': True,
   'start': True,
   'next': True,
   'week': True,
   'work': True,
   ':(': True},
  'Negative'),
 ({'oh': True, 'god': True, 'baby': True, 'face': True, ':(': True},
  'Negative'),
 ({'make': True, 'smile': True, ':(': True}, 'Negative'),
 ({'work': True,
   'neighbour': True,
   'motor': True,
   'asked': True,
   'say': True,
   'hat': True,
   'update': True,
   'search': True,
   ':(': True},
  'Negative'),
 ({':(': True, 'sialan': True}, 'Negative'),
 ({'athabasca'

In [11]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

In [9]:
positive_tweets

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [10]:
positive_tweet_tokens

[['#FollowFriday',
  '@France_Inte',
  '@PKuchly57',
  '@Milipol_Paris',
  'for',
  'being',
  'top',
  'engaged',
  'members',
  'in',
  'my',
  'community',
  'this',
  'week',
  ':)'],
 ['@Lamb2ja',
  'Hey',
  'James',
  '!',
  'How',
  'odd',
  ':/',
  'Please',
  'call',
  'our',
  'Contact',
  'Centre',
  'on',
  '02392441234',
  'and',
  'we',
  'will',
  'be',
  'able',
  'to',
  'assist',
  'you',
  ':)',
  'Many',
  'thanks',
  '!'],
 ['@DespiteOfficial',
  'we',
  'had',
  'a',
  'listen',
  'last',
  'night',
  ':)',
  'As',
  'You',
  'Bleed',
  'is',
  'an',
  'amazing',
  'track',
  '.',
  'When',
  'are',
  'you',
  'in',
  'Scotland',
  '?',
  '!'],
 ['@97sides', 'CONGRATS', ':)'],
 ['yeaaaah',
  'yippppy',
  '!',
  '!',
  '!',
  'my',
  'accnt',
  'verified',
  'rqst',
  'has',
  'succeed',
  'got',
  'a',
  'blue',
  'tick',
  'mark',
  'on',
  'my',
  'fb',
  'profile',
  ':)',
  'in',
  '15',
  'days'],
 ['@BhaktisBanter',
  '@PallaviRuhail',
  'This',
  'one',
  '

In [12]:
all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [13]:
all_pos_words

<generator object get_all_words at 0x000001C3BAA18D60>

In [14]:
freq_dist_pos = FreqDist(all_pos_words)

In [15]:
freq_dist_pos

FreqDist({':)': 3691, ':-)': 701, ':d': 658, 'thanks': 388, 'follow': 357, 'love': 333, '...': 290, 'good': 283, 'get': 263, 'thank': 253, ...})