### Importation des données 

In [1]:
import nltk
from nltk.corpus import twitter_samples

In [2]:
happy = twitter_samples.strings('positive_tweets.json')
print (len(happy)) 
 
sad = twitter_samples.strings('negative_tweets.json')
print (len(sad)) 

nane = twitter_samples.strings('tweets.20150430-223406.json')
print (len(nane))

5000
5000
20000


### Preparation des données

In [3]:
happy_tokens = twitter_samples.tokenized('positive_tweets.json')
sad_tokens = twitter_samples.tokenized('negative_tweets.json')
nane_tokens = twitter_samples.tokenized('tweets.20150430-223406.json')

In [4]:
import string
import re

In [5]:
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
#Positive smiles
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# negative smiles
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])
 
# all smiles
emoticons = emoticons_happy.union(emoticons_sad)

In [6]:
def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
 
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
 
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
 
    return tweets_clean

# Préparation du modéle

In [7]:
# order the features positive and negative 
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
 

#all positive tweets in one list
pos_tweets_set = []
for tweet in happy:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))    

#all negative tweets in one list
neg_tweets_set = []
for tweet in sad:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))

#verify the length is the same as in the original one or no
print (len(pos_tweets_set), len(neg_tweets_set)) 

5000 5000


### Préparation des données pour le modèle d’apprentissage 

In [8]:
# choose the positive and negative tweets randomly
from random import shuffle 
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
#split the data in two parts : test and train 
test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]

#size of test and train
print(len(test_set),  len(train_set)) 

2000 8000


# Construction d’un modèle d’apprentissage 

### train

In [12]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765
 
print (classifier.show_most_informative_features(10))   

0.7315
Most Informative Features
                     via = True              pos : neg    =     36.3 : 1.0
                      aw = True              neg : pos    =     25.0 : 1.0
                    glad = True              pos : neg    =     24.3 : 1.0
                     bam = True              pos : neg    =     23.7 : 1.0
                    cool = True              pos : neg    =     21.7 : 1.0
                     x15 = True              neg : pos    =     21.0 : 1.0
                    blog = True              pos : neg    =     18.3 : 1.0
                     sad = True              neg : pos    =     17.7 : 1.0
                opportun = True              pos : neg    =     15.0 : 1.0
               goodnight = True              pos : neg    =     14.3 : 1.0
None


### test

In [14]:
#test with a positive review
twt_pos = "it is bad . I hated the film. "
pos_tweet_set = bag_of_words(twt_pos)
print (classifier.classify(pos_tweet_set)) 

pos_result = classifier.prob_classify(pos_tweet_set)
print (pos_result) 
print (pos_result.max()) 
print (pos_result.prob("neg")) 
print (pos_result.prob("pos")) 
 
 
neg_tweet = "It is a great film.I loved it."
neg_tweet_set = bag_of_words(neg_tweet)
 
print (classifier.classify(neg_tweet_set)) 

 
neg_result = classifier.prob_classify(neg_tweet_set)
print (neg_result) 
print (neg_result.max()) 
print (neg_result.prob("neg")) 
print (neg_result.prob("pos")) 

neg
<ProbDist with 2 samples>
neg
0.8795697043904147
0.12043029560958547
pos
<ProbDist with 2 samples>
pos
0.030942659328954396
0.9690573406710452
