In [1]:
import re
import nltk
import random
import pickle
import string
import numpy as np
import pandas as pd
from statistics import mode
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

### Reading Data

In [2]:
data = pd.read_csv("./input_data1/tweet_data.csv",names = ["target","tweet no","date","---","user","tweet"],encoding="latin-1")
data = data.drop(columns=["tweet no","date","---","user"])
data.head(10)

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


In [11]:
# positive_tweets = data[data.target == 4]
# negative_tweets = data[data.target == 0]

# stop_words = list(set(stopwords.words('english')))
# nltk.download('averaged_perceptron_tagger')

# all_words = []
# documents = []

# for p in positive_tweets["tweet"]:
    
#     # create a list of tuples where the first element of each tuple is a review
#     # the second element is the label
#     documents.append( (p, "pos") )
    
#     # remove punctuations
#     cleaned = re.sub(r'[^(a-zA-Z)\s]','', p)
    
#     # tokenize 
#     tokenized = word_tokenize(cleaned)
    
#     # remove stopwords 
#     tweet = [w for w in tokenized if not w in stop_words]
    
#     ps = PorterStemmer()
#     tweet = [ ps.stem(word) for word in tweet ]
#     print (tokenized)
    
# for p in negative_tweets["tweet"]:
#         documents.append( (p, "neg") )
    
#     # remove punctuations
#         cleaned = re.sub(r'[^(a-zA-Z)\s]','', p)
    
#     # tokenize 
#         tokenized = word_tokenize(cleaned)
    
#     # remove stopwords 
#         stopped = [w for w in tokenized if not w in stop_words]
#         print (tokenized)

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/niharika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/niharika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/niharika/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Data Preprocessing

In [4]:
all_words = []
stop_words = list(set(stopwords.words('english')))
tweet_pos = data[data.target == 4]
tweet_neg = data[data.target == 0]

In [None]:
for tweet in  tweet_pos['tweet']:
    #remove @username
    tweet = re.sub('@[^\s]+','',tweet)
    
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # To lowercase
    tweet = tweet.lower()
    
    # Remove hyperlinks starting with http*
    tweet = re.sub(r'https?:\/\/.*\/\w*','', tweet)
    
    # Remove hyperlinks starting with www.?*
    tweet = re.sub(r'www.[^ ]+','', tweet)
    
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    
    # remove non ascii
    tweet = re.sub(r'[^\x00-\x7F]',' ', tweet)
    
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', tweet)

    # remove anything that is not alphanumeric
    tweet = re.sub('[\W_]+', ' ', tweet)
    
    # tokenize 
    tweet = word_tokenize(tweet)
    
    # remove stopwords 
    tweet = [w for w in tweet if not w in stop_words]
    
    ps = PorterStemmer()
    tweet = [ ps.stem(word) for word in tweet ] 
    for w in tweet:
        if w not in all_words:
            all_words.append(w)
    

In [None]:
for tweet in  tweet_neg['tweet']:
    #remove @username
    tweet = re.sub('@[^\s]+','',tweet)
    
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # To lowercase
    tweet = tweet.lower()
    
    # Remove hyperlinks starting with http*
    tweet = re.sub(r'https?:\/\/.*\/\w*','', tweet)
    
    # Remove hyperlinks starting with www.?*
    tweet = re.sub(r'www.[^ ]+','', tweet)
    
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    
    # remove non ascii
    tweet = re.sub(r'[^\x00-\x7F]',' ', tweet)
    
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', tweet)

    # remove anything that is not alphanumeric
    tweet = re.sub('[\W_]+', ' ', tweet)
    
    # tokenize 
    tweet = word_tokenize(tweet)
    
    # remove stopwords 
    tweet = [w for w in tweet if not w in stop_words]
    
    ps = PorterStemmer()
    tweet = [ ps.stem(word) for word in tweet ] 
    for w in tweet:
        if w not in all_words:
            all_words.append(w)

### Feature Construction

In [26]:
all_words = nltk.FreqDist(all_words)

In [28]:
def find_features(word_features,document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [29]:
tweet_tuple = []

for tweet in tweet_pos['tweet']:
    tweet_tuple.append( (tweet, "pos") )
    
for tweet in tweet_neg['tweet']:
    tweet_tuple.append( (tweet, "neg") )

### Model Building and Outputs

In [37]:
print("NLTK RESULTS ON TWEETS: )
print("------------------------------------------")

num_features = range(10000,100001,10000):
for n in num_features:
    word_features = list(all_words.keys())[:n]
    train_val_test = [(find_features(word_features,tweet), target) for (tweet, target) in tweet_tuple]
    
    t1 = int(len(train_val_test)*0.98)
    t2 = int(len(train_val_test)*0.99)
    train = train_val[:t1]
    val = train_val[t1:t2]
    test = train_val[t2:]
    
    t0 = time()
    classifier = nltk.NaiveBayesClassifier.train(train)
    train_time = time() - t0
    
    print("Time to train:",train_time)
    print("Validation accuracy percent:",(nltk.classify.accuracy(classifier, val))*100)
    print("Test accuracy percent:",(nltk.classify.accuracy(classifier, test))*100)

Classifier accuracy percent: 100.0
