In [1]:
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt

In [1]:
# Load the raw tweet dump; bytes that cannot be decoded are skipped
# (encoding_errors="ignore") instead of aborting the read.
data = pd.read_csv(
    "../input/large-random-tweets-from-pakistan/"
    "Random Tweets from Pakistan- Cleaned- Anonymous.csv",
    encoding_errors="ignore",
)

### Exploratory Data Analysis

In [1]:
# Preview the first rows of the loaded frame.
data.head()

In [1]:
# Dimensions of the loaded frame.
data.shape

In [1]:
# As we are doing NLP related tasks, I'll keep only text column.
# Note: `data` is rebound here from the full frame to the single
# `full_text` column, so every later cell works on that column only.
data = data['full_text']

###  Removing empty values

In [1]:
# Drop missing entries: the regex-based cleaning in the following cells
# calls string functions on every value.
data = data.dropna()

### Filtering & Cleaning

In [1]:
# Regular expression to filter out Urdu text: matches every run of characters
# in the Unicode "Arabic" block (U+0600-U+06FF), which holds the Urdu alphabet.
# The character class must be written as a range; a class containing only
# "-" and U+06FF would leave Urdu text untouched and strip hyphens instead.
reg = re.compile(r'[\u0600-\u06FF]+', re.UNICODE)
data = data.apply(lambda x: reg.sub("", x))

In [1]:
def _clean_tweet_text(text):
    """Normalise one raw tweet string before de-duplication and tokenising.

    Steps, in order: lowercase, remove hyperlinks, remove @mentions, remove
    hash signs and the "rt : " retweet marker, then collapse whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Lowercased text without URLs, mentions, '#' or retweet markers, with
        every run of whitespace (including newlines) reduced to one space and
        no leading/trailing whitespace.
    """
    # converting to lowercase letters
    text = text.lower()

    # remove hyperlinks: match only the URL token itself, so that any words
    # following the link on the same line are kept
    text = re.sub(r'https?://\S+', '', text)

    # removing @Mentions: '@' followed by word characters, which also covers a
    # mention at the very end of the tweet and never swallows the next word
    text = re.sub(r'@\w+', '', text)

    # removing extra symbols
    text = text.replace('#', '')
    # "rt : " is what remains of "rt @user: ..." once the mention is gone;
    # \b keeps the pattern from firing inside words such as "smart : "
    text = re.sub(r'\brt\s+:\s+', '', text)

    # removing extra spaces: done last so the gaps left by the removals above
    # (and newlines) are collapsed too; otherwise identical tweets could
    # differ only by whitespace and survive drop_duplicates()
    return re.sub(r'\s+', ' ', text).strip()


data = data.apply(_clean_tweet_text)

In [1]:
# Number of tweets before de-duplication.
data.shape

In [1]:
# removing duplicated data: tweets whose cleaned text is identical to
# another tweet are dropped
data = data.drop_duplicates()

In [1]:
# Number of tweets left after de-duplication.
data.shape

## NLP Preprocessing

In [1]:
import nltk
import string 
from nltk.corpus import stopwords  
from nltk.stem import PorterStemmer  
from nltk.tokenize import TweetTokenizer
import emoji
from nltk.stem import WordNetLemmatizer

# stop-word lists read by stopwords.words('english') in later cells
nltk.download('stopwords')

# dictionary for lemmatization
nltk.download('wordnet')

### Tokenizing tweets

In [1]:
# Turn every tweet string into a list of tokens. The tokenizer is configured
# not to preserve case, to strip @handles and to reduce lengthened words.
# From here on each element of `data` is a list of tokens, not a string.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                           reduce_len=True)
data = data.apply(tokenizer.tokenize)

### Removing stop words

In [1]:
stopwords_english = stopwords.words('english')
# Set copy of the stop-word list for constant-time membership tests in clean().
_stopwords_english_set = frozenset(stopwords_english)


def clean(x):
    """Remove uninformative tokens from one tokenised tweet.

    Parameters
    ----------
    x : list of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        The tokens of ``x``, in order, that are not English stop words, are
        not found in ``string.punctuation``, and are either longer than one
        character or a single emoji.
    """
    return [
        y for y in x
        if y not in _stopwords_english_set
        and y not in string.punctuation
        and (len(y) > 1 or emoji.is_emoji(y))
    ]


# Store the result back in `data`; binding it to any other name would discard
# the cleaning, because the following cells read `data`.
data = data.apply(clean)

### Stemming

In [1]:
stemmer = PorterStemmer()


def stem(x):
    """Return the Porter stem of each token in ``x``, keeping the order."""
    return list(map(stemmer.stem, x))


# Stem every tokenised tweet; the result is kept separately from `data`.
stemmed_tweets = data.apply(stem)

### Getting Features from already labelled Tweets

In [1]:
from nltk.corpus import twitter_samples

nltk.download('twitter_samples')

# The labelled corpus is read from two files, one per sentiment class.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [1]:
def process_tweet(tweet):
    """Convert a raw tweet string into a list of cleaned, stemmed tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Porter stems of the tweet's tokens, in order, excluding English stop
        words and tokens found in ``string.punctuation``.
    """
    stemmer = PorterStemmer()
    # A set makes the per-token stop-word test constant time.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match the URL token only, so that words following a
    # link on the same line are not thrown away with it
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # drop stop words and punctuation, then stem what is left
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [1]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each label.

    Parameters
    ----------
    tweets : list of str
        Raw tweets; each one is tokenised with ``process_tweet``.
    ys : array-like
        One label per tweet, in the same order as ``tweets``. Size-1
        dimensions are squeezed out, so a column vector is accepted.

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of occurrences of ``word`` in
        tweets carrying ``label``.
    """
    # atleast_1d keeps a single label as a one-element list; squeeze alone
    # would turn it into a bare scalar that zip() cannot iterate over.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

### Building positive and negative word frequencies

In [1]:
# The first 4000 tweets of each class are used for training, the rest for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positive tweets come first, negative tweets second, in both splits.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the same ordering: 1 for positive, 0 for negative.
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

freqs = build_freqs(train_x, train_y)

In [1]:
def train_naive_bayes(freqs, train_x, train_y):
    """Estimate the log prior and per-word log likelihood ratios.

    Word probabilities use add-one smoothing:
    ``P(w | class) = (count(w, class) + 1) / (N_class + V)``.

    Parameters
    ----------
    freqs : dict
        Maps ``(word, label)`` to a count. Counts whose label is greater than
        zero are treated as positive, all others as negative; per-word counts
        are read from the keys ``(word, 1)`` and ``(word, 0)``.
    train_x : list
        Not used by the computation; kept so existing calls keep working.
    train_y : array-like
        Document labels; values greater than zero count as positive
        documents, values less than or equal to zero as negative ones.

    Returns
    -------
    logprior : float
        ``log(D_pos) - log(D_neg)``, the log ratio of positive to negative
        document counts.
    loglikelihood : dict
        Maps each vocabulary word to ``log(P(w | pos) / P(w | neg))``.
    """
    # V: number of unique words in the vocabulary
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word occurrences under each class
    N_pos = sum(count for (_, label), count in freqs.items() if label > 0)
    N_neg = sum(count for (_, label), count in freqs.items() if not label > 0)

    # D_pos / D_neg: number of documents in each class
    labels = np.asarray(train_y)
    D_pos = np.count_nonzero(labels > 0)
    D_neg = np.count_nonzero(labels <= 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # a word never seen under a class has a raw count of 0
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # smoothed probability of the word under each class
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

def lookup(freqs, word, label):
    """Return the count stored for ``(word, label)`` in ``freqs``, or 0 if absent."""
    return freqs.get((word, label), 0)

# Fit the model on the labelled training split; NB_predict reads these two
# module-level names.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

### Testing on our tweet dataset

In [1]:
def NB_predict(tweet):
    """Score a tokenised tweet with the module-level Naive Bayes model.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet.

    Returns
    -------
    The module-level ``logprior`` plus the ``loglikelihood`` value of every
    token that has an entry; tokens without an entry contribute nothing.
    """
    score = 0 + logprior
    known_scores = (loglikelihood[tok] for tok in tweet if tok in loglikelihood)
    for contribution in known_scores:
        score += contribution
    return score

In [1]:
# Take the sample from the stemmed tokens: the keys of `loglikelihood` are
# stems (process_tweet stems every training token), so unstemmed tokens would
# frequently miss the lookup in NB_predict.
# Select by position with .iloc: dropna()/drop_duplicates() leave gaps in the
# index labels, so a row labelled 122 is not guaranteed to exist.
sample_tweet = stemmed_tweets.iloc[122]
sample_tweet

In [1]:
# A score above zero means the summed evidence favours the positive class;
# zero or below is reported as negative.
if NB_predict(sample_tweet) > 0:
    print("tweet conveys positive sentiment")

else:
    print("tweet has negative sentiment")