In [3]:
#Import the libraries 

import pandas as pd
import numpy as np
import json
import nltk
from nltk.corpus import stopwords
import nltk.data
from bs4 import BeautifulSoup as bs
import re
from tqdm import tqdm_notebook as tqdm
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import WordPunctTokenizer
from kafka import KafkaConsumer
from kafka import KafkaProducer

In [14]:
#Load the dataset
#Here we are using Stanford's Sentiment140 dataset, that has 1.6 million tweets and their sentiment

cols = ['sentiment', 'id', 'date', 'query', 'user', 'tweet']
pd.set_option('display.max_colwidth', -1)  # or 199
df = pd.read_csv("../resources/training.1600000.processed.noemoticon.csv", 
                        header=None, 
                        names=cols,
                        encoding="latin-1"
                   )

In [15]:
#0 is negative sentiment and 1 is positive
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

In [16]:
#Checking out the dataset
df.head(10)

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?"
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [17]:
#Only take relevant columns
train = df[['sentiment', 'tweet']]

In [18]:
train.head(10)

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?"
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


In [19]:
train.sentiment.value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [20]:
#With pre-clean-len we can check if all tweets are 140 characters or less
train.loc[:, "pre-clean-len"] = [len(t) for t in train.tweet]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
train[train["pre-clean-len"]>140].head(10)

In [None]:
#Removing usernames
text = bs(train.tweet[343], 'lxml').get_text()
re.sub(r'@[A-Za-z0-9]+', '', text)

In [None]:
train.tweet[0]

In [None]:
#Removing URLs
text = bs(train.tweet[0], 'lxml').get_text()
re.sub('https?://[A-Za-z0-9./]+', '', text)

In [None]:
#Removing any unparsable character in unicode
text = train['tweet'][226].encode('latin-1').decode('utf-8')
text.replace("�", "?")

In [None]:
train.tweet[175]

In [None]:
#Keep text only
text = train.tweet[175]
re.sub("[^a-zA-Z]", " ", text)

In [None]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = r'www.[^ ]+'
combined_pat = r'|'.join((pat1, pat2, pat3))
neg_dict = {"isn't":"is not", "wasn't":"was not", "aren't":"are not", "weren't":"were not", "haven't":"have not", 
            "hasn't":"has not", "hadn't":"had not", "won't":"will not", "shalln't":"shall not",
            "don't":"do not", "doesn't":"does not", "didn't":"did not", "shouldn't":"should not",
            "wouldn't":"would not", "couldn't":"could not", "mightn't":"might not", 
            "musn't":"must not"}

neg_pat = re.compile(r"\b(" + "|".join(neg_dict.keys()) + r")/b")


#Clean text - only text remains ; remove URLs, contractions, usernames
def clean(text):
    soup = bs(text, 'lxml').get_text()0
    try:
        bom_removed = soup.encode('latin-1').decode('utf-8-sig').replace("�", "?")
    except:
        #Don't know, why this exception comes ? - TBD;
        bom_removed = soup 
    stripped_pat = re.sub(combined_pat, '', bom_removed).lower()
    stripped_neg = neg_pat.sub(lambda x: neg_dict[x.group()], stripped_pat)
    stripped_text = re.sub('[^a-zA-Z]', ' ', stripped_neg)
    words = tok.tokenize(stripped_text)
    words = " ".join(words).strip()
    return words

In [None]:
%%time
cleaned_tweets = []

#Clean the training dataset
def clean_tweets(df):
    pbar = tqdm(range(0,len(df)))
    for i in pbar:
        #if ((i+1)%10000 == 0):
        #    print("Tweets %d out of %d have been processed" %(i+1, len(df)))
        cleaned_tweets.append(clean(df['tweet'][i]))

In [None]:
'''
#Using threads to clean the training data
from _thread import start_new_thread

num = [0, 400000, 800000, 120000, 160000]
cleaned_tweets = []


def clean_tweets_list(i):
    for i in range(num[i], num[i+1]):
        if ((i+1)%10000 == 0):
            print("Tweets %d out of %d have been processed" %(i+1, num[1]))
        cleaned_tweets.append(clean(train['tweet'][i]))
        
start_new_thread(clean_tweets_list, (0,))
start_new_thread(clean_tweets_list, (1,))
start_new_thread(clean_tweets_list, (2,))
start_new_thread(clean_tweets_list, (3,))
'''

In [None]:
clean_tweets(df)

In [None]:
cleaned_tweets

In [None]:
#Create dataset of cleaned tweet text,and its sentiment 
clean_tweets_df = pd.DataFrame(cleaned_tweets, columns=['tweets'])
clean_tweets_df['sentiment'] = df.sentiment

In [None]:
clean_tweets_df['tweets'][208]

In [None]:
#clean_tweets_df[clean_tweets_df.isnull().any(axis=1)]
clean_tweets_df[clean_tweets_df['tweets']==""].head()

In [None]:
df.iloc[208]

In [None]:
#Check for blank tweets, then drop them
blank_tweets = clean_tweets_df[clean_tweets_df['tweets']==""]

In [None]:
clean_tweets_df.drop(blank_tweets.index, inplace=True)

In [None]:
#Drop NaN
clean_tweets_df.dropna(inplace=True)
clean_tweets_df.reset_index(inplace=True, drop=True)

In [None]:
clean_tweets_df.head()

In [None]:
#Save to csv 
clean_tweets_df.to_csv("clean_tweets.csv", encoding="utf-8", index=False)

#Send this csv to analyseTweets with Kafka -TBD

In [None]:
while True:
    consumer_records = KafkaConsumer(in_topic)
    producer = KafkaProducer(bootstrap_servers=broker)
    for in_msg in consumer_records:
        out_msg = clean(in_msg)
        producer.send(out_topic, out_msg.encode())