In [6]:
# Don't include names of people or cities
# Split words wherever a capital letter occurs

In [3]:
import re
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
def splitWords(t):
    """Separate run-together words and strip separator punctuation in a tweet.

    The text is first split on single spaces. Each resulting word is then
    split on the punctuation characters ``" | , ; ! : *`` and every
    lowercase-to-uppercase boundary (camelCase / PascalCase, as used in
    hashtags such as ``GlobalWarming``) is turned into a word break.

    Unlike the previous implementation, *every* capital-letter boundary is
    honoured (no text after the second capital is lost), punctuation is also
    removed from words that contain capitals, and empty fragments are dropped
    so the result has no doubled or leading spaces. Runs of capitals such as
    ``USA`` are kept together.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The pieces joined by single spaces.
    """
    pieces = []

    for word in t.split(' '):
        # Inside a character class '|' is a literal pipe, not alternation.
        for fragment in re.split(r'["|,;!|:*]', word):
            # Insert a space at each lower->upper transition; the lookarounds
            # are zero-width, so no character is consumed or lost.
            spaced = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', fragment)
            pieces.extend(part for part in spaced.split(' ') if part)

    return ' '.join(pieces)

In [20]:
def checkWord(token):
    """Decide whether a token should be kept as a feature word.

    A token is kept when it is purely alphabetic, is not a filler word
    ('link', 'http'), and is either a negation ('not', 'no') or a
    non-stop-word longer than three characters.

    The negation check is made *before* the stop-word check: spaCy's English
    stop-word list contains both 'not' and 'no', so testing ``is_stop`` first
    (as the earlier version did) rejected them and the negation exception
    could never apply. Comparisons use the lower-cased text so that e.g.
    'HTTP' or 'Not' are treated like their lower-case forms.

    Parameters
    ----------
    token : object
        Anything exposing ``text``, ``is_stop`` and ``is_alpha`` (in this
        notebook, a spaCy token).

    Returns
    -------
    bool
        True if the token should be kept.
    """
    negations = ('not', 'no')
    fillers = ('link', 'http')

    text = token.text.lower()

    if not token.is_alpha or text in fillers:
        return False

    # Negations are kept even though they are short stop words.
    if text in negations:
        return True

    return (not token.is_stop) and len(text) > 3

In [37]:
### Source: https://spacy.io/usage/linguistic-features

def spacyPipeline(tweets, min_tweet_len=5, nlp=None):
    """Lemmatise and filter tweets with spaCy, dropping tweets left too short.

    Each tweet is run through the spaCy pipeline; tokens accepted by
    ``checkWord`` are reduced to their lower-cased lemma and collected into a
    set (so each lemma appears at most once per tweet and word order is not
    kept). Tweets with fewer than ``min_tweet_len`` distinct lemmas are
    discarded.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts to process.
    min_tweet_len : int, optional
        Minimum number of distinct kept lemmas for a tweet to be retained.
        Defaults to 5, the value that used to be hard-coded.
    nlp : optional
        An already loaded spaCy pipeline to reuse. When None, the
        'en_core_web_sm' model is loaded on each call.

    Returns
    -------
    preprocessed_tweets : list of set of str
        Lemma sets of the retained tweets.
    indices : list of int
        Positions (0-based, in ``tweets``) of the retained tweets, in the
        same order as ``preprocessed_tweets``.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    indices = []
    preprocessed_tweets = []

    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the docs in input order, so enumerate() still gives each tweet's position.
    for index, doc in enumerate(nlp.pipe(tweets)):
        # The set already de-duplicates, so no separate membership test is
        # needed (the old one compared a non-lower-cased lemma against
        # lower-cased entries).
        filtered_tweet = {token.lemma_.lower() for token in doc if checkWord(token)}

        if len(filtered_tweet) >= min_tweet_len:
            preprocessed_tweets.append(filtered_tweet)
            indices.append(index)

    return preprocessed_tweets, indices

In [8]:
def convertClasses(c, indices):
    """Map raw labels to binary classes for the selected positions.

    For each position in ``indices`` (in the order given), the label
    ``c[position]`` becomes 0 when it is null/NaN, 'N' or 'No', and 1
    otherwise.

    The positions are iterated in their original order. Converting them to a
    ``set`` first (as was done before) does not guarantee iteration order,
    which could silently misalign the returned labels with the rows they
    belong to.

    Parameters
    ----------
    c : sequence
        Raw labels, indexable by position.
    indices : iterable of int
        Positions in ``c`` to convert.

    Returns
    -------
    list of int
        One 0/1 label per entry of ``indices``, in the same order.
    """
    neg = 0
    pos = 1
    negative_labels = ('N', 'No')

    classes = []
    for index in indices:
        label = c[index]
        # Check for missing values first so NaN never reaches the comparison.
        if pd.isnull(label) or label in negative_labels:
            classes.append(neg)
        else:
            classes.append(pos)

    return classes

In [14]:
def preprocess(data):
    """Turn the raw tweet table into TF-IDF features and binary labels.

    Reads the 'tweet' and 'existence' columns of ``data``. Tweets are passed
    through ``splitWords`` and ``spacyPipeline``; the tweets that survive the
    pipeline are vectorised with a freshly fitted ``TfidfVectorizer``, and the
    labels at the surviving positions are converted with ``convertClasses``.

    Returns a tuple ``(tfidf_tweets, classes)``: the TF-IDF matrix returned by
    ``fit_transform`` and the list of converted labels.
    """
    # Break up run-together words / punctuation before tokenisation
    split_tweets = [splitWords(raw) for raw in data['tweet']]

    # Filter and lemmatise; `kept` holds the positions of the retained tweets
    token_sets, kept = spacyPipeline(split_tweets)

    # Each retained tweet becomes one space-joined document for TF-IDF
    documents = [' '.join(tokens) for tokens in token_sets]
    vectorizer = TfidfVectorizer()
    tfidf_tweets = vectorizer.fit_transform(documents)

    # Labels only for the tweets that were retained
    raw_labels = data["existence"].tolist()
    classes = convertClasses(raw_labels, kept)

    return tfidf_tweets, classes