In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
### Source: https://spacy.io/usage/linguistic-features

def spacyPipeline(tweets, nlp=None):
    """Tokenize, filter and lemmatize tweets with spaCy.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts to process.
    nlp : spaCy pipeline, optional
        An already-loaded pipeline to reuse. When omitted, 'en_core_web_sm'
        is loaded inside this call, so callers that invoke this function
        repeatedly should load the model once and pass it in.

    Returns
    -------
    list of list of str
        One list per tweet, in input order, containing the lemma of every
        token that is alphabetic and is not a stop word.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # making one full pipeline call per tweet.
    return [
        [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        for doc in nlp.pipe(tweets)
    ]

In [3]:
def convertClasses(c):
    """Map raw labels to binary classes.

    A label is encoded as 0 when it is missing (``pd.isnull``) or is exactly
    the string 'N' or 'No'; every other value is encoded as 1.

    Parameters
    ----------
    c : iterable
        Raw label values.

    Returns
    -------
    list of int
        One 0/1 entry per input label, in input order.
    """
    NEGATIVE, POSITIVE = 0, 1
    return [
        NEGATIVE if pd.isnull(label) or label == 'N' or label == 'No' else POSITIVE
        for label in c
    ]

In [4]:
def preprocess(data):
    """Build TF-IDF features and binary labels from a tweet dataset.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'tweet' entry whose elements support ``.lower()`` and
        an "existence" entry that exposes ``.tolist()``.

    Returns
    -------
    tuple
        ``(tfidf_tweets, classes)``: the result of
        ``TfidfVectorizer().fit_transform`` on the cleaned tweets, and the
        list returned by ``convertClasses`` for the "existence" values.
    """
    # Lowercase first, so the spaCy pipeline only ever sees lowercased text
    lowered = [text.lower() for text in data['tweet']]
    token_lists = spacyPipeline(lowered)

    # Drop the literal token 'link' and every token of three characters or
    # fewer, then rebuild each tweet as a space-separated string
    documents = [
        ' '.join(tok for tok in tokens if tok != 'link' and len(tok) > 3)
        for tokens in token_lists
    ]

    # Fit the TF-IDF vocabulary on these tweets and vectorize them
    vectorizer = TfidfVectorizer()
    tfidf_tweets = vectorizer.fit_transform(documents)

    # Encode the labels as 0/1
    classes = convertClasses(data["existence"].tolist())

    return tfidf_tweets, classes