In [43]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
### Source: https://spacy.io/usage/linguistic-features

def spacyPipeline(tweets, nlp=None):
    """Tokenize, filter and lemmatize tweets with spaCy.

    For every tweet, the lemma of a token is kept when the token

    * is not flagged as a stop word and is purely alphabetic,
    * has more than three characters,
    * is not "link" and not one of the topic words "global", "warming",
      "climate", "change" (compared on the surface text), and
    * has a lemma that was not already kept for this tweet.

    Tweets that end up with fewer than four lemmas are discarded and their
    position is reported instead.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    nlp : callable, optional
        An already-loaded spaCy pipeline (anything that maps a string to an
        iterable of tokens exposing ``text``, ``lemma_``, ``is_stop`` and
        ``is_alpha``). When omitted, ``'en_core_web_sm'`` is loaded, which is
        what happened unconditionally before; passing it in avoids reloading
        the model on every call.

    Returns
    -------
    preprocessed_tweets : list of list of str
        Kept lemmas for each surviving tweet, in input order.
    indices : list of int
        Positions in ``tweets`` of the tweets that were discarded.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Surface forms that are never kept, regardless of the other filters.
    excluded_words = {"link", "global", "warming", "climate", "change"}

    indices = []
    preprocessed_tweets = []
    for index, tweet in enumerate(tweets):
        filtered_tweet = []
        # Lemmas already kept for this tweet. The duplicate check must be
        # done on the lemma (the value that is stored), not on the surface
        # text, otherwise inflected repeats such as "cars ... cars" slip
        # through as two "car" entries.
        seen_lemmas = set()

        # Open questions: negations like "not" are stop words and get
        # dropped; contractions like "don't" / "can't" are not handled
        # specially either.
        for token in nlp(tweet):
            if token.is_stop or not token.is_alpha:
                continue
            if len(token.text) <= 3 or token.text in excluded_words:
                continue

            lemma = token.lemma_
            if lemma in seen_lemmas:
                continue
            seen_lemmas.add(lemma)
            filtered_tweet.append(lemma)

        if len(filtered_tweet) >= 4:
            preprocessed_tweets.append(filtered_tweet)
        else:
            indices.append(index)

    return preprocessed_tweets, indices

In [45]:
def convertClasses(c, indices):
    """Turn raw labels into binary classes, skipping the given positions.

    A label becomes 0 when it is null (per ``pd.isnull``) or equals 'N' or
    'No'; every other label becomes 1. Entries of ``c`` whose position
    appears in ``indices`` are left out of the result entirely.

    Parameters
    ----------
    c : iterable
        Raw labels, in order.
    indices : iterable of int
        Positions in ``c`` to skip.

    Returns
    -------
    list of int
        One 0/1 value per non-skipped label, in input order.
    """
    skipped = set(indices)

    def _to_class(label):
        # Null, 'N' and 'No' form the negative class; anything else is positive.
        is_negative = pd.isnull(label) or label == 'N' or label == 'No'
        return 0 if is_negative else 1

    return [
        _to_class(label)
        for position, label in enumerate(c)
        if position not in skipped
    ]

In [55]:
def preprocess(data):
    """Build TF-IDF features and binary classes from a tweet dataset.

    Steps: lowercase every entry of ``data['tweet']``, run the result through
    ``spacyPipeline``, join each surviving token list with spaces and fit a
    fresh ``TfidfVectorizer`` on those strings, then convert
    ``data["existence"]`` with ``convertClasses`` while skipping the tweets
    that ``spacyPipeline`` discarded, so features and classes stay aligned.

    Parameters
    ----------
    data : mapping-like
        Must provide a 'tweet' entry that iterates over strings and an
        "existence" entry with a ``tolist()`` method (presumably a pandas
        DataFrame -- confirm against the caller).

    Returns
    -------
    tfidf_tweets
        Result of ``TfidfVectorizer().fit_transform`` on the cleaned tweets.
    classes : list of int
        Binary class per surviving tweet.
    """
    # spaCy sees lowercased text only.
    lowered = [text.lower() for text in data['tweet']]

    # token_lists holds the kept lemmas per surviving tweet; dropped holds
    # the positions of tweets that were filtered out.
    token_lists, dropped = spacyPipeline(lowered)

    # TfidfVectorizer expects one string per document.
    documents = [' '.join(tokens) for tokens in token_lists]
    tfidf_tweets = TfidfVectorizer().fit_transform(documents)

    raw_labels = data["existence"].tolist()
    classes = convertClasses(raw_labels, dropped)

    return tfidf_tweets, classes