# 03 Pre-Processing

In [1]:
# Import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

import spacy

In [2]:
# Load the cleaned posts for each subreddit from the data folder

vegan, plant = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/vegan_clean.csv', '../data/plant_clean.csv')
)

In [3]:
# Label every post with its source frame: vegan -> 0, plant -> 1

for frame, label in ((vegan, 0), (plant, 1)):
    frame['target'] = label

In [4]:
# Stack both labelled frames row-wise into a single frame of posts

posts = pd.concat(objs=(vegan, plant), axis='index')

In [5]:
# Count missing values in each column

posts.isna().sum()

title             4
selftext          0
created_utc       0
post_length       0
title_length      0
post_words        0
title_words       0
avg_word_post     0
avg_word_title    0
target            0
dtype: int64

title             4
selftext          0
created_utc       0
post_length       0
title_length      0
post_words        0
title_words       0
avg_word_post     0
avg_word_title    0
target            0
dtype: int64

In [6]:
#  Discard every row that still has a missing value

posts = posts.dropna()

In [7]:
# Features are the two text columns; the label is the target column

X = posts.loc[:, ['title', 'selftext']]
y = posts.loc[:, 'target']

In [8]:
# Stratified train-test split with a fixed seed so the split is reproducible

split_options = dict(stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Show the shape of each split: train features, train target, test features, test target

tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((27291, 2), (27291,), (9097, 2), (9097,))

((27291, 2), (27291,), (9097, 2), (9097,))

In [10]:
# Class proportions in each split; the majority share is the baseline accuracy

for target_split in (y_train, y_test):
    print(target_split.value_counts(normalize=True))

0    0.720714
1    0.279286
Name: target, dtype: float64
0    0.720677
1    0.279323
Name: target, dtype: float64
0    0.720714
1    0.279286
Name: target, dtype: float64
0    0.720677
1    0.279323
Name: target, dtype: float64


Both the train and test sets contain approximately 72% vegan posts, so this is the baseline accuracy score that our models will attempt to beat.

In [11]:
#  Load the spaCy pipeline used for tokenizing and lemmatizing posts

SPACY_MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [12]:
# spaCy exposes its stop words as a set, but scikit-learn documents the
# vectorizers' stop_words parameter as 'english', a list, or None, and
# releases with parameter validation reject other container types.
# Convert to a list (sorted so the vectorizer repr is stable between runs).
stopwords = sorted(nlp.Defaults.stop_words)

In [13]:
#  Create spacy tokenizer
#  The tokenizer lemmatizes each post and excludes lemmas whose POS is unrelated to post content

class spacy_tokenizer():
    """Callable tokenizer that returns the lemmas of a post, filtered by part of speech.

    Instances are meant to be passed as the ``tokenizer`` argument of a
    scikit-learn text vectorizer, which calls them once per document.

    Parameters
    ----------
    pipeline : callable, optional
        Callable that maps a string to an iterable of tokens exposing
        ``lemma_`` and ``pos_`` attributes. When omitted, the module-level
        ``nlp`` object is looked up at call time, so the instance itself
        stays lightweight to copy.
    """

    # Lower-cased POS tags whose tokens are dropped. A frozenset gives a
    # constant-time membership test and is built once rather than per token.
    EXCLUDED_POS = frozenset({'aux', 'punct', 'cconj', 'det', 'space',
                              'conj', 'adp', 'pron', 'sym'})

    def __init__(self, pipeline=None):
        self.pipeline = pipeline

    def __call__(self, post):
        """Return the list of lemmas in ``post`` whose POS tag is not excluded."""
        # Fall back to the notebook-level spaCy object when none was injected.
        parse = self.pipeline if self.pipeline is not None else nlp
        return [token.lemma_ for token in parse(post)
                if token.pos_.lower() not in self.EXCLUDED_POS]

In [14]:
#  TF-IDF vectorizers for titles and selftext
#  Both columns use the same settings, so build them from one factory;
#  each call creates a fresh vectorizer with its own tokenizer instance

def _build_tvec():
    """Return a new TfidfVectorizer configured with the shared settings."""
    return TfidfVectorizer(
        stop_words=stopwords,
        max_features=1000,
        min_df=3,
        max_df=.95,
        ngram_range=(1, 1),
        tokenizer=spacy_tokenizer(),
    )

tvec_t = _build_tvec()
tvec_s = _build_tvec()


We use small values for `max_features` and `ngram_range`, and do not search over combinations with GridSearch, because the tokenizer is computationally expensive and because of the further constraints that would arise from combining the vectorizer with a pipeline.

In [15]:
# Apply one vectorizer per text column; any other columns are passed through unchanged
column_vectorizers = [
    ('tvec_t', tvec_t, 'title'),
    ('tvec_s', tvec_s, 'selftext'),
]
ctx = ColumnTransformer(transformers=column_vectorizers,
                        remainder='passthrough',
                        n_jobs=-1)

In [16]:
# Fit the column transformer on the training split only.
# NOTE(review): the training posts are tokenized here and again by the later
# ctx.transform(X_train) call; ctx.fit_transform(X_train) would do both in one pass.
ctx.fit(X_train)

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('tvec_t',
                                 TfidfVectorizer(max_df=0.95, max_features=1000,
                                                 min_df=3,
                                                 stop_words={"'d", "'ll", "'m",
                                                             "'re", "'s", "'ve",
                                                             'a', 'about',
                                                             'above', 'across',
                                                             'after',
                                                             'afterwards',
                                                             'again', 'against',
                                                             'all', 'almost',
                                                             'alone', 'along',
                                                             'al

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('tvec_t',
                                 TfidfVectorizer(max_df=0.95, max_features=1000,
                                                 min_df=3,
                                                 stop_words={"'d", "'ll", "'m",
                                                             "'re", "'s", "'ve",
                                                             'a', 'about',
                                                             'above', 'across',
                                                             'after',
                                                             'afterwards',
                                                             'again', 'against',
                                                             'all', 'almost',
                                                             'alone', 'along',
                                                             'al

In [17]:
# Replace the raw training text with the output of the fitted transformer.
# NOTE(review): X_train is rebound here, so this cell is not safe to re-run on
# its own -- re-run the train-test split cell first.
X_train = ctx.transform(X_train)

In [18]:
# Transform the test text with the transformer fitted on the training split.
# NOTE(review): X_test is rebound here, so this cell is not safe to re-run on
# its own -- re-run the train-test split cell first.
X_test = ctx.transform(X_test)

In [19]:
#  Check shapes (should have 2000 columns each: up to 1000 title features plus up to 1000 selftext features)

tuple(matrix.shape for matrix in (X_train, X_test))

((27291, 2000), (9097, 2000))

((27291, 2000), (9097, 2000))

In [20]:
#  Rebuild dense dataframes whose columns are the transformer's output feature names

feature_names = ctx.get_feature_names_out()
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [23]:
# Save the vectorized train and test features to the data folder (no index column)

for features, csv_path in ((X_train, '../data/X_train_tvec.csv'),
                           (X_test, '../data/X_test_tvec.csv')):
    features.to_csv(csv_path, index=False)

In [24]:
# Save the train and test targets to the data folder (no index column)

for target, csv_path in ((y_train, '../data/y_train.csv'),
                         (y_test, '../data/y_test.csv')):
    target.to_csv(csv_path, index=False)