In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
from nltk.corpus import stopwords

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [5]:
from sklearn.compose import ColumnTransformer

In [6]:
from sklearn.naive_bayes import MultinomialNB

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
vegan = pd.read_csv('vegan.csv', encoding='latin-1')

In [9]:
vegan.head()

Unnamed: 0,title,selftext,created_utc
0,How do I get involved in Direct Action activism?,,1679864469
1,Is it vegan to buy non-vegan food with 'too go...,'Vegan' friend of mine did. And i honestly was...,1679864006
2,Hmmmm,,1679863672
3,has any damage been done if i havenât been s...,[removed],1679862449
4,Trying to be a vegatarian,[removed],1679861712


In [10]:
vegan['target'] = 0

In [11]:
vegan.head()

Unnamed: 0,title,selftext,created_utc,target
0,How do I get involved in Direct Action activism?,,1679864469,0
1,Is it vegan to buy non-vegan food with 'too go...,'Vegan' friend of mine did. And i honestly was...,1679864006,0
2,Hmmmm,,1679863672,0
3,has any damage been done if i havenât been s...,[removed],1679862449,0
4,Trying to be a vegatarian,[removed],1679861712,0


In [12]:
plant = pd.read_csv('plant.csv', encoding='latin-1')

In [13]:
plant['target'] = 1

In [14]:
for df in [vegan,plant]:
    df['length'] = df['selftext'].apply(lambda x: len(str(x)))
    df['word_count'] = df['selftext'].apply(lambda x: str(x).count(' '))
    df.dropna(inplace=True)

In [15]:
posts = pd.concat([vegan,plant],axis=0)

In [16]:
posts.head()

Unnamed: 0,title,selftext,created_utc,target,length,word_count
1,Is it vegan to buy non-vegan food with 'too go...,'Vegan' friend of mine did. And i honestly was...,1679864006,0,550,100
3,has any damage been done if i havenât been s...,[removed],1679862449,0,9,0
4,Trying to be a vegatarian,[removed],1679861712,0,9,0
6,Question for vegans with chickens,"Hi everyone, so Iâm vegan and rescued some c...",1679860400,0,1190,217
7,Crispy Thai chilies with sesame seeds by Chili...,[removed],1679859412,0,9,0


In [17]:
posts = posts.loc[posts['length'] > 9, :]

In [18]:
posts = posts.loc[posts['word_count'] > 2, :]

In [73]:
X = posts['selftext']
y = posts['target']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify = y, 
                                                    random_state = 42)

In [75]:
X_train.shape

(18587,)

In [76]:
y_train.shape

(18587,)

In [77]:
y_train.value_counts(normalize=True)

0    0.692097
1    0.307903
Name: target, dtype: float64

In [78]:
nlp = spacy.load('en_core_web_md')

In [79]:
def spacy_tokenizer(words):
    words = nlp(words)
    tokens = [token.lemma_ for token in words if token.pos_.lower() not in ['aux','punct','cconj','det','space','conj','adp','pron']]
    return tokens

In [80]:
tk_stop = stopwords.words('english')

In [81]:
pipe_params = {
    'tvec__min_df': [2,3],
    'tvec__max_df': [.9,.95],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words':['english',None]
}

In [83]:
pipe = Pipeline([
    ('tvec',TfidfVectorizer(tokenizer=spacy_tokenizer)),
    ('mnb',MultinomialNB())
])

In [84]:
gs = GridSearchCV(pipe,
                  param_grid = pipe_params,
                  cv=5,
                 verbose=1)

In [85]:
%time 

gs.fit(X_train, y_train)

Wall time: 0 ns
Fitting 5 folds for each of 16 candidates, totalling 80 fits




KeyboardInterrupt: 

In [152]:
cvec = CountVectorizer()

In [147]:
tfidf = TfidfVectorizer(input = 'content', tokenizer = spacy_tokenizer)

In [155]:
ctx = ColumnTransformer(transformers = [
    ('cvec', CountVectorizer(), ['title','selftext'])
],remainder = 'passthrough')

In [176]:
pipe = Pipeline(steps = [
    ('tfidf', TfidfVectorizer()),
    ('mulNB',MultinomialNB())
])

In [177]:
pipe.fit(X_train,y_train)

ValueError: Found input variables with inconsistent numbers of samples: [5, 18587]

In [None]:
df = []

for token in words:
    tk_dict = {
        'text': token.text,
        'POS': token.pos_,
        'POS_exp':spacy.explain(token.pos_),
        'dependency': token.dep_,
        'dep_exp': spacy.explain(token.dep_),
        'lemma': token.lemma_
    }
    df.append(tk_dict)