In [None]:
# Following the pickling notebooks from Chuck's 7.07 lesson

In [20]:
import pandas as pd
import pickle
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [7]:
# Load the cleaned, pre-processed data produced in Project 3.
# 'Unnamed: 0' is dropped right after reading (presumably a saved index
# column from the earlier export -- TODO confirm).
data_path = '~/dsi/submissions/Projects/project_3-master/cleaned_data/pre_proc_books_or_writing.csv'
df = pd.read_csv(data_path)
df = df.drop(columns='Unnamed: 0')

In [8]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,created_utc,text,type,subreddit
0,1616956496,"['hope', 'underrated', 'general', 'public']",comments,1
1,1616956492,"['try', 'short', 'stories', 'finished', 'produ...",comments,1
2,1616956472,"['talking', 'right']",comments,1
3,1616956387,"['isnt', 'cliché', 'fbi', 'doesnt', 'hire', 'c...",comments,1
4,1616956279,"['thats', 'good', 'point', 'probably', 'thinki...",comments,1


In [9]:
# Keep only the rows whose 'type' is 'submission' (the frame also holds
# comment rows), then report the resulting (rows, columns).
is_submission = df['type'].eq('submission')
submission_df = df.loc[is_submission]
submission_df.shape

(37077, 4)

In [10]:
# Draw the same number of rows from each subreddit class so the two classes
# are balanced; the fixed seed keeps the draw reproducible.
rows_per_class = 15_000
sample = (
    submission_df
    .groupby('subreddit')
    .sample(n=rows_per_class, random_state=42)
)

In [29]:
# Make 'subreddit' descriptive again: turn the numeric class codes into names.
# Values that are not keys of the lookup (including labels that have already
# been converted) pass through unchanged, so re-running this cell is harmless.
# A plain dict .map() would turn already-converted labels into NaN on a re-run.
subreddit_names = {0: 'books', 1: 'writing'}
sample['subreddit'] = sample['subreddit'].map(
    lambda code: subreddit_names.get(code, code)
)

In [30]:
# Features are the pre-processed text column; the target is the subreddit label.
X, y = sample['text'], sample['subreddit']

In [31]:
# Baseline check: the share of each class in the target. The largest share is
# the accuracy a model must beat to be better than always guessing one class.
class_shares = y.value_counts(normalize=True)
class_shares

books      0.5
writing    0.5
Name: subreddit, dtype: float64

In [32]:
# Train-test split: hold out 20% of the sample for testing. Stratifying on y
# keeps the class proportions the same in both splits, and the fixed seed
# makes the split reproducible.
split_options = dict(test_size=.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [33]:
# Extra filler words to exclude from the TF-IDF vocabulary. They are written
# without apostrophes to match the pre-processed tokens in the data.
my_stop = [
    'like',
    'im',
    'just',
    'dont',
    'ive',
]

In [34]:
# Two-stage text classifier: TF-IDF features (unigrams and bigrams, capped at
# 12,500 terms, custom stop words) feeding a logistic regression with C=0.1.
tvec = TfidfVectorizer(
    max_features=12_500,
    ngram_range=(1, 2),
    stop_words=my_stop,
)
logreg = LogisticRegression(C=0.1)
pipe = Pipeline(steps=[('tvec', tvec), ('logreg', logreg)])

In [35]:
# Fit the pipeline on the training split, then display the scores as
# (train, test) so over- or under-fitting is visible at a glance.
pipe.fit(X_train, y_train)
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
train_score, test_score

(0.9049583333333333, 0.9008333333333334)

In [36]:
# Show the pipeline's steps and their parameters before it is pickled.
pipe.named_steps

{'tvec': TfidfVectorizer(max_features=12500, ngram_range=(1, 2),
                 stop_words=['like', 'im', 'just', 'dont', 'ive']),
 'logreg': LogisticRegression(C=0.1)}

In [37]:
# Create the output directory if it is missing. exist_ok=True tolerates a
# directory that already exists; unlike a bare `except: pass`, any other
# failure (e.g. a permissions problem) still surfaces here instead of
# showing up later as a confusing error from open().
os.makedirs('./models', exist_ok=True)

# Serialise the fitted pipeline (vectoriser + classifier) so it can be
# reloaded later without retraining.
with open('./models/reddit_pipe.pkl', mode='wb') as pickle_out:
    pickle.dump(pipe, pickle_out)

In [38]:
# Round-trip check: read the pipeline back from disk, replacing the in-memory
# `pipe` with the unpickled copy.
# Only unpickle files you created or trust -- pickle.load can run arbitrary code.
with open('./models/reddit_pipe.pkl', 'rb') as saved_model:
    pipe = pickle.load(saved_model)

In [39]:
# Show the reloaded pipeline's steps; they should match what was pickled.
pipe.named_steps

{'tvec': TfidfVectorizer(max_features=12500, ngram_range=(1, 2),
                 stop_words=['like', 'im', 'just', 'dont', 'ive']),
 'logreg': LogisticRegression(C=0.1)}

In [40]:
# Hurrah! The reloaded pipeline shows the same steps and parameters as the one that was pickled.

In [41]:
# Try a prediction on a post taken from /r/books.
# predict() expects an iterable of documents, hence the one-element list.
# NOTE(review): the pipeline was fit on the pre-processed 'text' column, whose
# tokens have apostrophes already stripped (e.g. 'isnt', 'doesnt'). This input
# is raw text, so contractions will presumably be tokenised differently here --
# consider applying the same cleaning before predicting. TODO confirm.
pipe.predict(['This is just something funny I realized yesterday when I went to my local Goodwill. I came across 3 Twilight books and realized that not only did I always see Twilight books at every thrift shop I go to, but they were always the same ones: Twilight, Eclipse, and Breaking Dawn (rarely ever see New Moon for some reason). Personally I‘m not really into the Twilight series anymore (middle school me would be screaming if she heard me say that) so instead of buying them I just kind of see them as a game. Everytime I go to a new thrift shop I search for them like “alright, where are you now?” And I always find one. So yeah.... that’s what I was thinking about today. If you also shop for books at thrift shops, are there any other books you always find there (The Fault in Our Stars is another book that seems to make a regular appearance)?'])

array(['books'], dtype=object)

In [44]:
# Try a prediction on a post taken from /r/writing.
# predict() expects an iterable of documents, hence the one-element list.
pipe.predict(["Like you feel you have a really good story, but your writing ability would never do it justice? I'd be lying if I said I've never considered hiring an actual writer and just collaborating with them so the story is at least done right lol"])

array(['writing'], dtype=object)

In [55]:
# Interactive demo: classify a post typed (or pasted) by the user.
# NOTE: input() waits for the user, so this cell pauses a "Run All".
user_text = input('Please enter your subreddit post:')
# predict() takes a collection of documents, so wrap the single post in a list.
posts_to_classify = [user_text]
pipe.predict(posts_to_classify)

Please enter your subreddit post: Like you feel you have a really good story, but your writing ability would never do it justice? I'd be lying if I said I've never considered hiring an actual writer and just collaborating with them so the story is at least done right lol


array(['writing'], dtype=object)

In [None]:
## Photo by Oladimeji Ajegbile from Pexels