## Pre-Processing

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Import CountVectorizer and TfidfVectorizer from sklearn.feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the cleaned subreddit posts; the path is relative to the notebook's working directory.
posts_csv = './data/total_cleaned_data.csv'
data = pd.read_csv(posts_csv)

In [3]:
# Preview the loaded frame: one row per post with its subreddit, selftext and title.
data

Unnamed: 0,subreddit,selftext,title
0,LifeProTips,,LPT: Be patient with yourself and your growth....
1,LifeProTips,"Every year, we see on our doorbell cam that on...",LPT: When introducing someone new in the U.S. ...
2,LifeProTips,,LPT: Spend some time in silence daily. Being o...
3,LifeProTips,"Bought an older, vacant house and septic compa...",LPT: mark the opening to your septic tank with...
4,LifeProTips,"For example, you can use the blue daylight set...",LPT: Having trouble “logging off” or winding d...
...,...,...,...
3995,UnethicalLifeProTips,[removed],ULPT: [from my father when I was a little kid]...
3996,UnethicalLifeProTips,,ULPT: When you’re leaving a parking garage wit...
3997,UnethicalLifeProTips,,"ULPT: If you need to come up with a fake name,..."
3998,UnethicalLifeProTips,The LDS Church famously helps their church mem...,"ULPT: Before your next move, make friends with..."


In [4]:
# Encode the target as binary: LifeProTips -> 1, UnethicalLifeProTips -> 0.
SUBREDDIT_LABELS = {'LifeProTips': 1, 'UnethicalLifeProTips': 0}

# Guard against re-running this cell: Series.map turns every value that is not a
# key of the mapping into NaN, so mapping an already-encoded column would wipe
# out all labels. Only encode while the column does not yet hold just the codes.
if not data['subreddit'].isin(list(SUBREDDIT_LABELS.values())).all():
    data['subreddit'] = data['subreddit'].map(SUBREDDIT_LABELS)

In [5]:
# Confirm the encoding: subreddit should now hold 1 (LifeProTips) or 0 (UnethicalLifeProTips).
data

Unnamed: 0,subreddit,selftext,title
0,1,,LPT: Be patient with yourself and your growth....
1,1,"Every year, we see on our doorbell cam that on...",LPT: When introducing someone new in the U.S. ...
2,1,,LPT: Spend some time in silence daily. Being o...
3,1,"Bought an older, vacant house and septic compa...",LPT: mark the opening to your septic tank with...
4,1,"For example, you can use the blue daylight set...",LPT: Having trouble “logging off” or winding d...
...,...,...,...
3995,0,[removed],ULPT: [from my father when I was a little kid]...
3996,0,,ULPT: When you’re leaving a parking garage wit...
3997,0,,"ULPT: If you need to come up with a fake name,..."
3998,0,The LDS Church famously helps their church mem...,"ULPT: Before your next move, make friends with..."


In [6]:
# Target: the binary subreddit label.
y = data['subreddit']
# Features: only the post title text is used (selftext is left out).
X = data['title']

In [7]:
# Class proportions of the target; the larger share is the baseline accuracy a model must beat.
y.value_counts(normalize = True)

1    0.5
0    0.5
Name: subreddit, dtype: float64

In [8]:
# Hold out a test set; stratifying on y keeps the class balance equal in both
# splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

---
### CountVectorizer

In [9]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words
# (a plain CountVectorizer() would keep them).
# NOTE(review): the top-word counts at the end of this section show 'lpt' and
# 'ulpt' as the most frequent tokens. These title prefixes encode the target
# directly, so consider excluding them before modeling.
cvec = CountVectorizer(stop_words = 'english')

In [10]:
# Learn the vocabulary from the training titles only, so the test split stays unseen.
cvec.fit(X_train)

CountVectorizer(stop_words='english')

In [11]:
# Replace the raw training titles with their sparse word-count matrix.
# NOTE(review): this rebinds X_train to the transformed matrix, so the cell is
# not re-runnable on its own. Re-run the train/test split cell first to restore
# the raw text.
X_train = cvec.transform(X_train)

In [12]:
# (number of training titles, vocabulary size) of the vectorized training set.
X_train.shape

(3000, 1407)

In [13]:
# Dense view of the count matrix, for a quick visual check only.
X_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
# Vectorize the test titles with the vocabulary fitted on the training split (no refit).
# NOTE(review): X_test is rebound to the transformed matrix, so the raw test text
# is no longer available after this cell.
X_test = cvec.transform(X_test)

In [15]:
# Build a document-term DataFrame (one row per training title, one column per
# vocabulary word) so word counts can be inspected with pandas.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(cvec, 'get_feature_names_out'):
    vocab_columns = cvec.get_feature_names_out()
else:
    vocab_columns = cvec.get_feature_names()

train_df = pd.DataFrame(X_train.toarray(),  # plain ndarray rather than np.matrix
                        columns=vocab_columns)

train_df



Unnamed: 0,10,100,13,14,20,200,20hrs,2600,30,300,...,worthless,wrap,wrecks,write,xray,yard,years,yourname,youtube,zoos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Total occurrences of each vocabulary word across the training titles, ten most
# frequent first. (Series.T returns the Series itself, so no transpose is needed.)
word_totals = train_df.sum()
word_totals.sort_values(ascending=False).head(10)

lpt      1500
ulpt     1488
don       609
use       422
want      366
need      320
free      319
phone     293
just      257
time      256
dtype: int64