### Imports 

In [10]:
import os
import math
import random
import numpy as np
import pandas as pd
import requests
import csv
import time
from bs4 import BeautifulSoup
import unicodedata

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg #another version of opencv display image. I think it's just rendering things that have a height, width, and color dimensionality to them

from tensorflow.keras import layers
from tensorflow.keras import Model

import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator #using because this is a cnn and we want it to handle images in a fast manner
from tensorflow.keras.preprocessing.image import img_to_array, load_img

import nltk
from nltk.stem import WordNetLemmatizer

# Special
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


In [2]:
data = pd.read_csv('./data/phys_chem_reddit_data.csv')

# Text Wrangling and Preprocessing
- using nltk and spacy

Outline:
- Start by using CountVectorizer to convert text data into a structured, numeric `X` dataframe
- Removing accented characters
- Expanding contractions (doesn't work because of Java error)
- Removing special characters
- Tokenizing
- Exploring stemming
- Exploring lemmatizing
- Removing stop words (adding a few that aren't there)

- Extract features from unstructured text by fitting and transforming with `CountVectorizer` and `TfidfVectorizer`.
- Describe how CountVectorizers and TF-IDFVectorizers work.
- Understand `stop_words`, `max_features`, `min_df`, `max_df`, and `ngram_range`.
- Implement `CountVectorizer` and `TfidfVectorizer` in a spam classification model.
- Use `GridSearchCV` and `Pipeline` with `CountVectorizer`.

#### Removing special characters

In [19]:
import re

In [20]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)
# from Dipanjan (DJ) Sarkar

'Well this was fun What do you think '

In [61]:
data['selftext_nochar'] = data.apply(lambda row: remove_special_characters(row['selftext'], remove_digits=True), axis = 1)

In [70]:
data['title_nochar'] = data.apply(lambda row: remove_special_characters(row['title'], remove_digits=True), axis = 1)

In [71]:
data['title'][0]

'Choice of GPU for running MD simulation (NAMD and GROMACS)'

In [72]:
data['title_nochar'][0]

'Choice of GPU for running MD simulation NAMD and GROMACS'

#### Removing accented characters

In [8]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')
# from Dipanjan (DJ) Sarkar

'Some Accented text'

In [65]:
data['selftext_noacccent'] = data.apply(lambda row: remove_special_characters(row['selftext_nochar'], remove_digits=True), axis = 1)

In [73]:
data['title_noacccent'] = data.apply(lambda row: remove_special_characters(row['title_nochar'], remove_digits=True), axis = 1)

In [74]:
data['title'][0]

'Choice of GPU for running MD simulation (NAMD and GROMACS)'

In [75]:
data['title_noacccent'][0]

'Choice of GPU for running MD simulation NAMD and GROMACS'

#### Expanding contractions

In [16]:
# from https://pypi.org/project/pycontractions/
#!pip install pycontractions
#get error: "to use the 'java' command-line tool you need to install a JDK"

#### Put them all together, create new dataframe with cleaned columns

In [77]:
df = data[['subreddit', 'selftext_noacccent', 'title_noacccent']].copy()

In [79]:
df.columns = ['subreddit', 'selftext', 'title']
df.head()

Unnamed: 0,subreddit,selftext,title
0,Physics,I am planning to set up a home PC with AMD Ryz...,Choice of GPU for running MD simulation NAMD a...
1,Physics,Ok So I have a bit of a bizarre question I bel...,Balance Bird vs Spin top
2,Physics,Would this account for spin Observations in q...,Do electrons have convective cores
3,Physics,According to the [Bekenstein bound]httpsenwiki...,Is quantum computing dangerous or impposible
4,Physics,So I understand conceptually the Big Rip but w...,On The Big Rip and The Mighty Quark


#### Tokenizing

In [25]:
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [26]:
# Instantiate Tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [92]:
df['selftext_tokenized'] = df.apply(lambda row: tokenizer.tokenize(row['selftext']), axis = 1)

In [94]:
df['title_tokenized'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis = 1)

In [95]:
df.head()

Unnamed: 0,subreddit,selftext,title,selftext_tokenized,title_tokenized
0,Physics,I am planning to set up a home PC with AMD Ryz...,Choice of GPU for running MD simulation NAMD a...,"[I, am, planning, to, set, up, a, home, PC, wi...","[Choice, of, GPU, for, running, MD, simulation..."
1,Physics,Ok So I have a bit of a bizarre question I bel...,Balance Bird vs Spin top,"[Ok, So, I, have, a, bit, of, a, bizarre, ques...","[Balance, Bird, vs, Spin, top]"
2,Physics,Would this account for spin Observations in q...,Do electrons have convective cores,"[Would, this, account, for, spin, Observations...","[Do, electrons, have, convective, cores]"
3,Physics,According to the [Bekenstein bound]httpsenwiki...,Is quantum computing dangerous or impposible,"[According, to, the, Bekenstein, bound, httpse...","[Is, quantum, computing, dangerous, or, imppos..."
4,Physics,So I understand conceptually the Big Rip but w...,On The Big Rip and The Mighty Quark,"[So, I, understand, conceptually, the, Big, Ri...","[On, The, Big, Rip, and, The, Mighty, Quark]"


#### Lemmatizing

In [81]:
# Import lemmatizer. 
from nltk.stem import WordNetLemmatizer

# Instantiate lemmatizer. 
lemmatizer = WordNetLemmatizer()

In [117]:
#define funciton to lemmatize words in string
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)]

In [118]:
df['selftext_lemmatized'] = df.apply(lambda row: lemmatize_text(row['selftext']), axis = 1)

In [119]:
df['title_lemmatized'] = df.apply(lambda row: lemmatize_text(row['title']), axis = 1)

#### Stopwords

In [129]:
# Import stopwords.
from nltk.corpus import stopwords

In [130]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophiascarano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [131]:
#define funciton to remove stop words in string
def no_stop_text(text):
    text = [token for token in text if token not in stopwords.words('english')]
    return text

In [137]:
df['selftext_nostop'] = df.apply(lambda row: no_stop_text(row['selftext_lemmatized']), axis = 1)

In [135]:
df['title_nostop'] = df.apply(lambda row: no_stop_text(row['title_lemmatized']), axis = 1)

#### Stemming
- a bit of a crude way to lemmatize. Might prove to be useful

In [122]:
# Import stemmer.
from nltk.stem.porter import PorterStemmer

# Instantiate object of class PorterStemmer.
p_stemmer = PorterStemmer()

In [142]:
#define funciton to stem words in string
def stem_text(text):
    return [p_stemmer.stem(w) for w in text]

In [143]:
df['selftext_stemmed'] = df.apply(lambda row: stem_text(row['selftext_nostop']), axis = 1)

In [144]:
df['title_stemmed'] = df.apply(lambda row: stem_text(row['title_nostop']), axis = 1)

In [145]:
df.head()

Unnamed: 0,subreddit,selftext,title,selftext_tokenized,title_tokenized,selftext_lemmatized,title_lemmatized,selftext_stemmed,title_stemmed,title_nostop,selftext_nostop
0,Physics,I am planning to set up a home PC with AMD Ryz...,Choice of GPU for running MD simulation NAMD a...,"[I, am, planning, to, set, up, a, home, PC, wi...","[Choice, of, GPU, for, running, MD, simulation...","[I, am, planning, to, set, up, a, home, PC, wi...","[Choice, of, GPU, for, running, MD, simulation...","[I, plan, set, home, PC, amd, ryzen, X, proces...","[choic, gpu, run, MD, simul, namd, gromac]","[Choice, GPU, running, MD, simulation, NAMD, G...","[I, planning, set, home, PC, AMD, Ryzen, X, pr..."
1,Physics,Ok So I have a bit of a bizarre question I bel...,Balance Bird vs Spin top,"[Ok, So, I, have, a, bit, of, a, bizarre, ques...","[Balance, Bird, vs, Spin, top]","[Ok, So, I, have, a, bit, of, a, bizarre, ques...","[Balance, Bird, v, Spin, top]","[Ok, So, I, bit, bizarr, question, I, believ, ...","[balanc, bird, v, spin, top]","[Balance, Bird, v, Spin, top]","[Ok, So, I, bit, bizarre, question, I, believe..."
2,Physics,Would this account for spin Observations in q...,Do electrons have convective cores,"[Would, this, account, for, spin, Observations...","[Do, electrons, have, convective, cores]","[Would, this, account, for, spin, Observations...","[Do, electron, have, convective, core]","[would, account, spin, observ, quantum, mechan...","[Do, electron, convect, core]","[Do, electron, convective, core]","[Would, account, spin, Observations, quantum, ..."
3,Physics,According to the [Bekenstein bound]httpsenwiki...,Is quantum computing dangerous or impposible,"[According, to, the, Bekenstein, bound, httpse...","[Is, quantum, computing, dangerous, or, imppos...","[According, to, the, Bekenstein, bound, httpse...","[Is, quantum, computing, dangerous, or, imppos...","[accord, bekenstein, bound, httpsenwikipediaor...","[Is, quantum, comput, danger, imppos]","[Is, quantum, computing, dangerous, impposible]","[According, Bekenstein, bound, httpsenwikipedi..."
4,Physics,So I understand conceptually the Big Rip but w...,On The Big Rip and The Mighty Quark,"[So, I, understand, conceptually, the, Big, Ri...","[On, The, Big, Rip, and, The, Mighty, Quark]","[So, I, understand, conceptually, the, Big, Ri...","[On, The, Big, Rip, and, The, Mighty, Quark]","[So, I, understand, conceptu, big, rip, I, can...","[On, the, big, rip, the, mighti, quark]","[On, The, Big, Rip, The, Mighty, Quark]","[So, I, understand, conceptually, Big, Rip, I,..."


## Mapping values: 'subreddit' column

In [157]:
df['subreddit'] = df['subreddit'].map({'Physics': 1, 'chemistry': 0})

In [159]:
df.head(1)

Unnamed: 0,subreddit,selftext,title,selftext_tokenized,title_tokenized,selftext_lemmatized,title_lemmatized,selftext_stemmed,title_stemmed,title_nostop,selftext_nostop,test
0,1,I am planning to set up a home PC with AMD Ryz...,Choice of GPU for running MD simulation NAMD a...,"[I, am, planning, to, set, up, a, home, PC, wi...","[Choice, of, GPU, for, running, MD, simulation...","[I, am, planning, to, set, up, a, home, PC, wi...","[Choice, of, GPU, for, running, MD, simulation...","[I, plan, set, home, PC, amd, ryzen, X, proces...","[choic, gpu, run, MD, simul, namd, gromac]","[Choice, GPU, running, MD, simulation, NAMD, G...","[I, planning, set, home, PC, AMD, Ryzen, X, pr...",1


### Create new dataframes for ease of use
- in both dataframes, stop words, accents, and numbers were removed

In [193]:
df_nostop_lemmatized = df[['subreddit', 'selftext_nostop', 'title_nostop']].copy()
df_nostop_lemmatized.columns = ['subreddit', 'selftext', 'title']

#convert from list to string for columns
df_nostop_lemmatized['selftext'] = df_nostop_lemmatized['selftext'].apply(' '.join)
df_nostop_lemmatized['title'] = df_nostop_lemmatized['title'].apply(' '.join)

df_nostop_lemmatized.head()

Unnamed: 0,subreddit,selftext,title
0,1,I planning set home PC AMD Ryzen X processor G...,Choice GPU running MD simulation NAMD GROMACS
1,1,Ok So I bit bizarre question I believe I under...,Balance Bird v Spin top
2,1,Would account spin Observations quantum mechan...,Do electron convective core
3,1,According Bekenstein bound httpsenwikipediaorg...,Is quantum computing dangerous impposible
4,1,So I understand conceptually Big Rip I cant ge...,On The Big Rip The Mighty Quark


In [194]:
df_nostop_stemmed = df[['subreddit', 'selftext_stemmed', 'title_stemmed']].copy()
df_nostop_stemmed.columns = ['subreddit', 'selftext', 'title']

#convert from list to string for columns
df_nostop_stemmed['selftext'] = df_nostop_stemmed['selftext'].apply(' '.join)
df_nostop_stemmed['title'] = df_nostop_stemmed['title'].apply(' '.join)

df_nostop_stemmed.head()

Unnamed: 0,subreddit,selftext,title
0,1,I plan set home PC amd ryzen X processor GB ra...,choic gpu run MD simul namd gromac
1,1,Ok So I bit bizarr question I believ I underst...,balanc bird v spin top
2,1,would account spin observ quantum mechan gener...,Do electron convect core
3,1,accord bekenstein bound httpsenwikipediaorgwik...,Is quantum comput danger imppos
4,1,So I understand conceptu big rip I cant get he...,On the big rip the mighti quark


#### covert these to csvs

In [223]:
df_nostop_lemmatized.to_csv(r'./data/lemmatized_nostop_data.csv', index = False)
df_nostop_stemmed.to_csv(r'./data/stemmed_nostop_data.csv', index = False)

## `CountVectorizer`

#### Using lemmatized data, only selftext

In [211]:
X = df_nostop_lemmatized['selftext']
y = df_nostop_lemmatized['subreddit']

In [212]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [213]:
# Instantiate a CountVectorizer.
cvec = CountVectorizer(stop_words = 'english', ngram_range = (1,2))

In [214]:
# Fit the vectorizer on our corpus.
cvec.fit(X_train)

CountVectorizer(ngram_range=(1, 2), stop_words='english')

In [215]:
# Transform the corpus.
X_train = cvec.transform(X_train)

In [201]:
# Convert X_train into a DataFrame.
# We will not actually use this for modeling,
# this is just to visualize what is happening
X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=cvec.get_feature_names())
X_train_df

Unnamed: 0,____,____ ____,____ froze,________,________ gt,__main__,__main__ main,__name__,__name__ __main__,_a,...,zumdahl,zumdahl content,zwiebachs,zwiebachs course,zwitterion,zwitterion ha,zygote,zygote dividing,zzzzz,zzzzz programm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2574,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:
X_test = cvec.transform(X_test)

## Using `GridSearchCV`

In [217]:
from sklearn.linear_model import LogisticRegression

In [218]:
pipe_params = {
    'countvectorizer__max_features': [2000, 3000, 4000, 5000],
    'countvectorizer__min_df': [2, 3],
    'countvectorizer__max_df': [.9, .95],
    'countvectorizer__ngram_range': [(1,1), (1,2)]
}
clf = LogisticRegression()
cvect = CountVectorizer()

In [219]:
from sklearn.pipeline import make_pipeline

In [220]:
pipe = make_pipeline(cvect, clf)

In [221]:
# Instantiate GridSearchCV.

gs = GridSearchCV(pipe, # what object are we optimizing?
                  pipe_params, # what parameters values are we searching?
                  cv = 5, # 5-fold cross-validation.
                n_jobs = -1
                 ) 

In [222]:
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

AttributeError: lower not found