# NLP Preprocessing Notebook

The purpose of this notebook is to conduct the preprocessing steps that are necessary for text data in NLP. This includes tokenizing, removing stop words, vectorizing, etc.

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
%matplotlib inline
import nltk
from sklearn.feature_extraction import text 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
import pickle

### Loading in the clean dataframe from `data_cleaning.ipynb`


In [36]:
# Corpus: the cleaned dataframe that was pickled by the data-cleaning notebook.
clean_df_path = '../pickle/clean_df.pkl'
clean_df = pd.read_pickle(clean_df_path)

In [37]:
# Preview the first five rows to confirm the pickle loaded as expected.
clean_df.head(n=5)

Unnamed: 0,total_votes,hate_speech_votes,other_votes,label,tweet,round_1_tweet
0,3,0,3,0,!!! RT @mayasolovely: As a woman you shouldn't...,as a woman you shouldnt complain about clea...
1,3,0,3,0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats coldtyga dwn bad for cuffin dat ho...
2,3,0,3,0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,dawg you ever fuck a bitch and she sta to...
3,3,0,3,0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,she look like a tranny
4,6,0,6,0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,the shit you hear about me might be true or...


### Creating `tweet_df` with only the cleaned tweet column (`round_1_tweet`) and the `label` column

In [38]:
# Keep only the cleaned tweet text and the label column; .copy() gives
# tweet_df its own data instead of a slice of clean_df.
tweet_df = clean_df.loc[:, ['round_1_tweet', 'label']].copy()

In [39]:
# Preview the first five rows of the two-column frame.
tweet_df.head(n=5)

Unnamed: 0,round_1_tweet,label
0,as a woman you shouldnt complain about clea...,0
1,boy dats coldtyga dwn bad for cuffin dat ho...,0
2,dawg you ever fuck a bitch and she sta to...,0
3,she look like a tranny,0
4,the shit you hear about me might be true or...,0


In [40]:
# Split the frame into the text column (data) and the label column (target).
data, target = tweet_df['round_1_tweet'], tweet_df['label']

## Remove Stop Words & Tokenize
We can use NLTK's built-in library of stop words.

In [41]:
# NLTK's English stop-word list, held as a set because process_tweet runs a
# membership test against it for every token.
stop_words = set().union(stopwords.words('english'))

In [42]:
def process_tweet(text):
    """Tokenize a tweet and return its lowercased non-stop-word tokens.

    Parameters
    ----------
    text : str
        Tweet text handed to ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order, that are not members of
        the notebook-level ``stop_words`` collection.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(text):
        lowered = raw_token.lower()
        # Compare the lowercased form so capitalised stop words are dropped too.
        if lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

In [43]:
# Run process_tweet over every tweet, giving one token list per tweet.
processed_data = [process_tweet(tweet) for tweet in data]

In [44]:
# Vocabulary: every distinct token that survived stop-word removal.
total_vocab = {token for tweet_tokens in processed_data for token in tweet_tokens}
len(total_vocab)

20277

## Lemmatization

In [45]:
# Lemmatize every token of every tweet, then re-join each tweet's tokens into
# a single space-separated string (one string per tweet).
lemmatizer = WordNetLemmatizer()
lemmatized_output = [
    ' '.join(lemmatizer.lemmatize(token) for token in tweet_tokens)
    for tweet_tokens in processed_data
]

In [46]:
# Features are the lemmatized tweet strings; labels are the label column.
X_lem, y_lem = lemmatized_output, target

In [63]:
# Pickle the lemmatized tweets for the modeling notebooks. The `with` block
# closes the file even if pickle.dump raises, so the handle is never leaked.
with open('../pickle/X_lem.pkl', 'wb') as pickle_out:
    pickle.dump(X_lem, pickle_out)

In [64]:
# Persist the labels next to X_lem so the modeling notebooks can load both.
y_lem.to_pickle(path='../pickle/y_lem.pkl')

### Now `X_lem` and `y_lem` are ready to be brought over to the modeling notebooks.