# Exercise with Natural Language Processing

For todays exersice we will be doing two things.  The first is to build the same model with the same data that we did in the lecture, the second will be to build a new model with new data. 

## PART 1: 
- 20 Newsgroups Corpus


## PART 2:
- Republican vs Democrat Tweet Classifier

In [1]:
# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# You may need to download these from nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load and display data.
1. Load the 20-newsgroups.csv data into a dataframe.
1. Print the shape
1. Inspect / remove nulls and duplicates
1. Find class balances, print out how many of each topic_category there are.

In [2]:
# 1. Load the 20-newsgroups.csv data into a dataframe.
# 2. Print the shape
df = pd.read_csv('data/20-newsgroups.csv')
print(df.shape)
df.head()

(11314, 4)


Unnamed: 0,id,message,topic,topic_category
0,0,From: lerxst@wam.umd.edu (where's my thing)\r\...,7,rec.autos
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,3,From: jgreen@amber (Joe Green)\r\nSubject: Re:...,1,comp.graphics
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [3]:
# 3. Inspect / remove nulls and duplicates
print(df.isnull().sum())
print(df.duplicated().sum())

id                0
message           0
topic             0
topic_category    0
dtype: int64
0


In [4]:
# 4. Find class balances, print out how many of each topic_category there are.
df.topic_category.value_counts()

rec.sport.hockey            600
soc.religion.christian      599
rec.motorcycles             598
rec.sport.baseball          597
sci.crypt                   595
sci.med                     594
rec.autos                   594
comp.windows.x              593
sci.space                   593
sci.electronics             591
comp.os.ms-windows.misc     591
comp.sys.ibm.pc.hardware    590
misc.forsale                585
comp.graphics               584
comp.sys.mac.hardware       578
talk.politics.mideast       564
talk.politics.guns          546
alt.atheism                 480
talk.politics.misc          465
talk.religion.misc          377
Name: topic_category, dtype: int64

# Text Pre-Processing 
(aka Feature engineering)
1. Make a function that makes all text lowercase.
    * Do a sanity check by feeding in a test sentence into the function. 
    
    
2. Make a function that removes all punctuation. 
    * Do a sanity check by feeding in a test sentence into the function. 
    
    
3. Make a function that removes all stopwords.
    * Do a sanity check by feeding in a test sentence into the function. 
    
    
4. EXTRA CREDIT (This step only): Make a function that stemms all words. 


5. Mandatory: Make a pipeline function that applys all the text processing functions you just built.
    * Do a sanity check by feeding in a test sentence into the pipeline. 
    
    
    
6. Mandatory: Use `df['message_clean'] = df[column].apply(???)` and apply the text pipeline to your text data column. 

In [5]:
# 1. Make a function that makes all text lowercase.
def make_lower(t_string):
    t_string= t_string.lower()
    return t_string
test_string = 'This is A SENTENCE with LOTS OF CAPS.'
make_lower(test_string)


'this is a sentence with lots of caps.'

In [18]:
# 2. Make a function that removes all punctuation. 
remove_punctuation = str.maketrans("!?.#",4*" ")
t_string = []
_string = ' '.join()
test_string = 'This is a sentence! 50 With lots of punctuation??? & other #things.'
test_string.translate(t_string)


TypeError: sequence item 0: expected str instance, int found

In [26]:
# # 3. Make a function that removes all stopwords

def stem_words(t_string):
    # Initalize our Stemmer
    porter = PorterStemmer()
    words = word_tokenize(t_string)
    valid_words = []

    for word in words:
        stemmed_word = porter.stem(word)
        valid_words.append(stemmed_word)
        
    t_string = ' '.join(valid_words)

    return t_string 


a_sentence = 'I played and started playing with players and we all love to play with plays'
stem_words(a_sentence)

'I play and start play with player and we all love to play with play'

In [18]:
# 4. EXTRA CREDIT: Make a function that stemms all words. 

test_string = 'I played and started playing with players and we all love to play with plays'
# porter_stemmer = PorterStemmer()
#test_string = 'I played and started playing with players and we all love to play with plays'
# nltk_tokens = nltk.word_tokenize(test_string)
# for w in nltk_tokens:
#     print('Actual : %s  Stem:......>> %s ' % (w,porter_stemmer.stem(w)))

Actual : I  Stem:......>> I 
Actual : played  Stem:......>> play 
Actual : and  Stem:......>> and 
Actual : started  Stem:......>> start 
Actual : playing  Stem:......>> play 
Actual : with  Stem:......>> with 
Actual : players  Stem:......>> player 
Actual : and  Stem:......>> and 
Actual : we  Stem:......>> we 
Actual : all  Stem:......>> all 
Actual : love  Stem:......>> love 
Actual : to  Stem:......>> to 
Actual : play  Stem:......>> play 
Actual : with  Stem:......>> with 
Actual : plays  Stem:......>> play 


In [21]:
# 5. MANDATORY: Make a pipeline function that applys all the text processing functions you just built.

test_string = 'I played and started playing with players and we all love to play with plays'
def text_pipeline(t_string):
    t_string = make_lower(t_string)
    t_string = remove_punctuation(t_string)
    t_string = remove_stopwords(t_string)
    t_string = stem_words(t_string)
    return t_string


In [25]:
# 6. Mandatory: Use `df[column].apply(???)` and apply the text pipeline to your text data column. 
ddf['message_clean'] = df['message'].apply(string_pipeline)
df.head()
.....>>>>
df['message_clean'] = df['message'].apply(text_pipeline)
print("ORIGINAL TEXT:", df['message'][0])
print("CLEAN TEXT:", df['message_clean'][0])

NameError: name 'string_pipeline' is not defined

# Text Vectorization

1. Define your `X` and `y` data. 


2. Initialize a vectorizer (you can use TFIDF or BOW, it is your choice).
    * Do you want to use n-grams..?


3. Fit your vectorizer using your X data.
    * Remember, this process happens IN PLACE.


4. Transform your X data using your fitted vectorizer. 
    * `X = vectorizer.???`



5. Print the shape of your X.  How many features (aka columns) do you have?

In [80]:
# 1. Define your `X` and `y` data. 

X = df['message_clean'].values

y = df['topic_category'].values


KeyError: 'message_clean'

In [None]:
# 2. Initialize a vectorizer (you can use TFIDF or BOW, it is your choice).
vectorizer = TfidfVectorizer()
........>>>>>
vect = TfidfVectorizer()
vect.fit(X)
X = vect.transform(X)
..........>>>


In [None]:
# 3. Fit your vectorizer using your X data
Vectorizer.fit(X)

In [None]:
# 4. Transform your X data using your fitted vectorizer. 
X = Vectorizer.transform(X)

In [None]:
# 5. Print the shape of your X.  How many features (aka columns) do you have?
print(X,shape, type(X))

# Split your data into Training and Testing data. 

In [None]:
# Split our data into testing and training like always. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

___
# Build and Train Model
Use Multinomial Naive Bayes to classify these documents. 

1. Initalize an empty model. 
2. Fit the model with our training data.


Experiment with different alphas.  Use the alpha gives you the best result.

EXTRA CREDIT:  Use grid search to programmatically do this for you. 

In [None]:
# 1. Initalize an empty model. 
model = MultinomialNB(alpha=0.5)
....................>>>>
model = MultinomialNB()

In [None]:
# Fit our model with our training data.
model.fit(X_train, y_train)

# Evaluate the model.

1. Make new predicitions using our test data. 
2. Print the accuracy of the model. 
3. Print the confusion matrix of our predictions. 
4. Using `classification_report` print the evaluation results for all the classes. 



In [None]:
# 1. Make new predictions of our testing data. 
y_pred = model.predict(X_test)
.....>>>
y_pred = model.predict(X_test)

y_pred_proba = model.predict_proba(X_test)

In [None]:
# 2. Print the accuracy of the model. 
accuracy = model.score(X_test, y_test)

print("Model Accuracy: %f" % accuracy)

In [None]:
# 3. Plot the confusion matrix of our predictions
.....>>>>
from sklearn.metrics import plot_confusion_matrix

fig, ax = plt.subplots(figsize=(21, 21))

disp = plot_confusion_matrix(model, X_test, y_test,
                             display_labels=model.classes_,
                             cmap=plt.cm.Blues, ax=ax)
plt.xticks(rotation=90)
disp


In [None]:
# 4. Using `classification_report` print the evaluation results for all the classes. 

......>>>>

print(classification_report(y_test, y_pred, target_names=model.classes_))

# Manual predicition
Write a new sentence that you think will be classified as talk.politics.guns. 
1. Apply the text pipeline to your sentence
2. Transform your cleaned text using the `X = vectorizer.transform([your_text])`
    * Note, the `transform` function accepts a list and not a individual string.
3. Use the model to predict your new `X`. 
4. Print the prediction

In [None]:
my_sentence = 'ah ah ah'
......>>>>
# 1. Apply the text pipeline to your sentence
my_sentence = text_pipeline(my_sentence)
# 2. Transform your cleaned text using the `X = vectorizer.transform([your_text])`\
X = vect.transform([my_sentence])
# 3. Use the model to predict your new `X`. 
y_pred = model.predict(X)
# 4. Print the prediction
......>>>>>
my_sentence = "I like shooting guns on the weekend, I like the Second Amendment"

# 1. Apply the text pipeline to your sentence
my_sentence = text_pipeline(my_sentence)
# 2. Transform your cleaned text using the `X = vectorizer.transform([your_text])`\
my_sentence_vectorized = vectorizer.transform([my_sentence])
# 3. Use the model to predict your new `X`. 
prediction = model.predict(my_sentence_vectorized)
# 4. Print the prediction
print(prediction)



___
# PART 2: Twitter Data
This part of the exercise is un-guided on purpose.  

Using the `dem-vs-rep-tweets.csv` build a classifier to determine if a tweet was written by a democrat or republican. 

Can you get an f1-score higher than %82

In [None]:
# 1. Load the 20-newsgroups.csv data into a dataframe.
# 2. Print the shape
df = pd.read_csv('data/dem-vs-rep-tweets.csv')

.....>>>>
df = pd.read_csv('data/dem-vs-rep-tweets.csv')
print(df.shape)
print(df.isnull().sum())
.........>>>>
df = pd.read_csv('data/dem-vs-rep-tweets.csv')
print(df.shape)
df.head()

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
df['clean_tweet'] = df['Tweet'].apply(text_pipeline)
.......>>>>
df['Tweet_clean'] = df['Tweet'].apply(text_pipeline)
print("ORIGINAL TWEET:", df['Tweet'][0])
print("CLEAN TWEET:", df['Tweet_clean'][0])

In [None]:
X = df['clean_tweet']
y = df['Party']

In [None]:
vect = TfidfVectorizer()
vect.fit(X)
X = vect.transform(X)
....>>>>>
vectorizer = TfidfVectorizer()

vectorizer.fit(X)

X = vectorizer.transform(X)

print(X.shape, type(X))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=10)
.....>>>>
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = MultinomialNB(alpha=2.3)
.......>>>>>
model = MultinomialNB(alpha=.05)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_pred_proba = model.predict_proba(X_test)

accuracy = model.score(X_test, y_test)

print("Model Accuracy: %f" % accuracy)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)


In [None]:
accuracy = model.score(X_test, y_test)

print("Model Accuracy: %f" % accuracy)

print(classification_report(y_test, y_pred, target_names=model.classes_))

In [None]:
rfmodel = RandomForestClassifier(n_estimators=150, max_depth=20)
rfmodel.fit(X_train, y_train)

In [None]:

y_pred = rfmodel.predict(X_test)

accuracy =  rfmodel.score(X_test, y_test)

print("Model Accuracy: %f" % accuracy)

print(classification_report(y_test, y_pred, target_names=rfmodel.classes_))

In [None]:
new_tweet = 'Ronald Reagan is the best president of all time. He is great because of tax breaks'
new_tweet = text_pipeline(new_tweet)
X = vect.transform([new_tweet])
print(model.predict(X))