# NLP Modeling

## Exercises

Do your work for this exercise in a file named model.



### Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import unicodedata
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import acquire as a
import prepare as p

### 1. Take the work we did in the lessons further:
- What other types of models (i.e. different classification algorithms) could you use?
- How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

### News

#### Acquire

In [2]:
# Pull the articles for every category listed in a.categories via the local acquire module.
# NOTE(review): presumably this scrapes the web (a stray BeautifulSoup line shows up in the
# cell output) — confirm in acquire.py.
news_df = a.get_all_news_articles(a.categories)



  soup = BeautifulSoup(response.text)


In [3]:
# Peek at the last rows to check the acquired frame's columns (title, content, category)
news_df.tail()

Unnamed: 0,title,content,category
144,Pope denounces 'aggressive' nationalism that r...,"Pope Francis denounced ""aggressive forms of na...",world
145,WHO to set up early pandemic warning centre in...,The World Health Organisation said that German...,world
146,China accuses G-7 of 'blatantly meddling' in i...,China has rejected criticism of its human righ...,world
147,Qatar Finance Minister arrested over alleged e...,Qatar's Finance Minister Ali Shareef Al Emadi ...,world
148,Gunmen shoot dead former news anchor in Afghan...,"Gunmen shot dead Nemat Rawan, a finance minist...",world


In [4]:
# Run every article body through the cleaning helper from the prepare module,
# then preview the first rows of the result
news_df["content"] = news_df["content"].apply(p.clean)
news_df.head()

Unnamed: 0,title,content,category
0,"Godrej, PwC, Deloitte India give extra offs to...",several company india offering extra holiday e...,business
1,"Bill Gates' company Cascade transfers ₹13,300 ...",bill gate cascade investment holding company g...,business
2,Second COVID-19 wave hit India like a tsunami:...,biocon founder kiran mazumdarshaw said second ...,business
3,COVID-19 vaccine makers' shares fall after US ...,share covid19 vaccine maker plunged u backed p...,business
4,RIL may soon fly in Israeli experts to install...,reliance industry sought permission fly israel...,business


In [5]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """
    Three-way split of a DataFrame into train, validate, and test datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        The full dataset to split.
    stratify_by : str, optional
        Name of the column to stratify both splits on. When None (the default)
        the splits are not stratified.
    test_size : float, optional
        Share of `df` held out as the test set.
    validate_size : float, optional
        Share of the remaining (non-test) rows held out as the validate set.
    random_state : int, optional
        Seed passed to both splits so the result is reproducible.

    Returns
    -------
    tuple
        (train, validate, test). With the defaults this is roughly
        56% / 24% / 20% of the rows.
    """
    # df[None] raises a KeyError, so only look up stratification labels
    # when a column name was actually supplied
    full_labels = None if stratify_by is None else df[stratify_by]
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=full_labels
    )

    # The second split must stratify on the labels of the remaining rows only
    remaining_labels = None if stratify_by is None else train[stratify_by]
    train, validate = train_test_split(
        train, test_size=validate_size, random_state=random_state, stratify=remaining_labels
    )

    return train, validate, test

In [6]:
# Stratify on the category column so both splits are drawn proportionally to the category labels
train, validate, test = split(news_df, 'category')
train.head()

Unnamed: 0,title,content,category
137,"Trump reportedly creates new Twitter account, ...",twitter suspended account called djtdesk repor...,world
83,It was our best option: Disha on multi-platfor...,disha patani recently spoke upcoming film radh...,entertainment
43,US baseball team makes '1st' Dogecoin transact...,u baseball team oakland athletics president da...,sports
13,Pay your fair share of taxes: US lawmaker Jaya...,u lawmaker pramila jayapal responded tesla ceo...,business
51,"Bill Gates' company Cascade transfers ₹13,300 ...",bill gate cascade investment holding company g...,technology


In [7]:
# Features for each split: the cleaned article text
X_train, X_validate, X_test = (part["content"] for part in (train, validate, test))

In [8]:
# Targets for each split: the news category label
y_train, y_validate, y_test = (part["category"] for part in (train, validate, test))

In [9]:
# Confirm the features are a Series of cleaned text that keeps the original row index
X_train.head()


137    twitter suspended account called djtdesk repor...
83     disha patani recently spoke upcoming film radh...
43     u baseball team oakland athletics president da...
13     u lawmaker pramila jayapal responded tesla ceo...
51     bill gate cascade investment holding company g...
Name: content, dtype: object

In [10]:
# TfidfVectorizer does two jobs:
#   1. computes a tf-idf value for each word in each document
#   2. encodes those values numerically so that models that only work on numbers,
#      such as classification models, can consume the text
# It is fitted on the training data only.
tfidf = TfidfVectorizer().fit(X_train)

# Apply the fitted vectorizer to every split
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(docs) for docs in (X_train, X_validate, X_test)
)

In [11]:
# The vectorized data is a sparse matrix; converting it to a dense one for display
# shows that the vast majority of entries are zero
X_train_vectorized.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.1323157 , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.14841866, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [12]:
# The vectorized text is numeric, so a standard classification model can be trained on it
lm = LogisticRegression()

# Fit on the vectorized training data only; the validate and test splits are not used here.
# fit() is the cell's last expression, so the fitted estimator is echoed as the cell output.
lm.fit(X_train_vectorized, y_train)

LogisticRegression()

In [13]:
# Build one results frame per split, starting with the actual category labels.
# NOTE: this rebinds train/validate/test, so from here on those names refer to
# these results frames rather than the split article DataFrames.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [14]:
# The results frame keeps the original row index and, so far, holds only the actual labels
train.head()


Unnamed: 0,actual
137,world
83,entertainment
43,sports
13,business
51,technology


In [15]:
# Attach the model's predicted category to each results frame, next to the actual label
for results, features in (
    (train, X_train_vectorized),
    (validate, X_validate_vectorized),
    (test, X_test_vectorized),
):
    results["predicted"] = lm.predict(features)

In [16]:
# Spot-check the actual and predicted labels side by side
train.tail()


Unnamed: 0,actual,predicted
52,technology,technology
94,entertainment,entertainment
47,sports,sports
8,business,business
75,entertainment,entertainment


In [17]:
# In-sample accuracy: share of training rows whose prediction equals the actual label
train["actual"].eq(train["predicted"]).mean()


0.927710843373494

In [18]:
# Out-of-sample accuracy: the same comparison on the validate split, which the model was not fitted on
validate["actual"].eq(validate["predicted"]).mean()


0.6388888888888888

### Codeup blogs

In [20]:
# TODO: debug the categories before re-enabling this acquisition step
#codeup_df = a.all_codeup_blogs()
#codeup_df

In [None]:
# codeup_df is only created by the acquisition cell above, which is currently
# commented out while its categories are being debugged. Guard on its existence
# so a fresh Restart & Run All does not stop here with a NameError.
if "codeup_df" in globals():
    # Clean each blog body with the same helper used for the news articles
    codeup_df["body"] = codeup_df["body"].apply(p.clean)
    # display() is supplied by the notebook kernel; it is needed because the
    # preview is no longer the cell's last top-level expression
    display(codeup_df.head())
else:
    print("Skipping Codeup blog cleaning: codeup_df has not been acquired yet.")

In [None]:
#tfidf = TfidfVectorizer()
#X = tfidf.fit_transform(codeup_df.text)
#y = df.label