In [1]:
# import Beautiful Soup, NumPy and Pandas, etc
import bs4 as bs
import numpy as np
import pandas as pd
import re
import hashlib

from matplotlib import pyplot as plt
 
# download NLTK classifiers - these are cached locally on your machine
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator


# import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train = pd.read_csv("news_sample.csv").iloc[:,[3,5]].dropna().reset_index(drop=True)

In [3]:
train.columns

Index(['type', 'content'], dtype='object')

In [4]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))


def article_cleaner(article, lemmatize=True, stem=False):

    #1. Remove HTML tags
    review = bs.BeautifulSoup(article,features='lxml').text

    #2. Use regex to find emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', article)

    #3. Remove punctuation
    article = re.sub("[^a-zA-Z]", " ",article)

    #4. Tokenize into words (all lower case)
    article_words = (str.lower(article.replace('.','. '))).split()

    #5. Remove stopwords, Lemmatize, Stem
    ### YOUR CODE HERE ##
        
    article_wo_stopwords = [w for w in article_words if not w in eng_stopwords]
    
    token_tag = pos_tag(article_wo_stopwords)
     
    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'n'
    
    wnl_stems = []
    
    for pair in token_tag:
        res = wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)
    
    for i in emoticons:
        wnl_stems.append(i)
    
    #6. Join the review to one sentence
    article_processed = ' '.join(wnl_stems)
    
    return article_processed

In [13]:
def get_vectorizer(ngram, max_features):
    return CountVectorizer(ngram_range=(1, ngram),
                             analyzer = "word",
                             tokenizer = None,
                             preprocessor = article_cleaner,
                             stop_words = None, 
                             max_features = max_features)

vectorizer = get_vectorizer(ngram=1, max_features=400)

In [7]:
y=train["type"] 
ngram=1
max_features=1000 
model_random_state=0

In [14]:
articles=train['content'] 

# train / test split
X_train, X_test, y_train, y_test = train_test_split(articles, y, random_state=0, test_size=.2)

# Then we use fit_transform() to fit the model / learn the vocabulary,
# then transform the data into feature vectors.

train_bag = vectorizer.fit_transform(X_train)
if not isinstance(train_bag, np.ndarray):
    train_bag = train_bag.toarray()
test_bag = vectorizer.transform(X_test)
if not isinstance(test_bag, np.ndarray):
    test_bag = test_bag.toarray()

In [15]:
print("Training the random forest classifier!\n")

# Initialize a Random Forest classifier with 50 trees
forest = RandomForestClassifier(max_features=55, n_estimators = 100, max_depth = 6 , random_state = model_random_state) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the target variable
forest = forest.fit(train_bag, y_train)

# predict
train_predictions = forest.predict(train_bag)
test_predictions = forest.predict(test_bag)

# validation
train_acc = metrics.accuracy_score(y_train, train_predictions)
valid_acc = metrics.accuracy_score(y_test, test_predictions)

print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
print()
print('CONFUSION MATRIX:')
print('         Predicted')
print('          neg pos')
print(' Actual')
c=confusion_matrix(y_test, test_predictions)
print('     neg  ',c[0])
print('     pos  ',c[1])



Training the random forest classifier!

 The training accuracy is:  0.968421052631579 
 The validation accuracy is:  0.875

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [0 1 0 0 0 0 0 0]
     pos   [0 5 0 0 0 0 0 0]


In [16]:
inp = "Space lasers cause forest fires"
output = forest.predict(vectorizer.transform([inp]))
print(output)

['fake']


In [11]:
import pickle

In [22]:
pkl_filename = "random_forest_cv_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(forest, file)

In [21]:
pkl_filename2 = "count_vectorizer.pkl"
with open(pkl_filename2, 'wb') as file:
    pickle.dump(vectorizer, file)