In [1]:
!pip install gensim



# Imports

In [25]:
# import Beautiful Soup, NumPy and Pandas, etc
import bs4 as bs
import numpy as np
import pandas as pd
import re
import hashlib
 
# download NLTK classifiers - these are cached locally on your machine
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator

# import word2vec
#from gensim.test.utils import datapath
#from gensim import utils
#from gensim.models import Word2Vec

# import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/estella/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/estella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/estella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/estella/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw to /Users/estella/nltk_data...
[nltk_data]   Package omw is already up-to-date!


# Functions

In [3]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))


def review_cleaner(review, lemmatize=True, stem=False):
    '''
        Clean and preprocess a review.
            1. Remove HTML tags
            2. Extract emoticons
            3. Use regex to remove all special characters (only keep letters)
            4. Make strings to lower case and tokenize / word split reviews
            5. Remove English stopwords
            6. Lemmatize
            7. Rejoin to one string
        
        @review (type:str) is an unprocessed review string
        @return (type:str) is a 6-step preprocessed review string
    '''

    

    if lemmatize == True and stem == True:
        raise RuntimeError("May not pass both lemmatize and stem flags")

    #1. Remove HTML tags
    review = bs.BeautifulSoup(review,features='lxml').text

    #2. Use regex to find emoticons
    emoticons = re.findall('(:D|:\/)(?=\s|[^[:alnum:]+-]|$)', review)

    #3. Remove punctuation
    review = re.sub('[^a-zA-Z ]' ,'',review)

    #4. Tokenize into words (all lower case)
    review_words = (str.lower(review.replace('.','. '))).split()

    #5. Remove stopwords, Lemmatize, Stem
    
    review_wo_stopwords = [w for w in review_words if not w in eng_stopwords]
    
    token_tag = pos_tag(review_wo_stopwords)
    NN_count = 0
    JJ_count = 0

    for pair in token_tag:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count+=1
        elif tag == 'NN':
            NN_count+=1
        
     
    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'n'
    
    wnl_stems = []
    
    for pair in token_tag:
        res = wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)
    
    for i in emoticons:
        wnl_stems.append(i)
    
    #6. Join the review to one sentence
    review_processed = ' '.join(wnl_stems)
    
    return review_processed

In [None]:
# We vectorize the text using a bag of words model
def get_vectorizer(ngram, max_features):
    return CountVectorizer(ngram_range=(1, ngram),
                             analyzer = "word",
                             tokenizer = None,
                             preprocessor = review_cleaner,
                             stop_words = None, 
                             max_features = max_features)

# Model training
def train_predict_sentiment(reviews, vectorizer, y=train["type"], ngram=1, max_features=1000, model_random_state=0):
    '''
        This function will:
            1. split data into train and test set.
            2. get n-gram counts from cleaned reviews 
            3. train a random forest model using train n-gram counts and y (labels)
            4. test the model on your test split
            5. print accuracy of sentiment prediction on test and training data
            6. print confusion matrix on test data results

            To change n-gram type, set value of ngram argument
            To change the number of features you want the countvectorizer to generate, set the value of max_features argument
            
            @cleaned_review (type:str) is preprocessed string from review_cleaner()
            @return none
    '''

    print("Creating the model!\n")
    
    # train / test split
    X_train, X_test, y_train, y_test = train_test_split(reviews, y, random_state=0, test_size=.2)

    # Then we use fit_transform() to fit the model / learn the vocabulary,
    # then transform the data into feature vectors.
    # The input should be a list of strings. .toarray() converts to a numpy array
    
    train_bag = vectorizer.fit_transform(X_train)
    if not isinstance(train_bag, np.ndarray):
        train_bag = train_bag.toarray()
    test_bag = vectorizer.transform(X_test)
    if not isinstance(test_bag, np.ndarray):
        test_bag = test_bag.toarray()

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees
    forest = RandomForestClassifier(n_estimators = 50, random_state = model_random_state) 

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the target variable
    forest = forest.fit(train_bag, y_train)

    # predict
    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)
    
    # validation
    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)
    
    print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    print()
    print('CONFUSION MATRIX:')
    print('         Predicted')
    print('          neg pos')
    print(' Actual')
    c=confusion_matrix(y_test, test_predictions)
    print('     neg  ',c[0])
    print('     pos  ',c[1])

    return forest

#fhttps://stackoverflow.com/questions/57340142/user-warning-your-stop-words-may-be-inconsistent-with-your-preprocessing

# Cross-Validation

In [None]:
train = pd.read_csv("news_sample.csv", sep=",").iloc[:,[3,5]].dropna()
train

In [5]:
#split training and testing data
X_train, X_test, y_train, y_test = train_test_split(train['content'], train['type'], random_state=0, test_size=.2)

In [26]:
#kfold validation, stratification causes an error in splitting y
kf = KFold(5, shuffle=True,random_state=0)

#Pipeline model with vectorizer first
text_pipe = Pipeline([('tfid', TfidfVectorizer()),
                       ('rf', RandomForestClassifier( random_state = 0))])


In [43]:
#cross validation
cvparams = {'tfid__ngram_range' : [(1, 1), (1,2), (2,2)], #unigrams, uni and bi, only bigrams
            'tfid__stop_words': [None],
            'tfid__preprocessor': [review_cleaner],
            #'tfid__max_df': [0.8, 1], #When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words
            #'tfid__min_df':[0.01,0.001], #When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
            'rf__max_depth': [500, 1000],
            'rf__min_samples_split': [3,5],
            'rf__min_samples_leaf': [1,5,8]}
    
            #'rf__n_estimators': (20,50,100)}#,
          #'rf__ccp_alpha': [0.0005, 0.001, 0.002, 0.01], #pruning cutoff
          #'rf__max_depth': range(3,6), #depth of tree
          #'#rf__max_features': range(7,15)} #max number of features to bootstrap per tree}
            
cv_rf = GridSearchCV(text_pipe, param_grid = cvparams, cv = kf, verbose = 1).fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [45]:
cv_rf.best_params_

{'rf__max_depth': 500,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'tfid__ngram_range': (1, 1),
 'tfid__preprocessor': <function __main__.review_cleaner(review, lemmatize=True, stem=False)>,
 'tfid__stop_words': None}

In [74]:
cv_rf.best_score_

0.6842105263157896

# New Model

In [61]:
bp = cv_rf.best_params_ 
#initial cv
#{'rf__max_depth': 500,
# 'rf__min_samples_leaf': 1,
# 'rf__min_samples_split': 2,
# 'tfid__ngram_range': (1, 1),
 #'tfid__preprocessor': review_cleaner,
# 'tfid__stop_words': None}


bestforest =  Pipeline([('tfid', TfidfVectorizer(ngram_range = bp["tfid__ngram_range"],
                                                 preprocessor = review_cleaner)),
                                                 #max_df = bp['tfid__max_df'],
                                                 #min_df = bp['tfid__min_df'])),
                       ('rf', RandomForestClassifier(max_depth = bp["rf__max_depth"],
                                                     min_samples_split = bp["rf__min_samples_split"],
                                                     min_samples_leaf = bp["rf__min_samples_leaf"],
                                                     random_state = 0))]).fit(X_train, y_train)

# predict
train_predictions = bestforest.predict(X_train)
test_predictions = bestforest.predict(X_test)
    
# validation
train_acc = metrics.accuracy_score(y_train, train_predictions)
valid_acc = metrics.accuracy_score(y_test, test_predictions)
    
print(" The training accuracy is: ", train_acc, "\n", "The test accuracy is: ", valid_acc)
print()
print('CONFUSION MATRIX:')
print('         Predicted')
print('          neg pos')
print(' Actual')
c=confusion_matrix(y_test, test_predictions)
print('     neg  ',c[0])
print('     pos  ',c[1])

 The training accuracy is:  1.0 
 The test accuracy is:  0.8125

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [0 0 0 0 1 0 0 0]
     pos   [0 4 1 0 0 0 0 0]


In [72]:
phrase = ["Space lasers cause forest fires"]
output = bestforest.predict(phrase)
print(output)

['fake']


# Previous Model

In [56]:
vectorizer = get_vectorizer(ngram=1, max_features=100)
forest_model = train_predict_sentiment(train['content'] , vectorizer=vectorizer)

Creating the model!

Training the random forest classifier!

 The training accuracy is:  0.9947368421052631 
 The validation accuracy is:  0.8125

CONFUSION MATRIX:
         Predicted
          neg pos
 Actual
     neg   [0 1 0 0 0 0 0 0]
     pos   [0 5 0 0 0 0 0 0]


In [71]:
phrase = "Space lasers cause forest fires"
output = forest_model.predict(vectorizer.transform([phrase]))
print(output)

['unreliable']


In [1]:
import pickle