# Text Classification 

## Data Cleaning & Text-Preprocessing 

In [None]:
import pandas as pd 
import nltk 
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline
import seaborn as sns
from datascience import percentile
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the article data from 'model-data.csv' (path relative to the working
# directory). Later cells read its 'content' and 'domain' columns.
frame = pd.read_csv('model-data.csv')

In [None]:
# Preview the first rows of the raw data.
frame.head()

In [None]:
#Credit to Susan Li on Towards Data Science blog post

# Raw strings keep the regex escapes (\[, \], \|) from being parsed as
# (invalid) Python string escapes; the compiled patterns are unchanged.
# Separator-like characters: each one is replaced by a space.
symbols_1 = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_':
# each such character is deleted.
symbols_2 = re.compile(r'[^0-9a-z #+_]')
# A set gives constant-time membership tests when filtering tokens.
stopwords_set = set(stopwords.words('english'))


def preprocess(text):
    """Return a cleaned copy of ``text``.

    Steps, in order:
      1. lower-case the text;
      2. replace every character matched by ``symbols_1`` with a space;
      3. delete every character matched by ``symbols_2``;
      4. drop tokens found in ``stopwords_set`` (NLTK English stop words);
      5. join the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = symbols_1.sub(' ', text)
    text = symbols_2.sub('', text)  # delete every character outside the allowed set
    text = ' '.join(word for word in text.split() if word not in stopwords_set)
    return text

In [None]:
#Test preprocessing step:
# Run preprocess() on one article (the 'content' entry at index 5) so the
# cleaned output can be inspected by eye.

preprocess(frame['content'][5])

In [None]:
# How many total words do we have?
# str.split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and never yields empty strings, so only real tokens are counted.
frame['content'].apply(lambda x: len(x.split())).sum()

### Question: Do breitbart or nytimes articles, on average, contain more words? Would this bias our analysis if we use CountVectorizer?

In [None]:
# First, clean every article: overwrite the 'content' column with the result
# of running each entry through preprocess(), then preview the frame.
frame['content'] = frame['content'].map(preprocess)
frame.head()

In [None]:
# Now let's calculate the average number of words per article for each outlet
def get_length(text):
    """Return the number of whitespace-separated words in ``text``.

    ``str.split()`` without an argument is used so that an empty string
    counts as 0 words (``''.split(' ')`` would yield ``['']``, i.e. 1) and
    so that runs of whitespace do not produce empty "words".

    Args:
        text: The string to measure.

    Returns:
        The word count as an int.
    """
    return len(text.split())

# Work on a copy so the helper column does not leak into `frame`.
tester = frame.copy()
tester['num_words'] = tester['content'].apply(get_length)
# numeric_only=True skips the text column; without it, pandas >= 2.0 raises a
# TypeError when asked for the mean of strings.
tester.groupby('domain').mean(numeric_only=True)

#### Looks like nyt articles on average have more words than breitbart articles. This is good to know for future analysis. 

### Last step before modelling: Getting the data in the proper format. 

In [None]:
# Binary target: 1 marks a breitbart article, 0 marks any other domain.
encodings = [int(domain == 'breitbart') for domain in frame['domain']]
# Modelling table: every column of `frame` except 'domain', plus the new label.
data = frame.drop(columns='domain').assign(label=encodings)

In [None]:
# Preview the modelling table ('domain' dropped, 'label' column added).
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the article texts; targets are the 0/1 outlet labels.
X = data['content'].values
y = data['label'].values

# Hold out 25% of the articles for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# We will use the tf-idf vectorizer to encode our text as numerical vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary cap: the word count of the longest nytimes article. Selecting
# 'num_words' before max() avoids aggregating the text column, and int() hands
# scikit-learn a plain Python integer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=int(tester.groupby('domain')['num_words'].max()['nytimes']),
)
# Learn the vocabulary and IDF weights from the training split only; fitting
# on all documents would leak test-set information into the features.
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

In [None]:
# Dimensions of the vectorized training matrix, then of the test matrix,
# one per line.
print(X_train_cv.shape, X_test_cv.shape, sep='\n')

## Modeling 

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

### Model #1: Naive Bayes 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the TF-IDF training matrix and score the test split.
nb_classifier = MultinomialNB().fit(X_train_cv, y_train)
y_pred = nb_classifier.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# Labels are encoded 1 = breitbart, 0 = otherwise (nytimes). target_names is
# matched to `labels` position by position, so label 0 must be named first.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['nytimes', 'breitbart']))

In [None]:
# Rows (true) and columns (predicted) follow labels=[0, 1]; labels are encoded
# 0 = nytimes (non-breitbart) and 1 = breitbart, so the names go in that order.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                     index=['nytimes', 'breitbart'],
                     columns=['nytimes', 'breitbart'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Naive Bayes Classifier');

In [None]:
# Let's plot a ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Scores for the ROC analysis: the second predict_proba column from the
# Naive Bayes model, and a constant 0 for the "no skill" baseline.
y_probs = nb_classifier.predict_proba(X_test_cv)
nb_probs = y_probs[:, 1]
ns_probs = [0] * len(y_test)

# Area under the curve for the baseline and for the model.
ns_auc = roc_auc_score(y_test, ns_probs)
nb_auc = roc_auc_score(y_test, nb_probs)
print('No Skill: ROC AUC=%.3f' % ns_auc)
print('Naive Bayes: ROC AUC=%.3f' % nb_auc)

# False/true positive rates along each curve (thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_probs)

# Draw both curves on one labelled figure.
plt.figure(figsize=(10, 7))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(nb_fpr, nb_tpr, marker='.', label='Naive Bayes')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend();

### Model #2: Linear Kernel Support Vector Machine 

In [None]:
from sklearn.linear_model import SGDClassifier

# Hinge loss with an L2 penalty trained by SGD, i.e. a linear SVM. tol=None
# disables early stopping, so training runs for the full max_iter epochs.
linsvm_classifier = SGDClassifier(loss = 'hinge', penalty = 'l2', tol = None, max_iter = 1000)
linsvm_classifier.fit(X_train_cv, y_train)
y_pred = linsvm_classifier.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# Labels are encoded 1 = breitbart, 0 = otherwise (nytimes). target_names is
# matched to `labels` position by position, so label 0 must be named first.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['nytimes', 'breitbart']))

In [None]:
# Rows (true) and columns (predicted) follow labels=[0, 1]; labels are encoded
# 0 = nytimes (non-breitbart) and 1 = breitbart, so the names go in that order.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                     index=['nytimes', 'breitbart'],
                     columns=['nytimes', 'breitbart'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Linear Kernel SVM Classifier');

### Model #3: Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters, scored on the test split.
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_cv, y_train)
y_pred = rf_classifier.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# Labels are encoded 1 = breitbart, 0 = otherwise (nytimes). target_names is
# matched to `labels` position by position, so label 0 must be named first.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['nytimes', 'breitbart']))

In [None]:
# Rows (true) and columns (predicted) follow labels=[0, 1]; labels are encoded
# 0 = nytimes (non-breitbart) and 1 = breitbart, so the names go in that order.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                     index=['nytimes', 'breitbart'],
                     columns=['nytimes', 'breitbart'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Random Forest Classifier');

### Model #4: Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn's gradient boosting with default hyper-parameters (this is not
# the separate xgboost library, despite the variable name).
xg_classifier = GradientBoostingClassifier()
xg_classifier.fit(X_train_cv, y_train)
y_pred = xg_classifier.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# Labels are encoded 1 = breitbart, 0 = otherwise (nytimes). target_names is
# matched to `labels` position by position, so label 0 must be named first.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['nytimes', 'breitbart']))

In [None]:
# Rows (true) and columns (predicted) follow labels=[0, 1]; labels are encoded
# 0 = nytimes (non-breitbart) and 1 = breitbart, so the names go in that order.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                     index=['nytimes', 'breitbart'],
                     columns=['nytimes', 'breitbart'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Gradient Boost Classifier');

### Model #5: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, scored on the test split.
logreg_classifier = LogisticRegression()
logreg_classifier.fit(X_train_cv, y_train)
y_pred = logreg_classifier.predict(X_test_cv)
# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred))
# Labels are encoded 1 = breitbart, 0 = otherwise (nytimes). target_names is
# matched to `labels` position by position, so label 0 must be named first.
print(classification_report(y_test, y_pred, labels=[0, 1],
                            target_names=['nytimes', 'breitbart']))

In [None]:
# Rows (true) and columns (predicted) follow labels=[0, 1]; labels are encoded
# 0 = nytimes (non-breitbart) and 1 = breitbart, so the names go in that order.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                     index=['nytimes', 'breitbart'],
                     columns=['nytimes', 'breitbart'])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Logistic Regression Classifier');

In [None]:
# Let's plot a ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Scores for the ROC analysis: the second predict_proba column from the
# logistic regression model, and a constant 0 for the "no skill" baseline.
y_probs = logreg_classifier.predict_proba(X_test_cv)
logreg_probs = y_probs[:, 1]
ns_probs = [0] * len(y_test)

# Area under the curve for the baseline and for the model.
ns_auc = roc_auc_score(y_test, ns_probs)
logreg_auc = roc_auc_score(y_test, logreg_probs)
print('No Skill: ROC AUC=%.3f' % ns_auc)
print('Logistic Regression: ROC AUC=%.3f' % logreg_auc)

# False/true positive rates along each curve (thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
logreg_fpr, logreg_tpr, _ = roc_curve(y_test, logreg_probs)

# Draw both curves on one labelled figure.
plt.figure(figsize=(10, 7))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(logreg_fpr, logreg_tpr, marker='.', label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend();

### The top models in decreasing order were: Linear Kernel SVM, Logistic Regression, Random Forest, Gradient Boosting, Naive Bayes

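### To get a more precise idea of the test accuracy, let's get 95% Confidence Intervals for the top 3 models. We will do this with a bootstrap-style procedure: repeatedly re-split the data at random, re-fit the model, and take percentiles of the resulting test accuracies.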

In [None]:
def bootstrap_CI(model, X_data, y_data, num_repetitions, percent_confidence):
    """Estimate a percentile interval for a model's held-out accuracy.

    Despite the name, no sampling with replacement takes place. Each
    repetition:
      1. draws a fresh random 75/25 train/test split of the data;
      2. fits a new TF-IDF vectorizer on the training texts only, so nothing
         from the test texts leaks into the features;
      3. fits a new ``model()`` on the vectorized training data;
      4. records the accuracy of its predictions on the test part.
    The interval is then read off the percentiles of the recorded accuracies.

    Args:
        model: Zero-argument callable returning an unfitted estimator that has
            ``fit`` and ``predict`` (e.g. a scikit-learn classifier class). It
            is always called without arguments.
        X_data: Text documents to split and vectorize.
        y_data: Labels aligned with ``X_data``.
        num_repetitions: Number of random splits to evaluate.
        percent_confidence: Width of the central interval in percent, e.g. 95.

    Returns:
        ``np.array([low_score, high_score])``: the lower and upper percentile
        of the accuracy scores.
    """
    # Loop-invariant vocabulary cap (word count of the longest nytimes
    # article), computed once instead of on every repetition.
    max_features = int(tester.groupby('domain')['num_words'].max()['nytimes'])
    scores = []
    for counter, _ in enumerate(np.arange(num_repetitions), start=1):
        print('This is the {}th iteration'.format(counter))
        X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25)
        vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
        # Fit on the training texts only; the test texts are merely transformed.
        X_train_cv = vectorizer.fit_transform(X_train)
        X_test_cv = vectorizer.transform(X_test)
        classifier = model()
        classifier.fit(X_train_cv, y_train)
        y_predictions = classifier.predict(X_test_cv)
        scores.append(np.mean(y_predictions == y_test))
    bootstrap_statistics = np.array(scores)
    # Split the excluded mass evenly between the two tails.
    low_end = (100 - percent_confidence) / 2
    high_end = 100 - low_end
    low_score = percentile(low_end, bootstrap_statistics)
    high_score = percentile(high_end, bootstrap_statistics)
    return np.array([low_score, high_score])

In [None]:
# 95% accuracy interval for logistic regression over 100 random train/test splits.
bootstrap_CI(LogisticRegression, X, y, 100, 95)

In [None]:
# 95% accuracy interval for the random forest over 100 random train/test splits.
bootstrap_CI(RandomForestClassifier, X, y, 100, 95)

In [None]:
# 95% accuracy interval for the SGD classifier over 100 random train/test splits.
# NOTE(review): bootstrap_CI instantiates the model with no arguments, so this
# run uses SGDClassifier's defaults, not the loss/penalty/tol/max_iter settings
# of the linear-SVM cell above. Confirm that this is intended.
bootstrap_CI(SGDClassifier, X, y, 100, 95)