# Capstone Binary Classification Attempt
Completed by following this [Medium article](https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e).

Noah McIntire

## Imports and reading data

In [1]:
# Data handling
import pandas as pd
import numpy as np
# Text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NOTE(review): the next import repeats the word_tokenize import two lines up.
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# NLTK data packages used by the tokenising, POS-tagging, lemmatising and stop-word steps
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
# Model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Bag-of-words features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Word embeddings: Word2Vec (may deal with this later)
#! pip install gensim

[nltk_data] Downloading package punkt to /Users/ryanlipps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ryanlipps/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ryanlipps/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanlipps/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the train and test project tables from their CSV files (relative paths).
train_projects, test_projects = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/lc_train.csv', '../data/lc_test.csv')
)

In [3]:
def _count_space_separated_pieces(text):
    """Return how many pieces `text` yields when split on single spaces.

    The split is on ' ' (not on arbitrary whitespace), so consecutive spaces
    contribute empty pieces to the count.
    """
    return len(text.split(' '))


# Map over the one text column instead of applying row-wise across the whole
# frame: the count only depends on `standardized_text`, so there is no need
# to materialise every row just to read a single field from it.
train_projects['text_len'] = train_projects['standardized_text'].map(_count_space_separated_pieces)
test_projects['text_len'] = test_projects['standardized_text'].map(_count_space_separated_pieces)

In [4]:
# Exclusive upper bound on `text_len`; rows at or above it are dropped from both splits.
# NOTE(review): the rationale for 512 is not stated in this notebook - document it.
MAX_TEXT_LEN = 512

# .copy() gives each filtered split its own independent frame, so columns added
# later are written to that frame rather than to a selection of the loaded data.
train_projects = train_projects.loc[train_projects['text_len'] < MAX_TEXT_LEN].copy()
test_projects = test_projects.loc[test_projects['text_len'] < MAX_TEXT_LEN].copy()

## Text Preprocessing

In [5]:
# Convert to lowercase, strip, and remove markup, punctuation and digits.
# The patterns are compiled once here rather than on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise one raw document into lowercase, space-separated word characters.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. delete ``<...>`` tags;
      3. replace every ASCII punctuation character with a space;
      4. delete any remaining non-word, non-space character (for example
         non-ASCII punctuation) without inserting a space;
      5. replace every digit with a space;
      6. collapse whitespace runs to a single space and strip the ends.

    Bracketed numeric citations such as ``[12]`` need no dedicated rule: the
    brackets are removed in step 3 and the digits in step 5.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _ASCII_PUNCT_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Digits at either end of the text become spaces, so strip after collapsing.
    return _WHITESPACE_RE.sub(' ', text).strip()
 
# STOPWORD REMOVAL
# Cache for the stop-word collection so the NLTK corpus is read only once.
_STOPWORD_SETS = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_SETS:
        _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_SETS['english']


def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Args:
        string: text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Look the collection up once per call instead of once per token.
    english = _english_stopwords()
    return ' '.join(token for token in string.split() if token not in english)

#LEMMATIZATION
# Shared WordNet lemmatizer instance, created once and reused by `lemmatizer` for every token.
wl = WordNetLemmatizer()
 
# Helper that maps an NLTK part-of-speech tag onto a WordNet POS constant
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [6]:
def finalpreprocess(string):
    """Run one document through the full pipeline: preprocess, stopword, then lemmatizer."""
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)
# Build the cleaned-text column for both splits by passing each document through finalpreprocess.
train_projects['clean_text'] = train_projects['standardized_text'].apply(finalpreprocess)
test_projects['clean_text'] = test_projects['standardized_text'].apply(finalpreprocess)

In [7]:
# Features are the cleaned text; labels come from the `target` column.
X_train, y_train = train_projects['clean_text'], train_projects['target']
X_test, y_test = test_projects['clean_text'], test_projects['target']

In [8]:
#Tf-Idf
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer rather than being refitted.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [12]:
# Fit an L2-penalised logistic regression (liblinear solver) on the TF-IDF training features
lr_tfidf = LogisticRegression(penalty='l2', C=10, solver='liblinear').fit(X_train_vectors_tfidf, y_train)

# Hard class predictions for the test set, plus the second predict_proba column
# (assumes that column corresponds to label 1 - TODO confirm via lr_tfidf.classes_)
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]

# Per-class precision / recall / F1 report and the confusion matrix
print(classification_report(y_test, y_predict, digits=4))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# ROC curve from the predicted probabilities, summarised as the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0     0.6538    0.5152    0.5763       165
           1     0.8969    0.9393    0.9176       741

    accuracy                         0.8620       906
   macro avg     0.7754    0.7272    0.7469       906
weighted avg     0.8526    0.8620    0.8554       906

Confusion Matrix: [[ 85  80]
 [ 45 696]]
AUC: 0.8694965852860589
