# PersonaGraph Text Classification Project

In [1]:
# link to repo: https://github.com/ryanlrappa/dsi-final-assessment-2/blob/master/assessment2.md

In [18]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from xgboost import XGBClassifier

In [3]:
# Show the kernel's current working directory.
os.getcwd()

'/Users/ryanrappa/Desktop/portfolio-projects/nlp-challenge/submissions'

In [4]:
# Data directory, expressed relative to the notebook's working directory
# (the notebook runs from `submissions/`, a sibling of `data/`) so the
# notebook is not tied to one machine's absolute path.
# NOTE: start the kernel from the notebook's own folder, or adjust DATA_DIR.
DATA_DIR = os.path.join(os.pardir, 'data')

# X: app descriptions, one array element per line of the file
X_file = os.path.join(DATA_DIR, 'train.txt')
with open(X_file) as descriptions_fh:  # `with` closes the handle after reading
    X = np.array(descriptions_fh.read().splitlines())

# y (label): whether fitness app or not, one array element per line of the file
y_file = os.path.join(DATA_DIR, 'labels.txt')
with open(y_file) as labels_fh:
    y = np.array(labels_fh.read().splitlines())

# Descriptions and labels are paired by position, so fail fast if the two
# files do not contain the same number of lines.
assert len(X) == len(y), f'{len(X)} descriptions but {len(y)} labels'

In [5]:
# Hold out 33% of the samples as a test set.
# The split is stratified on y because the labels are not evenly distributed
# in the data; the random seed is fixed so the split is repeatable.
split_parts = train_test_split(X, y, stratify=y, test_size=0.33, random_state=908)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Tokenize the text
# The vectorizer is fitted on the training descriptions only and turns them
# into a matrix of token counts; the fitted `count_vect` is reused later to
# encode the test split.
count_vect = CountVectorizer()  #consider parameter tuning here?
X_train_counts = count_vect.fit_transform(X_train)

In [7]:
# Generate tf-idf
# The transformer is fitted on the training counts and converts them to
# tf-idf features; the fitted `tfidf_transformer` is reused later to
# transform the test split.

### nice explanation of tf-idf under "From occurrences to frequencies" here: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
### and in the first answer here: https://stats.stackexchange.com/questions/153069/bag-of-words-for-text-classification-why-not-just-use-word-frequencies-instead
tfidf_transformer = TfidfTransformer()  #consider parameter tuning here?
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [8]:
# Dimensions of the training tf-idf matrix: (training documents, vocabulary terms)
X_train_tfidf.shape

(1889, 29037)

In [9]:
# Train a model (classifier)
# Trying multinomial NB first, with its default settings, fitted on the
# tf-idf features of the training split
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [12]:
# Evaluate the multinomial NB model on the held-out test split.
# The test text is passed through the vectorizer and tf-idf transformer that
# were already fitted on the training data (transform only, no re-fitting).
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
preds = clf.predict(X_test_tfidf)

# Accuracy: fraction of test labels that were predicted correctly
is_correct = preds == y_test
is_correct.mean()

0.9044038668098818

In [13]:
# Trying a linear SVM, this time bundling vectorizer, tf-idf and classifier
# into one sklearn Pipeline instead of chaining the steps by hand.

### loss='hinge' makes SGDClassifier fit a linear SVM
### SGD means the model is optimized by stochastic gradient descent
### max_iter=5 together with tol=None caps training at 5 passes over the
### data with no tolerance-based early stopping; random_state is fixed so
### the result is repeatable
svm_estimator = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    max_iter=5,
    tol=None,
    random_state=908,
)
SVM_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_estimator),
])

In [14]:
# Fit every pipeline step on the raw training text; the trailing ';' hides the cell's output.
SVM_clf.fit(X_train, y_train);



In [15]:
# Predict labels for the raw test text; the pipeline applies its own fitted vectorizer and tf-idf steps.
SVM_preds = SVM_clf.predict(X_test)

In [16]:
# Accuracy of the SVM pipeline on the test split -- higher than multinomial NB
# ideally should do k-fold cross validation at some point
svm_is_correct = SVM_preds == y_test
svm_is_correct.mean()

0.920515574650913

In [17]:
# Trying Bernoulli NB, using the same vectorizer -> tf-idf -> classifier layout.
# NOTE(review): BernoulliNB binarizes its input (scikit-learn documents a
# default threshold of 0.0), so the tf-idf weighting probably has no effect
# on this model -- confirm before reading much into the comparison.
bnb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB()),
]
BNB_clf = Pipeline(bnb_steps)

# fit() returns the fitted pipeline, so training and prediction can be chained
BNB_preds = BNB_clf.fit(X_train, y_train).predict(X_test)

# accuracy slightly worse than MNB, worse than SVM
(BNB_preds == y_test).mean()

0.9022556390977443

In [None]:
## Trying XGBoost
# The pipeline is only defined in this cell; it is not fitted or evaluated yet.
# NOTE(review): the labels are read from a text file as strings; recent
# xgboost releases may require integer-encoded class labels -- confirm
# before calling fit().
xgb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
]
xgb_clf = Pipeline(xgb_steps)



In [None]:
# next steps: 

### 1. try more models, such as those discussed here:
### https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

### 2. grid search/optimize the model that performs best initially


## Analysis: