# Text Cleaning and Feature Extraction

In [1]:
# use data from kaggle Impermium competition
# https://www.kaggle.com/c/detecting-insults-in-social-commentary/data?train.csv

# download train.csv as the training set and
# impermium_verification_labels.csv as the test set

In [2]:
# Data
import pandas as pd
import numpy as np
import csv
import os

# Text Data
from bs4 import BeautifulSoup
import re
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from io import StringIO


# Plotting
%matplotlib inline
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import savefig


# Analysis with Random Forests 

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, auc

# Analysis with Support Vector Machines
from sklearn import svm

# for timing our large calls
import time

# for randomly sampling from data
import random

# split into training and test
from sklearn.cross_validation import train_test_split

#
from collections import Counter


In [3]:
# Set the appropriate path

path = '/Users/Seth/Documents/DSI/notes/DS-6001/Python/capstone-homework/'

#path = '/Users/deb/Dropbox/department/Classes/DS6001/DS6001_2016\
#/Lectures/Session17_TextFeatures/'


# IF YOU JUST WANT TO LOAD THE DATA FRAME FROM FILE, SKIP AHEAD
### to the cell below that says "...OR LOAD THE DATA FRAME FROM FILE"
### Or proceed if you want to generate from the raw data (takes 1-2 hours to process)

In [40]:

# ## Stopwords
# Include 
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("maxent_treebank_pos_tagger")


# ### Stemming 
# 

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


# ## Lemmatizing
# 

from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
    
# Function for POS tagging

def get_wordnet_pos(tagged):
            treebank_tag = nltk.pos_tag(tagged)
            if treebank_tag[0][1].startswith('J'):
                return '_pos_' + wordnet.ADJ
            elif treebank_tag[0][1].startswith('V'):
                return '_pos_' + wordnet.VERB
            elif treebank_tag[0][1].startswith('N'):
                return '_pos_' + wordnet.NOUN
            elif treebank_tag[0][1].startswith('R'):
                return '_pos_' + wordnet.ADV
            else:
                #return '_pos_' + wordnet.NOUN
                return '_pos_UNK'

[nltk_data] Downloading package stopwords to /Users/Seth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Seth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /Users/Seth/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


In [41]:
def clean_text(text, method = 'words'):
    #method='words'
    
    #parse html
    text = BeautifulSoup(str(text), "html.parser")
    
    #tag emails
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', '_EM', str(text))

    # Replace url's with tags
    text = re.sub(r'\w+:\/\/\S+', r'_U', text)

    # Replace unicode spacing with space
    # Here we just use replace
    text = text.replace('\\xa0', ' ')

    # Replace single quotes and quotes with space or no space
    # use replace
    text = text.replace('\'', '')
    text = text.replace('"', ' ')
    
    # Replace new line commands
    text = text.replace('\\n', ' ')

    # Replace underscores and hyphens with space

    text = text.replace('_', ' ')
    text = text.replace('-', ' ')

    # Replace extra white space
    text = re.sub(' +',' ', text)
    text = text.replace('\'', '') # Notice we will need this


    # ### Punctuation
    # Replace punctuation strings with tag _SS\n
    text = re.sub(r'([^\.])(\.{2,})', r'\1 _SS\n', text)

    # strings of questions
    text = re.sub(r'([^!\?])(\?{2,})(\Z|[^!\?])', r'\1 _BQ\n\3', text)

    # Finding and tagging questions
    text = re.sub(r'([^!\?])\?(\Z|[^!\?])', r'\1 _Q\n\2',text)

    # Exclamation mark tagging
    text = re.sub(r'([^!\?])!(\Z|[^!\?])', r'\1 _X\n\2', text)

    # Extra letters
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1_EL', text)

    # Removing periods in acronyms
    text = re.sub(r'(\w+)\.(\w+)', r'\1\2', text)
    text = re.sub(r'(\w+)\.(\w+)', r'\1\2', text)

    # Tagging character stings used as expressions
    # likely used as a swear word
    text = re.sub(r'([#%&\*\$]{2,})(\w*)', r'\1\2 _SW', text)

    # Removing numbers
    text = re.sub('[1|2|3|4|5|6|7|8|9|0]', '_NUM', text)
    
    # ID and tokenize smileys
    # Big smile
    text = re.sub(r' [8x;:=]-?(?:\)|\}|\]|>){2,}', r' _BS', text)
    text = re.sub(r' (?:[;:=]-?[\)\}\]d>])|(?:<3)', r' _S', text)
    text = re.sub(r' [x:=]-?(?:\(|\[|\||\\|/|\{|<){2,}', r' _BF', text)
    text = re.sub(r' [x:=]-?[\(\[\|\\/\{<]', r' _F', text)

    #################
    #return text
    #################

    
    #################
    # NOW TOKE
    #################

    phrases = re.split(r'[;:\.()\n]', text)
    phrases = [re.findall(r'[\w%\*&#]+', ph) for ph in phrases]
    phrases = [ph for ph in phrases if ph]

    words = []  

    for ph in phrases:
       words.extend(ph)

    # Look for and remove single letters sequences
    # Try this one, but run it through
    # the previous cell first.

    tmp = words
    words = []
    new_word = ''
    for word in tmp:
       if len(word) == 1:
          new_word = new_word + word
       else:
          if new_word:
             words.append(new_word)
             new_word = ''
          words.append(word)
        
    # Remove stopwords
    words = [w for w in words if not w in stopwords.words("english")]
    words


    if method == "words":
        return words
    elif method == "pos":
        words_pos = []
        for word in words:
            words_pos.append(get_wordnet_pos(word))
        return words_pos
    elif method == "stem":
        words_stem = []
        for word in words:
            words_stem.append(stemmer.stem(word))
        return words_stem
    elif method == "lemma":
        words_lemma = []
        for word in words:
            words_lemma.append(lemmatizer.lemmatize(word))
        return words_lemma
    elif method == 'all':
        # pos
        words_pos = []
        for word in words:
            words_pos.append(get_wordnet_pos(word))
        # stem
        words_stem = []
        for word in words:
            words_stem.append(stemmer.stem(word))
        # lemma
        words_lemma = []
        for word in words:
            words_lemma.append(lemmatizer.lemmatize(word))
        #
        return [words, words_pos, words_stem, words_lemma]
    elif method == 'pos stem':
        # pos
        words_pos = []
        for word in words:
            words_pos.append(get_wordnet_pos(word))
        # stem
        words_stem = []
        for word in words:
            words_stem.append(stemmer.stem(word))

        return words_stem + words_pos
    else:
        print("NO/INVALID METHOD SUPPLIED, returning words")
        return words



In [42]:
rawPath = '/Users/Seth/Documents/DSI/Capstone/DSI-Religion-2017/data_dsicap_FULL/'

groups = os.listdir(rawPath)

# remove these red herrings if necessary
naw = ['.DS_Store', 'test_train', 'norun', 'fake_data', 'ref']
[groups.remove(x) for x in groups if x in naw]
#
rawFileList=[]
rawGroupList=[]
rawTextList=[]
for groupId in groups:
    for dirpath, dirnames, filenames in os.walk(rawPath+groupId+'/raw'):
        ## clean out non .txt files
        filenames = [filename for filename in filenames if ".txt" in filename]
        rawFileList = rawFileList + filenames
        for filename in filenames:   
            #Extract raw text and update for encoding issues            
            rawData=open(rawPath+groupId+'/raw/'+filename, 'r', encoding="utf-8", errors="ignore" ).read() #
            rawTextList.append(rawData)
            # store groupId
            rawGroupList.append(groupId)

# these three should be the same length
print(len(rawFileList))
print(len(rawTextList))
print(len(rawGroupList))
#
#print(rawGroupList)


3519
3519
3519


In [43]:
# make into data frame
d = {'groupName': rawGroupList, 'file': rawFileList, 'rawText':rawTextList}
capDF = pd.DataFrame(data=d)
print(capDF.head())

           file   groupName                                            rawText
0   12.html.txt  DorothyDay  \nThe Catholic Worker, May 1933, 4 (First Issu...
1   13.html.txt  DorothyDay  \nThe Catholic Worker, April 1934, 3-4.\nSumma...
2  142.html.txt  DorothyDay  \nThe Catholic Worker, January 1936, 4.\nSumma...
3  143.html.txt  DorothyDay  \nThe Catholic Worker, January 1936, 1-2.\nSum...
4  145.html.txt  DorothyDay  \nThe Catholic Worker, February 1938, 1-2.\nSu...


In [44]:
# add ranks

groupNameList=['WBC', 'PastorAnderson', 'NaumanKhan', 'DorothyDay', 'JohnPiper', 'Shepherd',
'Rabbinic', 'Unitarian', 'MehrBaba','NawDawg','SeaShepherds','IntegralYoga','Bahai','ISIS']
groupRankList=[1,2,3,4,4,4,6,7,8,4,2,7,6,1]

groupRankDF=pd.DataFrame([[groupNameList[i],groupRankList[i]] for i in range(len(groupNameList))],columns=['groupName','rank'])

capDF=capDF.merge(groupRankDF, on='groupName')
print(capDF.head())
print(capDF.tail())

           file   groupName  \
0   12.html.txt  DorothyDay   
1   13.html.txt  DorothyDay   
2  142.html.txt  DorothyDay   
3  143.html.txt  DorothyDay   
4  145.html.txt  DorothyDay   

                                             rawText  rank  
0  \nThe Catholic Worker, May 1933, 4 (First Issu...     4  
1  \nThe Catholic Worker, April 1934, 3-4.\nSumma...     4  
2  \nThe Catholic Worker, January 1936, 4.\nSumma...     4  
3  \nThe Catholic Worker, January 1936, 1-2.\nSum...     4  
4  \nThe Catholic Worker, February 1938, 1-2.\nSu...     4  
                                         file groupName  \
3514  WestboroBaptist_Sermon_20150823.pdf.txt       WBC   
3515  WestboroBaptist_Sermon_20150830.pdf.txt       WBC   
3516  WestboroBaptist_Sermon_20150906.pdf.txt       WBC   
3517  WestboroBaptist_Sermon_20150913.pdf.txt       WBC   
3518  WestboroBaptist_Sermon_20150920.pdf.txt       WBC   

                                                rawText  rank  
3514  "Likewise the Spirit a

## make the classes roughly equal

In [47]:
print(capDF['rank'].value_counts())
print(capDF['groupName'].value_counts())


4    2081
1     419
7     276
8     265
2     228
6     166
3      84
Name: rank, dtype: int64
DorothyDay        774
Shepherd          728
JohnPiper         579
WBC               419
Unitarian         276
MehrBaba          265
PastorAnderson    228
Rabbinic          166
NaumanKhan         84
Name: groupName, dtype: int64


In [93]:
groupsToCut = ['WBC', 'DorothyDay', 'JohnPiper', 'Shepherd']
#group = groupsToCut[0]
#print(group)
cuts = []
for group in groupsToCut:
    indAll = capDF[capDF['groupName']==group].index
    if group == 'WBC':
        cuts = cuts + random.sample(indAll.tolist(), int(float(len(indAll)) * .5))
    else:
        cuts = cuts + random.sample(indAll.tolist(), int(float(len(indAll)) * .8))

len(cuts)

1873

In [94]:
allIndexes = capDF.index
print(len(allIndexes))

NormIndexes = [x for x in allIndexes if x not in cuts]
print(len(NormIndexes))
print(len(allIndexes) - len(cuts)) # should be the same, just checking

3519
1646
1646


In [95]:
print(capDF.shape)
capNorm = capDF.loc[NormIndexes]
print(capNorm.shape)

(3519, 4)
(1646, 4)


In [96]:
print(capNorm['rank'].value_counts())
print(capNorm['groupName'].value_counts())


4    417
7    276
8    265
2    228
1    210
6    166
3     84
Name: rank, dtype: int64
Unitarian         276
MehrBaba          265
PastorAnderson    228
WBC               210
Rabbinic          166
DorothyDay        155
Shepherd          146
JohnPiper         116
NaumanKhan         84
Name: groupName, dtype: int64


## Run the cleaning function



In [103]:
start=time.time()
capNorm_x = capNorm.rawText.apply(clean_text, args = ('pos stem',))
print("training set took " + str(time.time() - start) + " seconds.")

capNorm_x



training set took 2664.0750579833984 seconds.


0       [The, Cathol, Worker, May, _NUM_NUM_NUM_NUM, _...
4       [The, Cathol, Worker, Februari, _NUM_NUM_NUM_N...
7       [The, Cathol, Worker, Februari, _NUM_NUM_NUM_N...
17      [The, Cathol, Worker, May, _NUM_NUM_NUM_NUM, S...
25      [The, Cathol, Worker, Novemb, _NUM_NUM_NUM_NUM...
34      [The, Cathol, Worker, May, _NUM_NUM_NUM_NUM, _...
38      [The, Cathol, Worker, Decemb, _NUM_NUM_NUM_NUM...
53      [The, Cathol, Worker, Decemb, _NUM_NUM_NUM_NUM...
56      [Chapter, _NUM, Summari, Consid, difficult, ta...
62      [Chapter, _NUM, Report, Summari, Describ, life...
65      [Chapter, _NUM_NUM, Peac, Summari, A, vivid, d...
67      [Chapter, _NUM_NUM, Wheat, Cockl, Summari, Ans...
70      [The, Cathol, Worker, May, _NUM_NUM_NUM_NUM, _...
72      [The, Cathol, Worker, June, _NUM_NUM_NUM_NUM, ...
102     [The, Cathol, Worker, Juli, August, _NUM_NUM_N...
104     [The, Cathol, Worker, March, _NUM_NUM_NUM_NUM,...
112     [The, Cathol, Worker, Septemb, _NUM_NUM_NUM_NU...
113     [The, 

In [104]:
#np.savetxt(path+'cap_train_x.csv', cap_train_x, delimiter=',')

#np.savetxt(path+'cap_test_x.csv', cap_test_x, delimiter=',')
type(capNorm_x)
#cap_train_x.to_csv(path+'cap_train_x.csv', index=False)
#cap_test_x.to_csv(path+'cap_test_x.csv', index=False)



pandas.core.series.Series

## Make TF-IDF: consider sparsity options

In [105]:
# Checking out the upper bound

swds = stopwords.words('english')

vect100 = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = 0.2, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
cap_tf = vect100.fit_transform(docs_new).toarray()
print('at 100%, ' + str(cap_tf.shape[1]) + ' terms.')

vect99 = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = 0.2, max_df = 0.99, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
cap_tf = vect99.fit_transform(docs_new).toarray()
print('at 99%, ' + str(cap_tf.shape[1]) + ' terms.')

vect90 = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = 0.2, max_df = 0.9, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
cap_tf = vect90.fit_transform(docs_new).toarray()
print('at 90%, ' + str(cap_tf.shape[1]) + ' terms.')

vect80 = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = 0.2, max_df = 0.8, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
cap_tf = vect80.fit_transform(docs_new).toarray()
print('at 80%, ' + str(cap_tf.shape[1]) + ' terms.')

vect50 = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = 0.2, max_df = 0.5, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
cap_tf = vect50.fit_transform(docs_new).toarray()
print('at 50%, ' + str(cap_tf.shape[1]) + ' terms.')


at 100%, 958 terms.
at 99%, 935 terms.
at 90%, 889 terms.
at 80%, 854 terms.
at 50%, 665 terms.


In [106]:
# looking at the terms that are in more than 50% of documents
terms100 = vect100.get_feature_names()
terms99 = vect99.get_feature_names()
terms50 = vect50.get_feature_names()
#print(terms100)
#print(terms50)
print("not in 99")
print([term for term in terms100 if term not in terms99])
print("not in 50")
print([term for term in terms100 if term not in terms50])

not in 99
['_pos_a', '_pos_a _pos_n', '_pos_a _pos_n _pos_n', '_pos_n', '_pos_n _pos_a', '_pos_n _pos_a _pos_n', '_pos_n _pos_n', '_pos_n _pos_n _pos_a', '_pos_n _pos_n _pos_n', '_pos_n _pos_n _pos_unk', '_pos_n _pos_n _pos_v', '_pos_n _pos_unk', '_pos_n _pos_unk _pos_n', '_pos_n _pos_v', '_pos_n _pos_v _pos_n', '_pos_unk', '_pos_unk _pos_a', '_pos_unk _pos_n', '_pos_unk _pos_n _pos_n', '_pos_unk _pos_v', '_pos_v', '_pos_v _pos_n', '_pos_v _pos_n _pos_n']
not in 50
['_num', '_num _num', '_num_num', '_num_num _num_num', '_num_num_num', '_num_num_num_num', '_pos_a', '_pos_a _pos_a', '_pos_a _pos_a _pos_a', '_pos_a _pos_a _pos_n', '_pos_a _pos_a _pos_unk', '_pos_a _pos_a _pos_v', '_pos_a _pos_n', '_pos_a _pos_n _pos_a', '_pos_a _pos_n _pos_n', '_pos_a _pos_n _pos_unk', '_pos_a _pos_n _pos_v', '_pos_a _pos_unk', '_pos_a _pos_unk _pos_a', '_pos_a _pos_unk _pos_n', '_pos_a _pos_unk _pos_unk', '_pos_a _pos_unk _pos_v', '_pos_a _pos_v', '_pos_a _pos_v _pos_a', '_pos_a _pos_v _pos_n', '_pos_a _

In [148]:
threshs = [.05,.04,.03,.02, .01, .005, .001, .0001]

for thresh in threshs:
    vectT = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                       min_df = thresh, max_df = 0.8, stop_words = swds)

    docs_new = [StringIO(str(x)) for x in capNorm_x]
    capT_tf = vectT.fit_transform(docs_new).toarray()
    print('at '+ str(thresh) + ', ' + str(capT_tf.shape[1]) + ' terms.')
    
    

    

at 0.05, 3834 terms.
at 0.04, 4882 terms.
at 0.03, 6604 terms.
at 0.02, 10552 terms.
at 0.01, 23134 terms.
at 0.005, 53181 terms.
at 0.001, 535921 terms.
at 0.0001, 3940759 terms.


## Make TF-IDF

In [108]:


vect = TfidfVectorizer(analyzer = "word",input="file", ngram_range = (1,3),\
                   min_df = 0.03, stop_words = swds)

docs_new = [StringIO(str(x)) for x in capNorm_x]
tf = vect.fit_transform(docs_new).toarray()

print(tf.shape)




(1646, 6708)


In [109]:
tf[:10,:10]


array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [110]:
vocab = pd.Series(vect.get_feature_names())
print(len(vocab))
vocab



6708


0                _bq
1            _bq _bq
2             _bq _q
3            _bq ask
4         _bq believ
5           _bq bibl
6       _bq bibl say
7           _bq come
8           _bq even
9            _bq get
10          _bq give
11            _bq go
12           _bq god
13          _bq good
14           _bq got
15           _bq hey
16          _bq jesu
17          _bq know
18           _bq let
19          _bq like
20          _bq look
21          _bq make
22           _bq man
23          _bq mean
24          _bq need
25            _bq oh
26           _bq one
27         _bq peopl
28        _bq realli
29         _bq right
            ...     
6678        year one
6679       year year
6680           yearn
6681            yell
6682          yellow
6683       yesterday
6684             yet
6685           yield
6686            yoke
6687             yom
6688            york
6689       york citi
6690            youd
6691           youll
6692      youll find
6693       youll see
6694         

In [111]:
######### DELETE END ########

In [141]:
tfidf = pd.DataFrame(tf, columns = vocab)

In [142]:
print(tfidf.shape)
print(capNorm.shape)

(1646, 6708)
(1646, 4)


In [136]:
# resets the index so that we can concatenate them properly
capNormReset = capNorm.reset_index(drop=True)
print(capNormReset.head())

           file   groupName  \
0   12.html.txt  DorothyDay   
1  145.html.txt  DorothyDay   
2  148.html.txt  DorothyDay   
3  158.html.txt  DorothyDay   
4  166.html.txt  DorothyDay   

                                             rawText  rank  
0  \nThe Catholic Worker, May 1933, 4 (First Issu...     4  
1  \nThe Catholic Worker, February 1938, 1-2.\nSu...     4  
2  \nThe Catholic Worker, February 1943, 1,4.\nSu...     4  
3  \nThe Catholic Worker, May 1948.\nSummary: 16t...     4  
4  \nThe Catholic Worker, November 1949, 1,2,4.\n...     4  


In [144]:
# concatenate by columns
fullDFnorm = pd.concat([capNormReset[['file', 'groupName', 'rank']], fullDF], axis=1)
fullDFnorm.shape

(1646, 6711)

In [145]:
# preview
fullDFnorm.head()

Unnamed: 0,file,groupName,rank,_bq,_bq _bq,_bq _q,_bq ask,_bq believ,_bq bibl,_bq bibl say,...,young woman,younger,youth,youtub,youv,youv got,zeal,zealou,zero,zion
0,12.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,145.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,148.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,158.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,166.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002552,0.0,0.0,0.0


In [146]:
fullDFnorm.to_csv(path+'fullDFnorm.csv')

### This is the end of processing the data
If you jumped from the top, this is where you jump to:


# ...OR LOAD THE DATA FRAME FROM FILE
To load from file, uncomment this next block.
### NOTE: if you processed everything, DO NOT uncomment this next block, because it will overwrite what you just processed.

In [4]:
fullDFnorm = pd.read_csv(path+'fullDFnorm1.csv', index_col = 0)
fullDFnorm.head()
# # it should look like the sample from 4 cells above

Unnamed: 0,file,groupName,rank,_bq,_bq _bq,_bq _q,_bq ask,_bq believ,_bq bibl,_bq bibl say,...,young woman,younger,youth,youtub,youv,youv got,zeal,zealou,zero,zion
0,12.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,145.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,148.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,158.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,166.html.txt,DorothyDay,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002552,0.0,0.0,0.0


## Now split them into testing and training

In [5]:
# set the seed to be able to replicate later
random.seed(100)
# training set
trainIndex = random.sample(fullDFnorm.index.tolist(), int(float(fullDFnorm.shape[0]) * .7))
print(trainIndex[:10])
print('$$$$$$')
print(len(trainIndex))
# testing set
testIndex = [x for x in fullDFnorm.index.tolist() if x not in trainIndex]
print(len(testIndex))
# double check
print(str(len(trainIndex)) + " + " + str(len(testIndex)) + " = " + str(len(trainIndex) + len(testIndex)))
# should be the total number of observations

[298, 941, 931, 1578, 357, 1444, 804, 1499, 716, 887]
$$$$$$
1152
494
1152 + 494 = 1646


In [6]:
# make the split
capNorm_train = fullDFnorm.loc[trainIndex]
print(capNorm_train.shape)
capNorm_test = fullDFnorm.loc[testIndex]
print(capNorm_test.shape)

(1152, 6711)
(494, 6711)


## create the model matrices and response vectors

In [7]:
# model matrices
train_x = capNorm_train.ix[:,3:]
test_x = capNorm_test.ix[:,3:]


In [8]:
print(train_x.shape)
train_x.head()

(1152, 6708)


Unnamed: 0,_bq,_bq _bq,_bq _q,_bq ask,_bq believ,_bq bibl,_bq bibl say,_bq come,_bq even,_bq get,...,young woman,younger,youth,youtub,youv,youv got,zeal,zealou,zero,zion
298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print(test_x.shape)
test_x.head()

(494, 6708)


Unnamed: 0,_bq,_bq _bq,_bq _q,_bq ask,_bq believ,_bq bibl,_bq bibl say,_bq come,_bq even,_bq get,...,young woman,younger,youth,youtub,youv,youv got,zeal,zealou,zero,zion
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001934,0.0,0.001238,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006589,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# response vectors
train_y = capNorm_train.ix[:,2]
test_y = capNorm_test.ix[:,2]
# NOTE: we have to use .ix[:,2] instead of ['rank'] because there is a term column called 'rank' too (just bad luck)
print(train_y.describe())
print(test_y.describe())

count    1152.000000
mean        4.560764
std         2.435929
min         1.000000
25%         2.000000
50%         4.000000
75%         7.000000
max         8.000000
Name: rank, dtype: float64
count    494.000000
mean       4.817814
std        2.362487
min        1.000000
25%        3.000000
50%        4.000000
75%        7.000000
max        8.000000
Name: rank, dtype: float64


## SVC Model

In [11]:
## C = 1 model
start = time.time()

svmCap1=svm.SVC(C=1,kernel='rbf',probability = True, max_iter=100000)
svmCap1.fit(train_x, train_y)

end=time.time()
print("took " + str(end-start) + ' secs to train.')

print(svmCap1.score(test_x, test_y))


took 70.78559517860413 secs to train.
0.265182186235


In [12]:
yPredsvmCap1=svmCap1.predict(test_x)
print(np.percentile(yPredsvmCap1, [1,25,50,75,99]))
Counter(yPredsvmCap1)

[ 4.  4.  4.  4.  4.]


Counter({4: 494})

In [13]:
yPredsvmCapProb1=svmCap1.predict_proba(test_x)
print(yPredsvmCapProb1[:])

[[ 0.00408745  0.00857772  0.00115311 ...,  0.02932597  0.03517524
   0.07446796]
 [ 0.00784145  0.02167918  0.00388981 ...,  0.05077572  0.14266959
   0.17579002]
 [ 0.02784172  0.03790868  0.03026507 ...,  0.13353116  0.50157045
   0.16990389]
 ..., 
 [ 0.58473724  0.08248346  0.02317545 ...,  0.04068084  0.06091575
   0.06680258]
 [ 0.58873529  0.08065772  0.02801592 ...,  0.04958703  0.06186046
   0.0611081 ]
 [ 0.49160471  0.0635083   0.01791365 ...,  0.05307586  0.05191606
   0.04943709]]


In [14]:
yPredsvmCapMAN = []
for probs in yPredsvmCapProb1:
    print(probs)
    print(np.argmax(probs) + 1)
    print(np.amax(probs))
    yPredsvmCapMAN.append(np.argmax(probs) + 1)

[ 0.00408745  0.00857772  0.00115311  0.84721255  0.02932597  0.03517524
  0.07446796]
4
0.847212547526
[ 0.00784145  0.02167918  0.00388981  0.59735424  0.05077572  0.14266959
  0.17579002]
4
0.597354236649
[ 0.02784172  0.03790868  0.03026507  0.09897904  0.13353116  0.50157045
  0.16990389]
6
0.501570445677
[ 0.02787936  0.01371437  0.00621265  0.76612003  0.07290239  0.06259974
  0.05057146]
4
0.766120031182
[ 0.02333492  0.0358645   0.02065135  0.26067834  0.13588474  0.3754208
  0.14816535]
6
0.375420803152
[ 0.00581216  0.00110826  0.00213596  0.9583942   0.02727817  0.00325505
  0.0020162 ]
4
0.958394202917
[ 0.00673152  0.00589612  0.0035941   0.83889519  0.07885392  0.04167803
  0.02435112]
4
0.83889519488
[ 0.00363202  0.01451443  0.00159397  0.788919    0.03252072  0.05496361
  0.10385626]
4
0.788918999208
[ 0.01171104  0.0048181   0.00354891  0.84592345  0.07188648  0.04078393
  0.02132808]
4
0.845923448895
[  8.85599237e-04   5.24891324e-03   1.53880274e-04   9.70011182e-

In [15]:
#print(yPredsvmCapMAN)
print(np.percentile(yPredsvmCapMAN, [1,25,50,75,99]))
Counter(yPredsvmCapMAN)

[ 1.  4.  6.  6.  7.]


Counter({1: 40, 2: 39, 3: 1, 4: 143, 6: 265, 7: 6})

In [16]:
samples = random.sample(range(0, test_y.shape[0]), 10)
for i in samples:
    print('$$$$ observation ' + str(i))
    print(yPredsvmCapProb1[i])
    print(yPredsvmCap1[i])
    print(yPredsvmCapMAN[i])
    print(test_y.iloc[i])

$$$$ observation 398
[ 0.02485462  0.04659952  0.01606724  0.09761608  0.08185594  0.48797352
  0.24503309]
4
6
7
$$$$ observation 96
[ 0.01858135  0.0442485   0.00914153  0.04008998  0.02723433  0.46705404
  0.39365027]
4
6
8
$$$$ observation 304
[ 0.04541841  0.04667468  0.02068045  0.14512439  0.11982963  0.46073298
  0.16153947]
4
6
6
$$$$ observation 54
[ 0.03746334  0.04454859  0.00992206  0.34974801  0.04350252  0.17601462
  0.33880087]
4
4
4
$$$$ observation 115
[ 0.01509214  0.036369    0.01363666  0.01792242  0.02626026  0.48777014
  0.40294937]
4
6
8
$$$$ observation 26
[ 0.0054599   0.00259167  0.00244289  0.88392119  0.0582237   0.03602197
  0.01133868]
4
4
4
$$$$ observation 245
[ 0.09642979  0.31902658  0.02034846  0.17249096  0.08399246  0.16710369
  0.14060806]
4
2
2
$$$$ observation 23
[ 0.00480936  0.0042352   0.0043922   0.87080005  0.08142697  0.0222103
  0.01212592]
4
4
4
$$$$ observation 486
[ 0.2443397   0.06895271  0.242181    0.16935017  0.09090671  0.11947268

## SVR model

In [17]:
## C = 1 model
start = time.time()

svmCapR1=svm.SVR(C=1,kernel='rbf', max_iter=100000)
svmCapR1.fit(train_x, train_y)

end=time.time()
print("took " + str(end-start) + ' secs to train.')

print(svmCapR1.score(test_x, test_y))

took 8.663609027862549 secs to train.
-0.0923306864963


In [18]:
yPredsvmCapR1=svmCapR1.predict(test_x)
print(np.percentile(yPredsvmCapR1, [1,25,50,75,99]))

[ 4.09899855  4.09968176  4.09989874  4.10007909  4.10049449]


## Look at results

In [19]:
# copy the identifiers from the test DF
resultsDF = capNorm_test.ix[:,:3].copy()
# add in the predictions
resultsDF = resultsDF.assign(SVMMAN = yPredsvmCapMAN)
#
resultsDF = resultsDF.assign(SVM1 = yPredsvmCap1)
resultsDF = resultsDF.assign(SVR1 = yPredsvmCapR1)


In [20]:
resultsDF.ix[random.sample(resultsDF.index.tolist(), 10)]
#print(yPredsvmCapR1)

Unnamed: 0,file,groupName,rank,SVMMAN,SVM1,SVR1
1028,answer-me-quickly-lord-steve-shepherd-sermon-o...,Shepherd,4,4,4,4.099753
765,transcript_nwo_babylon.html.txt,PastorAnderson,2,4,4,4.099785
1643,WestboroBaptist_Sermon_20150809.pdf.txt,WBC,1,1,4,4.099467
830,transcript_what_will_ye_do.html.txt,PastorAnderson,2,6,4,4.099729
1373,"2011-09-18 Hospitality as a Mission, Rev. Mark...",Unitarian,7,6,4,4.100093
1404,"2013-05-05 Can We Really Agree to Disagree, Re...",Unitarian,7,6,4,4.100105
1014,a-blessed-attitude-steve-shepherd-sermon-on-se...,Shepherd,4,4,4,4.099879
1201,"2003-01-05 A Simple Faith, Rev. Mark Belletini...",Unitarian,7,6,4,4.100034
1376,"2011-11-20 Thanksgiving Bread of Paradise, Rev...",Unitarian,7,6,4,4.100107
358,journal.1.12.02.txt,MehrBaba,8,6,4,4.099899


In [21]:
#
resultsDF.describe()

Unnamed: 0,rank,SVMMAN,SVM1,SVR1
count,494.0,494.0,494.0,494.0
mean,4.817814,4.706478,4.0,4.099875
std,2.362487,1.675766,0.0,0.000309
min,1.0,1.0,4.0,4.098838
25%,3.0,4.0,4.0,4.099682
50%,4.0,6.0,4.0,4.099899
75%,7.0,6.0,4.0,4.100079
max,8.0,7.0,4.0,4.100717


## Look at accuracy in the Capstone test (within 1 of correct rank)

In [22]:
svmAccuracy=float(len([i for i in range(len(yPredsvmCapMAN)) if abs(test_y.iloc[i]-yPredsvmCapMAN[i])<1])/float(len(yPredsvmCapMAN)))
print(svmAccuracy)

0.5060728744939271
