In [1]:
#data preprocessing
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline

np.random.seed(42)


In [2]:
IMAGES_PATH=os.path.join(os.getcwd(),"static/")
if not os.path.exists(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [3]:
data_file=os.path.join(os.getcwd(),"data/shuffled-full-set-hashed.csv")
dataframe_all = pd.read_csv(data_file, sep=",")
dataframe_all.columns = ["document_label", "word_values"]

In [8]:
display(dataframe_all.head())

Unnamed: 0,document_label,word_values
0,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
1,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
2,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
3,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...
4,POLICY CHANGE,10e45001c2f2 6a01047db3ab 6a8e3499dab9 97b6014...


In [9]:
dataframe_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62203 entries, 0 to 62202
Data columns (total 2 columns):
document_label    62203 non-null object
word_values       62158 non-null object
dtypes: object(2)
memory usage: 972.0+ KB


In [10]:
dataframe_all.describe()

Unnamed: 0,document_label,word_values
count,62203,62158
unique,14,60175
top,BILL,bf064c332aa1 079935e500e5 1a4dd36c6de0 7efa289...
freq,18968,11


In [11]:
dataframe_all["document_label"].value_counts()

BILL                       18968
POLICY CHANGE              10627
CANCELLATION NOTICE         9731
BINDER                      8973
DELETION OF INTEREST        4825
REINSTATEMENT NOTICE        4368
DECLARATION                  968
CHANGE ENDORSEMENT           889
RETURNED CHECK               749
EXPIRATION NOTICE            734
NON-RENEWAL NOTICE           624
BILL BINDER                  289
INTENT TO CANCEL NOTICE      229
APPLICATION                  229
Name: document_label, dtype: int64

In [169]:
# Finding the number of rows that have Nans
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
print counter_nan

document_label     0
word_values       45
dtype: int64


In [4]:
# Dropping the documents which don't have word values
dataframe_all=dataframe_all.dropna(subset = ['word_values'])

In [171]:
dataframe_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62158 entries, 0 to 62202
Data columns (total 2 columns):
document_label    62158 non-null object
word_values       62158 non-null object
dtypes: object(2)
memory usage: 1.4+ MB


In [172]:
documents=dataframe_all.ix[:,-1].values
labels=dataframe_all.ix[:,0].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [62]:
import collections
tokens=[]
for doc in documents:
    tokens+= doc.split(" ")
vocab=collections.Counter(tokens)

In [86]:
vocab_size=len(vocab)
print "Vocabulary Size : ",vocab_size

Vocabulary Size :  1037929


In [173]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataframe_all['document_label']=label_encoder.fit_transform(dataframe_all['document_label'])


In [174]:
display(dataframe_all.head())

Unnamed: 0,document_label,word_values
0,13,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
1,1,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
2,1,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
3,1,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...
4,11,10e45001c2f2 6a01047db3ab 6a8e3499dab9 97b6014...


In [136]:
list(label_encoder.inverse_transform(range(14)))

['APPLICATION',
 'BILL',
 'BILL BINDER',
 'BINDER',
 'CANCELLATION NOTICE',
 'CHANGE ENDORSEMENT',
 'DECLARATION',
 'DELETION OF INTEREST',
 'EXPIRATION NOTICE',
 'INTENT TO CANCEL NOTICE',
 'NON-RENEWAL NOTICE',
 'POLICY CHANGE',
 'REINSTATEMENT NOTICE',
 'RETURNED CHECK']

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for train_index, test_index in split.split(dataframe_all['word_values'], dataframe_all['document_label']):
    strat_train_set = dataframe_all.loc[train_index]
    strat_test_set = dataframe_all.loc[test_index]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [176]:
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55942 entries, 61514 to 5227
Data columns (total 2 columns):
document_label    55904 non-null float64
word_values       55904 non-null object
dtypes: float64(1), object(1)
memory usage: 1.3+ MB


In [177]:
strat_test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6216 entries, 54360 to 35178
Data columns (total 2 columns):
document_label    6209 non-null float64
word_values       6209 non-null object
dtypes: float64(1), object(1)
memory usage: 145.7+ KB


In [178]:
display(strat_train_set.head())

Unnamed: 0,document_label,word_values
61514,12.0,36e7aa72ffe1 35341b0d3b35 5e8453ac3bc1 d8afd84...
43031,4.0,f7975b31c697 37b538ac615a 3114e0d45526 eeb86a6...
30575,4.0,9477a55e0012 586242498a88 eb51798a89e1 2aa073f...
34373,3.0,9a06698b17e0 f0666bdbc8a5 3581a9cce110 4dfbcc2...
512,8.0,f7ae6f8257da 8b8058d7133b 40794f5353e0 2ed97f4...


In [183]:
strat_train_set["document_label"].value_counts()

1.0     17009
11.0     9555
4.0      8745
3.0      8070
7.0      4329
12.0     3924
6.0       881
5.0       813
13.0      675
8.0       670
10.0      560
2.0       259
9.0       215
0.0       199
Name: document_label, dtype: int64

In [6]:
strat_train_set=strat_train_set.dropna(subset = ['word_values'])
strat_test_set=strat_test_set.dropna(subset = ['word_values'])

In [7]:
x_train, y_train=strat_train_set['word_values'], strat_train_set['document_label']
x_test, y_test=strat_test_set['word_values'], strat_test_set['document_label']

In [264]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF 

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=vocab_size)
tfidf = tfidf_vectorizer.fit_transform(x_train)
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_test = tfidf_vectorizer.transform(x_test)

In [194]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer().fit(tfidf)
x_train = tf_transformer.transform(tfidf)

In [265]:
tfidf_test


<6209x235496 sparse matrix of type '<type 'numpy.float64'>'
	with 1116376 stored elements in Compressed Sparse Row format>

In [266]:
tfidf

<55904x235496 sparse matrix of type '<type 'numpy.float64'>'
	with 10373395 stored elements in Compressed Sparse Row format>

In [None]:
tf_transformer_test = TfidfTransformer().fit(tfidf)
x_test = tf_transformer_test.transform(tfidf)

In [241]:
x_test

<55904x235496 sparse matrix of type '<type 'numpy.float64'>'
	with 10373395 stored elements in Compressed Sparse Row format>

In [247]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
label_encoder = LabelEncoder()
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

class LabelEncoderPipelineFriendly(LabelEncoder):
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelEncoderPipelineFriendly, self).fit(X)
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X)

class ArrayCaster(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return np.transpose(np.matrix(data))


num_pipeline = Pipeline([
        ('selector', DataFrameSelector('word_values')),
        ('vect', TfidfVectorizer(max_df=0.95, min_df=2, max_features=1037929)),
        ('tfidf', TfidfTransformer()),
    ])

# cat_pipeline = Pipeline([
#         ('selector', DataFrameSelector('document_label')),
#         ('encoder', LabelEncoderPipelineFriendly()),
#     ('caster', ArrayCaster()),
#     ]) 

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
#          ("cat_pipeline", cat_pipeline),
    ])


In [248]:
prep=full_pipeline.fit_transform(strat_train_set)

In [249]:
test_prep=full_pipeline.fit_transform(strat_test_set)

In [250]:
test_prep

<6209x174273 sparse matrix of type '<type 'numpy.float64'>'
	with 1215111 stored elements in Compressed Sparse Row format>

In [251]:
prep

<55904x960935 sparse matrix of type '<type 'numpy.float64'>'
	with 11098834 stored elements in Compressed Sparse Row format>

In [1]:
labels=['APPLICATION',
 'BILL',
 'BILL BINDER',
 'BINDER',
 'CANCELLATION NOTICE',
 'CHANGE ENDORSEMENT',
 'DECLARATION',
 'DELETION OF INTEREST',
 'EXPIRATION NOTICE',
 'INTENT TO CANCEL NOTICE',
 'NON-RENEWAL NOTICE',
 'POLICY CHANGE',
 'REINSTATEMENT NOTICE',
 'RETURNED CHECK']

label_mapper= {k:v for k,v in enumerate(labels)}

In [2]:
label_mapper

{0: 'APPLICATION',
 1: 'BILL',
 2: 'BILL BINDER',
 3: 'BINDER',
 4: 'CANCELLATION NOTICE',
 5: 'CHANGE ENDORSEMENT',
 6: 'DECLARATION',
 7: 'DELETION OF INTEREST',
 8: 'EXPIRATION NOTICE',
 9: 'INTENT TO CANCEL NOTICE',
 10: 'NON-RENEWAL NOTICE',
 11: 'POLICY CHANGE',
 12: 'REINSTATEMENT NOTICE',
 13: 'RETURNED CHECK'}