# Text Classification

In [None]:
# Render the local image file inline as this cell's output.
from IPython.display import Image
Image(filename='text-analysis.jpg')

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
# Source CSV and the positional indices of the columns to load.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the file before running the notebook elsewhere.
CSV_PATH = '/home/pybokeh/Downloads/FnS_temp.csv'
COLUMN_POSITIONS = [4, 7, 8]

data_set = pd.read_csv(CSV_PATH, usecols=COLUMN_POSITIONS)

In [4]:
# Preview the first rows of the loaded columns.
data_set.head(15)

Unnamed: 0,SHRT_PART_NO,ORIG_CUST_CNTN_TXT,TARGET
0,1473,DURING LAST SERVICE INSPECTION TECH FOUND LEFT...,0
1,4712,CLIENT STATES THERE IS DAMAGE UNDER FRONT BUMP...,0
2,4712,CLIENT STATES THERE IS DAMAGE UNDER FRONT BUMP...,1
3,4715,CUSTOMER STATES REAR RIGHT BUMPER IS OUT OF AL...,1
4,4715,PER DPSM SUBLET LR QUARTER TO BODY SHOP,1
5,4814,CLIENT STATES RATTLE FROM PASSENGER B-PILLAR A...,0
6,4816,AUTH 119B - GENERAL MANAGER STATES PASSENGER S...,0
7,15400,GOODWILL CAR RAMPS FOR CLIENT PER ACURA,0
8,15400,LA PREP LOWER SEAT COVERS PER BEN ARIAS,0
9,15400,AUTH`D BY DPSM AND TECH LINE REF #TLC4000852. ...,1


### Create the two feature variables and the target variable

In [5]:
# Pull the two feature columns and the label column out as NumPy arrays.
partno, complaints, target = (
    data_set[column].values
    for column in ('SHRT_PART_NO', 'ORIG_CUST_CNTN_TXT', 'TARGET')
)

### ```partno``` is a categorical variable consisting of string values, so it needs to be label-encoded first

In [6]:
# Learn the part-number -> integer-code mapping, then apply it to the training column.
enc_label = LabelEncoder().fit(partno)
X_train_partno_labelencoded = enc_label.transform(partno)

In [7]:
# Inspect the integer codes assigned to the part numbers.
X_train_partno_labelencoded

array([  0,   1,   1,   2,   2,   3,   4,   5,   5,   5,   6,   7,   8,
         9,  10,  11,  12,  13,  14,  14,  15,  15,  15,  15,  15,  15,
        15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  16,
        17,  18,  18,  18,  18,  18,  19,  20,  20,  21,  22,  22,  22,
        22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,
        22,  22,  23,  24,  25,  25,  26,  27,  28,  28,  28,  28,  29,
        30,  31,  32,  33,  34,  35,  36,  36,  36,  36,  36,  36,  36,
        37,  38,  38,  39,  39,  40,  41,  41,  41,  41,  41,  42,  42,
        42,  42,  42,  43,  44,  45,  45,  46,  46,  46,  47,  47,  48,
        48,  48,  48,  48,  49,  50,  51,  52,  53,  54,  54,  54,  54,
        54,  54,  55,  56,  57,  57,  57,  58,  59,  60,  61,  61,  62,
        62,  62,  63,  64,  65,  66,  66,  67,  68,  69,  70,  71,  71,
        71,  71,  71,  72,  73,  74,  75,  76,  77,  78,  79,  79,  80,
        80,  81,  81,  82,  82,  83,  84,  85,  85,  85,  86,  8

In [8]:
# Check the shape of the label-encoded array (it is 1-D at this point).
X_train_partno_labelencoded.shape

(301,)

### Now we can perform one-hot encoding on the label encoded data

In [9]:
# OneHotEncoder expects a 2-D (n_samples, n_features) array, so present the 1-D
# label vector as a single feature column before fitting. Passing the 1-D array
# directly is rejected (or misread as a single sample) by scikit-learn.
enc_onehot = OneHotEncoder()
X_train_partno_onehot = enc_onehot.fit_transform(X_train_partno_labelencoded.reshape(-1, 1))



### ```OneHotEncoder``` requires a 2-D array of shape (n_samples, n_features), so reshape the 1-D label array into a single column:

In [10]:
# View the label codes as one feature column (n_samples, 1) and one-hot encode it.
partno_column = X_train_partno_labelencoded.reshape(-1, 1)
X_train_partno_onehot = enc_onehot.fit_transform(partno_column)

In [11]:
# Show the sparse-matrix summary of the one-hot encoded part numbers.
X_train_partno_onehot

<301x153 sparse matrix of type '<class 'numpy.float64'>'
	with 301 stored elements in Compressed Sparse Row format>

In [12]:
# Shape is (rows, one column per distinct part number).
X_train_partno_onehot.shape

(301, 153)

### ```complaints``` is a feature column containing string values, but it is not categorical because it is not limited to a known set of values.  So in this case, to my understanding, I would transform it with CountVectorizer and then TfidfTransformer, per this scikit-learn [example](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html) on working with text data

### ```Vectorize``` the complaints training data with ```CountVectorizer```

In [13]:
# Build the vocabulary from the training complaints and convert each complaint
# into a row of token counts. The fitted vectorizer is kept so the same
# vocabulary can be applied to new text later via count_vect.transform.
count_vect = CountVectorizer()
X_train_complaint_counts = count_vect.fit_transform(complaints)
# Shape is (number of complaints, vocabulary size).
X_train_complaint_counts.shape

(301, 1141)

### ```Transform``` the complaints training data with ```tfidf```

In [14]:
# Fit the TF-IDF weighting on the training counts, then re-weight those counts.
# The fitted transformer is reused later to weight new text consistently.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_complaint_counts)
X_train_complaint_tfidf = tfidf_transformer.transform(X_train_complaint_counts)
X_train_complaint_tfidf.shape

(301, 1141)

In [15]:
# Confirm the TF-IDF output is a SciPy sparse matrix (needed for sparse.hstack below).
type(X_train_complaint_tfidf)

scipy.sparse.csr.csr_matrix

### Combine/concatenate the partno feature with the complaint feature data along the column axis or horizontally

In [16]:
# Place the one-hot part-number columns to the left of the TF-IDF text columns.
train_feature_blocks = [X_train_partno_onehot, X_train_complaint_tfidf]
X_train_combined_tfidf = sparse.hstack(train_feature_blocks, format='csr')

In [17]:
# Show the sparse-matrix summary of the combined feature matrix.
X_train_combined_tfidf

<301x1294 sparse matrix of type '<class 'numpy.float64'>'
	with 4451 stored elements in Compressed Sparse Row format>

### Train a model (Multinomial Naive Bayes)

In [18]:
# Create the Multinomial Naive Bayes classifier and fit it on the combined features.
clf = MultinomialNB()
clf.fit(X_train_combined_tfidf, target);  # trailing ';' keeps the cell output empty

### ```Predict``` with test part # and complaint

In [19]:
# New, unseen example to classify. The part number is cast to the dtype of the
# classes the LabelEncoder learned, so the lookup compares like with like
# whether the training column was read as strings or as integers.
part_test = np.array(['19301']).astype(enc_label.classes_.dtype)
complaint_test = np.array(['CLIENT STATES THE CHECK EMISSIONS LIGHT IS ON'])

# Encode the part number with the encoders that were fitted on the training data
# (transform, NOT fit_transform). Re-fitting on the single test value would map
# every part number to label 0 / one-hot column 0, no matter which column that
# part occupied during training, and would also overwrite the fitted enc_label.
# NOTE: enc_label.transform raises ValueError for a part number that never
# appeared in the training data.
X_new_part_labelencoded = enc_label.transform(part_test)
X_new_part_onehot = enc_onehot.transform(X_new_part_labelencoded.reshape(-1, 1))

# Vectorize and weight the complaint text with the already-fitted text transformers.
X_new_complaint_counts = count_vect.transform(complaint_test)
X_new_complaint_tfidf = tfidf_transformer.transform(X_new_complaint_counts)

# Same column layout as the training matrix: one-hot part columns, then TF-IDF columns.
X_new_combined_tfidf = sparse.hstack((X_new_part_onehot, X_new_complaint_tfidf), format='csr')

predicted = clf.predict(X_new_combined_tfidf)

In [20]:
# Predicted TARGET class for the test example.
predicted

array([0])