In [1]:
#Standard packages
import pandas as pd
import numpy as np

# Scikit Learn
from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.model_selection import train_test_split, KFold

#Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
# Allow plots in Notebook
%matplotlib inline

In [2]:
corps = pd.read_csv("D:/Documents/Data/case_study_data_tiny.csv")
#df = corps[['complaint_id','product_group','text']]
df = corps[['product_group','text']]
X = df['text']
y = df['product_group']
X.head(5)

0    Two private loans have with them very discharg...
1    attach a letter dated explaining dropped the v...
2    Please see attached Complaint Number against c...
3    feel as though 've been subjected to predatory...
4    a veteran living on social security and cosign...
Name: text, dtype: object

In [3]:
X= X.str.lower()
X.head(10)

0    two private loans have with them very discharg...
1    attach a letter dated explaining dropped the v...
2    please see attached complaint number against c...
3    feel as though 've been subjected to predatory...
4    a veteran living on social security and cosign...
5    problem this company has been transfered a deb...
6    have filed a complaint before case was hit wit...
7    this is about the three major credit agencies ...
8    the office of the attorney general office is r...
9    to whom it may concern am in need of some help...
Name: text, dtype: object

In [4]:
import pandas as pd
import nltk
X = X.apply(word_tokenize)

In [5]:
X.head(10)

0    [two, private, loans, have, with, them, very, ...
1    [attach, a, letter, dated, explaining, dropped...
2    [please, see, attached, complaint, number, aga...
3    [feel, as, though, 've, been, subjected, to, p...
4    [a, veteran, living, on, social, security, and...
5    [problem, this, company, has, been, transfered...
6    [have, filed, a, complaint, before, case, was,...
7    [this, is, about, the, three, major, credit, a...
8    [the, office, of, the, attorney, general, offi...
9    [to, whom, it, may, concern, am, in, need, of,...
Name: text, dtype: object

In [6]:
from nltk.corpus import stopwords
stopwords_en = set(stopwords.words('english')) # Set checking is faster in Python than list.
print(stopwords_en)

{'isn', 'as', 'do', 'd', 're', 'yourself', 'doesn', 'on', 'am', "haven't", "you've", 'all', 'an', 'this', 'where', 'wouldn', 'which', "aren't", 'from', 'below', 'why', "that'll", 'nor', 'myself', 'have', 'haven', 'what', 'shan', 'wasn', 'couldn', 'or', 'the', 'again', "mightn't", 's', 'same', 'o', 'being', 'm', 'off', 'herself', 'down', 'into', 'both', 'themselves', 'should', 'here', "hasn't", 'yours', 'them', 'll', 'does', 'under', "didn't", 'was', 'how', 'not', 'up', 'ours', 'they', 'more', 'doing', 'but', 'at', 'y', "hadn't", 'himself', 'aren', 'him', 'won', "you're", "isn't", 'that', 'very', "she's", 'their', 'there', 'you', 'her', 'now', 'ain', 'ma', 'during', 'once', 'than', "couldn't", 'these', 'he', 'before', 'ourselves', 'are', 'such', 'will', 'too', "it's", 'my', "you'll", 'weren', 'only', "you'd", 'having', 'because', "shan't", 'who', 'is', 'while', "doesn't", 'be', 'mustn', 'over', 'own', 'when', 'through', 'don', 'and', 'his', 'our', 'it', 'shouldn', "needn't", 'has', "mus

In [7]:
from string import punctuation
# It's a string so we have to them into a set type
print('From string.punctuation:', type(punctuation), punctuation)

From string.punctuation: <class 'str'> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
stopwords_en_withpunct = stopwords_en.union(set(punctuation))
print(stopwords_en_withpunct)

{'isn', 'as', ';', 'do', 'd', 're', 'yourself', 'doesn', "'", '(', '-', 'on', 'am', "haven't", "you've", 'all', 'an', 'this', 'where', '*', '<', '\\', ']', 'wouldn', 'which', "aren't", 'from', 'below', 'why', '>', "that'll", '@', '[', 'nor', 'myself', 'have', 'haven', '=', 'what', 'shan', 'wasn', '"', 'couldn', 'or', 'the', 'again', "mightn't", 's', 'same', 'o', 'being', 'm', '_', 'off', 'herself', 'down', 'into', 'both', 'themselves', 'should', 'here', "hasn't", '+', 'yours', '~', 'them', 'll', 'does', 'under', "didn't", 'was', 'how', 'not', 'up', 'ours', 'they', 'more', 'doing', 'but', 'at', 'y', "hadn't", 'himself', 'aren', 'him', 'won', "you're", "isn't", ':', '!', 'that', '}', 'very', '$', "she's", 'their', 'there', '#', 'you', ')', 'her', 'now', 'ain', 'ma', 'during', '&', 'once', 'than', "couldn't", 'these', 'he', 'before', 'ourselves', 'are', 'such', 'will', 'too', "it's", 'my', "you'll", 'weren', 'only', "you'd", 'having', 'because', "shan't", 'who', 'is', 'while', "doesn't", 

In [9]:
stop = stopwords_en_withpunct
X = X.apply(lambda x: [item for item in x if item not in stop])
X.head(5)

0    [two, private, loans, discharged, chapter, ban...
1    [attach, letter, dated, explaining, dropped, v...
2    [please, see, attached, complaint, number, cop...
3    [feel, though, 've, subjected, predatory, loan...
4    [veteran, living, social, security, cosigned, ...
Name: text, dtype: object

In [10]:
from nltk.stem.snowball import SnowballStemmer
# Use English stemmer.
stemmer = SnowballStemmer("english")
X = X.apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
#df = df.drop(columns=['unstemmed']) # Get rid of the unstemmed column.
#df # Print dataframe.

In [11]:
X.head(10)

0    [two, privat, loan, discharg, chapter, bankrup...
1    [attach, letter, date, explain, drop, vehicl, ...
2    [pleas, see, attach, complaint, number, copi, ...
3    [feel, though, ve, subject, predatori, loan, r...
4    [veteran, live, social, secur, cosign, loan, d...
5    [problem, compani, transfer, debt, unabl, prov...
6    [file, complaint, case, hit, doubl, whammi, ye...
7    [three, major, credit, agenc, error, get, fix,...
8    [offic, attorney, general, offic, report, owe,...
9    [may, concern, need, help, privat, loan, ae, 1...
Name: text, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [13]:
len(X), len(y)

(49999, 49999)

In [14]:
# y = pd.DataFrame(y)
# X = pd.DataFrame(X)

In [42]:
input_list = X.tolist()

def find_bigrams(input_list):
    bigram_list = []
    for i in range(len(input_list)-1):
        bigram_list.append((input_list[i], input_list[i+1]))
    return bigram_list

mapped = find_bigrams(input_list)
print(mapped)

IndexError: list index out of range

In [40]:
len(input_list)+1

2

In [None]:
df = corps
df = df[pd.notnull(df['product_group'])]
df.columns = ['product_group','text']
df['category_id'] = df['product_group'].factorize()[0]
category_id_df = df[['product_group', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product_group']].values)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [None]:
X_train.head(5)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer(
    input='corps',            #the input text data or corpus
    encoding='utf-8',         #encoding is used to decode
    decode_error='strict',    #means a UnicodeDecodeError will be raised (other values are ignore and replace) 
    strip_accents='ascii',    #removes accents and perform other character normalization (ascii is the fastest)
    lowercase=True,           #converts all tect to lower case
    tokenizer=word_tokenize,  #default value is None (only applies if analyzer == 'word')
    stop_words='english',     #default value is None (only applies if analyzer == 'word')
    token_pattern=r'\b\w+\b', #string, denoting what constitutes a “token” (only used if analyzer == 'word')
    ngram_range=(1, 3),       #will yield unigrams, bigrams, and trigrams
    analyzer='word',          #feature makeup {‘string’, ‘word’, ‘char’, ‘char_wb’} or callable
    max_df=1.0,               #ignore terms that have a frequency higher than this threshold
    min_df=1,                 #ignore terms that have a frequency lower than this threshold
    max_features = 20,        #build a vocabulary size N or None
    binary=False,             #if True, all non zero counts are set to 1
    dtype= np.int64           #type of the matrix returned by fit_transform() or transform()
)

In [None]:
cnt_vec = CountVectorizer(X_train)

In [None]:
features = cnt_vec.fit_transform(pd.DataFrame(X_train)) #CORRECT TRANSFORM
#labels = pd.DataFrame(y_train) #CORRECT LABEL

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

TfidfTransformer(
    norm ='l2',          #each output row will have unit norm, either: 'l2', 'l1', or None
    use_idf=True,        #enable inverse-document-frequency reweighting
    smooth_idf=True,     #smooth idf weights by adding one to document frequencies to prevent divison by zero
    sublinear_tf=False,  #apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
)

In [None]:
#DO NOT CHANGE#
tfidf= TfidfVectorizer(sublinear_tf=True,  norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words='english')
#features = tfidf.fit_transform(X_train) #CORRECT TRANSFORM
#labels = y_train #CORRECT LABEL

In [None]:
features = tfidf.fit_transform(pd.DataFrame(X_train)) #CORRECT TRANSFORM
labels = pd.DataFrame(y_train) #CORRECT LABEL

In [None]:
features = tfidf.inverse_transform(X_train)

In [None]:
print(features)

In [None]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for product_group, category_id in sorted(id_to_category.items()):
  features_chi2 = chi2(features,labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
  print("# '{}':".format(category_id))
  print(".   Most correlated unigrams:\n.     {}".format('\n.     '.join(unigrams[-N:])))
  print(".   Most correlated bigrams:\n.     {}".format('\n.     '.join(bigrams[-N:])))
  print(".   Most correlated trigrams:\n.     {}".format('\n.     '.join(trigrams[-N:])))