In [3]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib as plt
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load the raw case-study data set from disk
dta = pd.read_csv("D:/Documents/Data/case_study_data_copy.csv")

# Keep only the label column and the free-text column
corps = dta[['product_group', 'text']]

# X holds the raw texts, y holds the product-group labels
X = corps['text']
y = corps['product_group']

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

# Transform character labels to numbers and back again
# Drop rows without a label and take an explicit copy, so that adding the
# 'category_id' column below writes to an independent frame instead of a
# slice of `corps` (assigning to such a slice triggers pandas'
# SettingWithCopyWarning).
df = corps[pd.notnull(corps['product_group'])].copy()
df.columns = ['product_group', 'text']
# factorize() assigns each distinct label an integer code; [0] is the code array
df['category_id'] = df['product_group'].factorize()[0]
# One row per distinct (label, id) pair, ordered by id
category_id_df = df[['product_group', 'category_id']].drop_duplicates().sort_values('category_id')
# Lookup tables in both directions: label -> id and id -> label
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product_group']].values)
df.head()

# Use TfidfVectorizer to tokenize the text, remove stopwords, convert to lowercase, build vocabulary, etc.
from sklearn.feature_extraction.text import TfidfVectorizer
# Annotated reference of the constructor options. NOTE: the instance created by
# this call is not assigned to a name, so it is discarded immediately.
TfidfVectorizer(
    input='content',          #how the input is supplied: 'content' (strings/bytes), 'filename', or 'file'
    encoding='utf-8',         #encoding used to decode bytes input
    decode_error='strict',    #means a UnicodeDecodeError will be raised (other values are 'ignore' and 'replace')
    strip_accents='ascii',    #removes accents and performs other character normalization (ascii is the fastest)
    lowercase=True,           #converts all text to lower case
    tokenizer=word_tokenize,  #default value is None (only applies if analyzer == 'word')
    stop_words='english',     #default value is None (only applies if analyzer == 'word')
    token_pattern=r'\b\w+\b', #regex denoting what constitutes a "token"; ignored when a tokenizer is given
    ngram_range=(1, 3),       #will yield unigrams, bigrams, and trigrams
    analyzer='word',          #feature makeup {'word', 'char', 'char_wb'} or callable
    max_df=1.0,               #ignore terms that have a document frequency higher than this threshold
    min_df=1,                 #ignore terms that have a document frequency lower than this threshold
    max_features=20,          #build a vocabulary of at most N terms, or None for no limit
    binary=False,             #if True, all non zero term counts are set to 1
    dtype=np.float64,         #type of the matrix returned by fit_transform() or transform(); tf-idf weights are floats
    norm='l2',                #each output row will have unit norm, either: 'l2', 'l1', or None
    use_idf=True,             #enable inverse-document-frequency reweighting
    smooth_idf=True,          #smooth idf weights by adding one to document frequencies to prevent division by zero
    sublinear_tf=False,       #apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
)

# Vectorizer that is actually fitted: log-scaled term frequencies, L2-normalised
# rows, 1- to 3-grams, English stop words removed, terms in fewer than 5 documents dropped.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)
# Learn the vocabulary and IDF weights on the training texts and vectorize them
features = tfidf.fit_transform(X_train)
# Training labels, row-aligned with `features`
labels = y_train

# Extract N-grams: unigrams, bigrams, and trigrams
# For every product group, score each tf-idf feature with a chi-squared test
# against a one-vs-rest target and print the N highest-scoring n-grams of each size.
from sklearn.feature_selection import chi2
import numpy as np
N = 2  # how many top-scoring n-grams to print per category and n-gram size

# The vocabulary does not depend on the category, so build the name array once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

# id_to_category maps category_id -> product_group, so items() yields (id, name).
for category_id, product_group in sorted(id_to_category.items()):
    # chi2 returns (scores, p-values); the target is True for rows of this product group
    features_chi2 = chi2(features, labels == product_group)
    # argsort is ascending, so the most correlated features end up last
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    # Split the ranked terms by n-gram size (number of space-separated tokens)
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
    print("# '{}':".format(product_group))
    print(".   Most correlated unigrams:\n.     {}".format('\n.     '.join(unigrams[-N:])))
    print(".   Most correlated bigrams:\n.     {}".format('\n.     '.join(bigrams[-N:])))
    print(".   Most correlated trigrams:\n.     {}".format('\n.     '.join(trigrams[-N:])))

# 'bank_service':
.   Most correlated unigrams:
.     deposit
.     overdraft
.   Most correlated bigrams:
.     overdraft fees
.     checking account
.   Most correlated trigrams:
.     charged overdraft fees
.     opened checking account
# 'credit_card':
.   Most correlated unigrams:
.     express
.     card
.   Most correlated bigrams:
.     american express
.     credit card
.   Most correlated trigrams:
.     credit card account
.     credit card company
# 'credit_reporting':
.   Most correlated unigrams:
.     experian
.     equifax
.   Most correlated bigrams:
.     credit file
.     credit report
.   Most correlated trigrams:
.     mistakes appear report
.     appear report understanding
# 'debt_collection':
.   Most correlated unigrams:
.     collection
.     debt
.   Most correlated bigrams:
.     collect debt
.     collection agency
.   Most correlated trigrams:
.     attempting collect debt
.     trying collect debt
# 'loan':
.   Most correlated unigrams:
.     loans
.     