In [None]:
# Install note: `pip install pymssql` (used by the SQL-loading cell further down).
import pymssql
import pandas as pd

# Remove the column cap so displayed DataFrames show every column.
pd.options.display.max_columns = None

In [2]:
# Data-source switch: True reads the cached 'df_sku.csv'; False queries the
# database and rewrites that CSV.
load_csv = True

In [3]:
# Cached path: read the SKU table from the local CSV snapshot.
if load_csv:
    df_sku = pd.read_csv('df_sku.csv')

In [4]:
if not load_csv:
    # Live path: query the database and refresh the local CSV cache.
    # Local import: getpass is needed only on this branch.
    from getpass import getpass

    sqluser = input('Enter SQL User')
    # getpass keeps the password from being echoed into the notebook output.
    sqlpass = getpass(f'Enter Password for {sqluser}')

    # Triple-quoted so no line-continuation backslashes are needed (a backslash
    # followed by trailing whitespace does not continue the line and breaks the
    # string literal).
    # NOTE(review): `::int` / `::bigint` casts and `limit` look like PostgreSQL
    # syntax, while pymssql connects to SQL Server -- confirm the dialect
    # (T-SQL would use CAST(... AS ...) and TOP).
    # NOTE(review): the selected columns (category, itemdesc, ...) do not include
    # the 'LongDescription' / 'Category' columns the modelling cells read from
    # df_sku -- confirm this query is the intended source.
    stmt = """
        SELECT
            site_sk
            ,datetran_sk
            ,time_sk
            ,salesevent_sk as transaction_id
            ,master
            ,parent
            ,category
            ,itemcat::int as itemcat
            ,plu::bigint as plu
            ,itemdesc
        FROM gate.fact_trandetail td
        inner join gate.dim_tranitem ti on td.plu_sk = ti.plu_sk
        where ti.category_sk != -2 and master in ('Merchandise','QSR')
        limit 100000;
    """

    # Instantiate a Python DB connection object (same form as the
    # psycopg2 / python-mysql drivers). The port number can be looked up
    # inside SQL Server.
    conn = pymssql.connect(server="192.168.254.13", user=sqluser, password=sqlpass, port=1433)
    try:
        # Execute the query and load the result set into a DataFrame.
        df_sku = pd.read_sql(stmt, conn)
    finally:
        # Release the connection even if the query fails.
        conn.close()

    # index=False keeps the row index out of the cache; otherwise every
    # save/load round trip adds another 'Unnamed: 0' column.
    df_sku.to_csv('df_sku.csv', index=False)

In [5]:
# Preview the first rows of the loaded SKU table.
df_sku.head()

Unnamed: 0.1,Unnamed: 0,SKU_ID,UOM_ID,SalesCategoryID,CategoryID,NACSCategoryID,Category,Description,LongDescription,ShortDescription,POSDescription
0,0,1,119,40,40,140,General Merchandise,General Merchandise,1000 DRINKING GAMES IN A BOX,1000 DRINK GAME,1000 DRINKING GAMES
1,1,2,119,52,52,91,Salty Snacks,Salty Snacks Inv,2/$1 RED HOT SAUSAGE,MEAT SNACK RH,2/$1 RED HOT SAUSAGE
2,2,3,119,52,52,91,Salty Snacks,Salty Snacks Inv,20 COUNT VARIETY PACK,VARIETY 20CT,20 COUNT VARIETY PACK
3,3,4,119,52,52,91,Salty Snacks,Salty Snacks Inv,26 CT 100 CALORIE VARIETY PACK,VARIETY 100CAL,26CT 100 CALORIE VARIETY
4,4,5,119,52,52,91,Salty Snacks,Salty Snacks Inv,3 CHEESE MEDLEY NIBBLERS,PRETZEL 3 CHEESE,3 CHEESE MEDLEY NIBBLERS


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [7]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score computed from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_sku['LongDescription'], df_sku['Category'],
    train_size=0.8, random_state=42)

<h1>Naive Bayes Classifier</h1>

In [None]:
# Word counts -> TF-IDF weighting -> multinomial Naive Bayes, fitted on the
# training split in one expression (Pipeline.fit returns the pipeline itself).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

# Classify the held-out descriptions and print the per-category report.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

<h1>K-Nearest Neighbors</h1>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Same text features as before, with a k-nearest-neighbours classifier on top.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]).fit(X_train, y_train)

# Classify the held-out descriptions and print the per-category report.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

<h1>Support Vector Machine (SVM)</h1>

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Same text features as before, with a linear support-vector classifier on top.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
]).fit(X_train, y_train)

# Classify the held-out descriptions and print the per-category report.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

<h1>Decision Tree</h1>

In [None]:
from sklearn import tree

# Word counts -> TF-IDF weighting -> decision tree. The tree is seeded so that
# repeated runs on the same split give the same model and report.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', tree.DecisionTreeClassifier(random_state=42)),
                     ])

text_clf.fit(X_train, y_train)

# Classify the held-out descriptions and print the per-category report.
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

<h1>Random Forest</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Word counts -> TF-IDF weighting -> 100-tree random forest. The forest is
# seeded so that repeated runs on the same split give the same model and report.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
                     ])

text_clf.fit(X_train, y_train)

# Classify the held-out descriptions and print the per-category report.
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

<h1>Deep Neural Networks</h1>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import  Dropout, Dense
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn import metrics

In [8]:
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Vectorize train and test text with one shared TF-IDF vocabulary.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so the vocabulary and IDF weights come from the training
    documents alone. The number of resulting features is printed.

    Args:
        X_train: iterable of training documents (strings).
        X_test: iterable of test documents (strings).
        MAX_NB_WORDS: upper bound on the vocabulary size, forwarded to
            ``TfidfVectorizer(max_features=...)``.

    Returns:
        Tuple ``(X_train, X_test)`` of dense 2-D arrays, one row per document
        and one column per vocabulary term.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    train_matrix = vectorizer_x.fit_transform(X_train).toarray()
    test_matrix = vectorizer_x.transform(X_test).toarray()
    # Read the feature count straight off the array; wrapping it in np.array()
    # again would copy the whole dense matrix just to look at its shape.
    print("tf-idf with", str(train_matrix.shape[1]), "features")
    return (train_matrix, test_matrix)

In [None]:
#prepare target
def prepare_targets_le(y_train, y_test):
    """Integer-encode class labels with a LabelEncoder fitted on ``y_train``.

    Args:
        y_train: training labels (Series, list or array; 1-D or a single column).
        y_test: test labels in the same form. Labels that never occur in
            ``y_train`` make ``LabelEncoder.transform`` raise ``ValueError``.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` of 1-D integer arrays.
    """
    # LabelEncoder works on 1-D label vectors. A column-shaped (n, 1) input only
    # triggers a DataConversionWarning before being flattened, so flatten here
    # and accept any array-like rather than pandas objects only.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [None]:
#prepare target
def prepare_targets_oe(y_train, y_test):
    """Ordinal-encode targets with an OrdinalEncoder fitted on ``y_train``.

    Args:
        y_train: training targets; a 1-D Series/array of labels or an already
            2-D (n_samples, n_columns) structure.
        y_test: test targets in the same form.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` of 2-D arrays as produced by
        ``OrdinalEncoder.transform``.
    """
    def _as_column(values):
        # OrdinalEncoder rejects 1-D input, so turn a plain label vector into
        # a single column; inputs that are already 2-D pass through unchanged.
        arr = np.asarray(values)
        return arr.reshape(-1, 1) if arr.ndim == 1 else arr

    oe = OrdinalEncoder()
    y_train_enc = oe.fit_transform(_as_column(y_train))
    y_test_enc = oe.transform(_as_column(y_test))
    return y_train_enc, y_test_enc

In [None]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, node=2500, nLayers=1):
    """
    Build_Model_DNN_Text(shape, nClasses, dropout, node, nLayers)
    Build and compile a fully connected network for text classification.

    Architecture: one Dense(node, relu) layer on the input, then `nLayers`
    further Dense(node, relu) layers, each Dense followed by Dropout, and a
    softmax output layer with `nClasses` units.

    shape    -- size of the input feature space
    nClasses -- number of classes (width of the softmax output)
    dropout  -- dropout rate applied after every hidden Dense layer
    node     -- units per hidden layer (default 2500, the former fixed value)
    nLayers  -- hidden layers added after the first one (default 1, the
                former fixed value)

    Returns the compiled Sequential model. It is compiled with
    'sparse_categorical_crossentropy', so targets are integer class indices.
    """
    model = Sequential()

    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(nLayers):
        # Only the first layer needs the input size; later layers take their
        # input shape from the layer before them.
        model.add(Dense(node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Dense TF-IDF features for the network.
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)
# OrdinalEncoder only accepts 2-D input, so pass the labels as single columns.
y_train_enc, y_test_enc = prepare_targets_oe(np.asarray(y_train).reshape(-1, 1),
                                             np.asarray(y_test).reshape(-1, 1))

# Size the softmax layer from the data instead of a hard-coded 29.
n_classes = df_sku['Category'].nunique()
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], n_classes)
model_DNN.fit(X_train_tfidf, y_train_enc,
                              validation_data=(X_test_tfidf, y_test_enc),
                              epochs=10,
                              batch_size=128,
                              verbose=2)

# predict() returns one row of class probabilities per sample; `predicted`
# keeps that matrix, and the report is built from the arg-max class index.
predicted = model_DNN.predict(X_test_tfidf)
predicted_classes = np.argmax(predicted, axis=1)

# Compare in the encoded label space: the raw y_test holds category strings,
# which cannot be matched against integer class indices.
print(metrics.classification_report(np.ravel(y_test_enc).astype(int), predicted_classes))

In [None]:
# Number of distinct categories, i.e. the class count passed to the DNN builder.
df_sku['Category'].nunique()


In [None]:
import numpy as np
#y_train = y_train.values.reshape(-1,1)
#y_test = y_test.values.reshape(-1,1)

In [None]:
# Rebuild the dense TF-IDF features and the ordinal-encoded targets.
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)
# OrdinalEncoder only accepts 2-D input, so pass the labels as single columns.
y_train_enc, y_test_enc = prepare_targets_oe(np.asarray(y_train).reshape(-1, 1),
                                             np.asarray(y_test).reshape(-1, 1))

In [None]:
# Cast the encoded test targets to float.
y_test_enc = y_test_enc.astype(float)

In [None]:
# Collapse the encoded test targets into a 1-D vector.
y_test_enc = y_test_enc.flatten()

In [None]:
# Sanity check: shape of the encoded test targets.
y_test_enc.shape

In [None]:
# Sanity check: shape of the model output held in `predicted`.
predicted.shape

In [None]:
#[i for i in y_test_enc ]

In [None]:
# Per row, take the index of the largest value in `predicted`.
# NOTE(review): this uses whatever `predicted` an earlier cell left behind.
pred = np.argmax(predicted,axis=1)

In [None]:
# Score the test set with the trained network.
predicted = model_DNN.predict(X_test_tfidf)
# Derive the class indices from the predictions just computed, so the report
# can never be built from a stale `predicted` left over from another cell.
pred = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test_enc, pred))

In [None]:
def prepare_inputs(X_train, X_test):
    """Ordinal-encode input features with an OrdinalEncoder fitted on ``X_train``.

    Args:
        X_train: training inputs; a 1-D Series/array (treated as one feature
            column) or a 2-D (n_samples, n_features) structure.
        X_test: test inputs in the same form.

    Returns:
        Tuple ``(X_train_enc, X_test_enc)`` of 2-D encoded arrays.

    NOTE(review): with the encoder's default settings, values in ``X_test``
    that were not seen in ``X_train`` are expected to raise during
    ``transform`` -- confirm this suits free-text inputs.
    """
    def _as_2d(values):
        # OrdinalEncoder rejects 1-D input, so turn a single feature vector
        # into one column; inputs that are already 2-D pass through unchanged.
        arr = np.asarray(values)
        return arr.reshape(-1, 1) if arr.ndim == 1 else arr

    oe = OrdinalEncoder()
    X_train_enc = oe.fit_transform(_as_2d(X_train))
    X_test_enc = oe.transform(_as_2d(X_test))
    return X_train_enc, X_test_enc

In [19]:
from sklearn.preprocessing import LabelEncoder
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode both target sets using the classes found in ``y_train``.

    Returns the pair ``(encoded_train, encoded_test)``.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

In [None]:
# prepare input data
# Prepare input data: ordinal-encode the features.
# NOTE(review): X_train/X_test come from the 'LongDescription' text column, so
# this ordinal-encodes whole description strings -- confirm that is intended.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# Prepare output data: label-encode the targets.
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [None]:
# define the model
# The targets are label-encoded integers spanning many categories, so the
# output layer is a softmax over all classes trained with sparse categorical
# cross-entropy. A single sigmoid unit with binary cross-entropy can only
# separate two classes.
n_classes = int(np.max(y_train_enc)) + 1  # codes run 0..k-1, so max + 1 = k
# define the model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(n_classes, activation='softmax'))
# compile the keras model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

In [9]:
from tpot import TPOTClassifier

In [10]:
# Default-configured TPOT optimizer.
# NOTE(review): the next cell rebinds `pipeline_optimizer` before it is used,
# so this instance appears to be unused.
pipeline_optimizer = TPOTClassifier()

In [11]:
# TPOT search settings: 5 generations, population of 20, 5-fold cross-validation,
# a fixed random_state, and verbosity=2 for progress output.
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)

In [12]:
# Fresh 80/20 split of the raw text and labels for the TPOT run. The fixed
# seed makes the split, and the score computed from it, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_sku['LongDescription'], df_sku['Category'],
    train_size=0.8, random_state=42)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Replace the text with dense TF-IDF matrices; from here on X_train/X_test
# hold arrays rather than description strings.
X_train, X_test = TFIDF(X_train, X_test,MAX_NB_WORDS=75000)
X_train

tf-idf with 5114 features


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Swap the category labels for LabelEncoder integer codes; this rebinds
# y_train/y_test.
y_train, y_test = prepare_targets(y_train, y_test)

In [14]:
# Run the TPOT pipeline search on the training data.
pipeline_optimizer.fit(X_train, y_train)



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.8899660332111144
Generation 2 - Current best internal CV score: 0.8899660332111144
Generation 3 - Current best internal CV score: 0.9005221863662476
Generation 4 - Current best internal CV score: 0.9005221863662476
Generation 5 - Current best internal CV score: 0.9005221863662476

Best pipeline: LinearSVC(input_matrix, C=5.0, dual=True, loss=hinge, penalty=l2, tol=1e-05)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=42, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [21]:
# Score the fitted TPOT optimizer on the held-out split.
# NOTE(review): the recorded 0.0 looks like an execution-order artifact. The
# fit cell ran as In [14], before the labels were integer-encoded in In [20],
# so the predictions and y_test were probably in different label spaces when
# this ran. Re-run top to bottom and confirm.
print(pipeline_optimizer.score(X_test, y_test))

0.0


In [16]:
# Export the fitted optimizer's pipeline to 'tpot_exported_pipeline.py'.
pipeline_optimizer.export('tpot_exported_pipeline.py')