In [9]:
import pandas as pd
import numpy as np
import json

#for text pre-processing
import re, string
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer



nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets


#Functions to clean training data
lemmatizer = WordNetLemmatizer()


def detokenize(command):
    #Merge string back together after splitting
    return TreebankWordDetokenizer().detokenize(command)

def clean_command(command):
    """Cleans the incoming command by removing email and punctuation and sets to lowercase"""
    
    #Remove email and join string back togehter 
    clean_string = [word for word in command.split() if '@' not in word]
    clean_string = detokenize(clean_string)

    #Remove punctuation from string
    for char in string.punctuation:
        if char in clean_string:
            clean_string = clean_string.replace(char, '')

    return clean_string.lower()


def lemmatize(command):
    tokens = command.split()
    tagged_tokens = pos_tag(tokens)
    
    lemmatized_tokens = []
    
    #Must lemmatize nouns and verb seperatley because nltk cant do both at once
    for token, pos in tagged_tokens:
        if pos.startswith('N'): 
            lemma = lemmatizer.lemmatize(token, pos='n')
            
        elif pos.startswith('V'): 
            lemma = lemmatizer.lemmatize(token, pos='v')
            
        else:
            lemma = token
            
        lemmatized_tokens.append(lemma)
 
    return detokenize(lemmatized_tokens)


def command_prepper(command):
    """Handles all function neccesary to prepping a raw command
        to be converted into a vector"""
    
    cleaned_command = clean_command(command)
    lemmatized_command = lemmatize(cleaned_command)
    
    return lemmatized_command


#command_prepper("Hey... howare yall.,.!! 123@mgail.com")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sbuca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sbuca\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sbuca\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [32]:
data = pd.read_excel("model_data.xlsx", sheet_name="data")

#Preprocessor words to numbers
data['clean_text'] = data['input'].apply(lambda x: command_prepper(x))
data['clean_text_tok']=[nltk.word_tokenize(i) for i in data['clean_text']]


#Initiate model
model = Word2Vec(data['clean_text_tok'],min_count=1) 
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))


#Split Data
X_train, X_val, y_train, y_val = train_test_split(data["clean_text"],
                                                  data["output"],
                                                  test_size=0.2,
                                                  shuffle=True)
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  #for word2vec
X_val_tok= [nltk.word_tokenize(i) for i in X_val]      #for word2vec


#Token stuff
#TF-IDF
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 

X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)


#Word2vec
# Fit and transform
modelw = MeanEmbeddingVectorizer(w2v)
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_val_tok)

In [34]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
#It's a probabilistic classifier that makes use of Bayes' Theorem, a rule that uses probability to make predictions based on prior knowledge of conditions that might be related. This algorithm is the most suitable for such large dataset as it considers each feature independently, calculates the probability of each category, and then predicts the category with the highest probability.

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  #model

#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_val_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 

print(classification_report(y_val,y_predict))
print('Confusion Matrix:',confusion_matrix(y_val, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)  

              precision    recall  f1-score   support

           1       1.00      0.33      0.50         3
           2       0.00      0.00      0.00         0

    accuracy                           0.33         3
   macro avg       0.50      0.17      0.25         3
weighted avg       1.00      0.33      0.50         3

Confusion Matrix: [[1 2]
 [0 0]]
AUC: nan


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
#Testing it on new dataset with the best model
df_test=pd.read_excel('forward_testing.xlsx')  #reading the data

df_test['clean_text'] = df_test['text'].apply(lambda x: command_prepper(x)) #preprocess the data
X_test=df_test['clean_text'] 
X_vector=tfidf_vectorizer.transform(X_test) #converting X_test to vector
y_predict = lr_tfidf.predict(X_vector)      #use the trained model on X_vector
y_prob = lr_tfidf.predict_proba(X_vector)[:,1]
df_test['predict_prob']= y_prob
df_test['target']= y_predict
print(df_test.head())
final=df_test[['id','target']].reset_index(drop=True)
final.to_csv('submission.csv')

   id                             text               clean_text  predict_prob  \
0   1  send and email to 123@gmail.com        send and email to      0.247498   
1   2          order dunkin from boost  order dunkin from boost      0.089632   

   target  
0       1  
1       3  


In [50]:
df = pd.read_excel('forward_testing.xlsx')

In [51]:
df

Unnamed: 0,id,text
0,1,send and email to 123@gmail.com
1,2,order dunkin from boost


In [49]:
df_test

Unnamed: 0,id,text,clean_text,predict_prob,target
0,1,send and email to 123@gmail.com,send and email to,0.247498,1
