In [1]:
import pandas as pd
import numpy as np
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import os
os.chdir("C:\\Users\\subhankar.pattnaik\\Downloads\\Personal\AI\\Code\\ML-AI-DS-master")

In [111]:
#Set Random seed
np.random.seed(500)
# Add the Data using pandas
df = pd.read_csv('data/latest_ticket_data.csv')


In [113]:
# Step - 1: Data Pre-processing - This will help in getting better results through the classification algorithms

# Step - 1a : Remove blank rows if any.
df['Description'].dropna(inplace=True)

# Step - 1b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
df['Description'] = [entry.lower() for entry in df['Description']]

# Step - 1c : Tokenization : In this each entry in the corpus will be broken into set of words
df['Description'] = [word_tokenize(entry) for entry in df['Description']]


In [None]:
df['desc_final'] = df['Description'].apply(lambda x: cleaning(x))

In [7]:
df['Description']

0       [hi, since, recruiter, lead, permission, appro...
1       [re, expire, days, hi, ask, help, update, pass...
3       [please, dear, looks, blacklisted, receiving, ...
4       [dear, modules, report, report, cost, thank, m...
5       [please, action, reports, dear, way, help, clo...
6                             [hello, please, fill, date]
7       [secondary, hi, please, provide, secondary, le...
8       [copy, hi, receive, copy, behavior, expect, re...
9       [invitation, strategy, workshop, february, man...
10      [change, owner, contract, hello, please, log, ...
11      [reports, amended, hello, please, call, best, ...
12      [user, accessibility, hello, please, log, assi...
13      [re, annual, leave, hello, please, help, absen...
14      [change, role, hello, dear, please, change, pm...
15      [hello, received, workstation, granted, ad, pl...
16      [undelivered, returned, hi, getting, these, un...
17      [mailbox, almost, hi, mailbox, almost, kind, t...
18      [oppor

In [24]:
# Step - 1d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index,entry in enumerate(df['Description']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'desc_final'] = str(Final_words)




In [25]:
print(df['desc_final'].head())

0    ['hi', 'since', 'recruiter', 'lead', 'permissi...
1    ['expire', 'day', 'hi', 'ask', 'help', 'update...
2    ['verification', 'warn', 'hi', 'get', 'attach'...
3    ['please', 'dear', 'look', 'blacklist', 'recei...
4    ['dear', 'module', 'report', 'report', 'cost',...
Name: desc_final, dtype: object


In [26]:
# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['desc_final'],df['Category'],test_size=0.3)

# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['desc_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


In [35]:
# Step - 5: Now we can run different algorithms to classify out data check for accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  72.66666666666667


In [36]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


SVM Accuracy Score ->  75.0


In [124]:
#Test Data
df_test = pd.read_csv('data/test.csv')

In [125]:
def cleaning(df):
    df['Description'].dropna(inplace=True)
    df['Description'] = [entry.lower() for entry in df['Description']]
    df['Description'] = [word_tokenize(entry) for entry in df['Description']]
    for index,entry in enumerate(df['Description']):
        Final_words = []
        word_Lemmatized = WordNetLemmatizer()
        for word, tag in pos_tag(entry):
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        df.loc[index,'desc_final'] = str(Final_words)
    return df

def vectorize(df):
    return Tfidf_vect.transform(df['desc_final'])

In [None]:
def cleaning(text):
    words = text.spilt()
    words = [entry.lower() for entry in words]
    

In [126]:
df_test = cleaning(df_test)

In [129]:
Test_TfIdf = vectorize(df_test)

In [131]:
# Naive Bayes
Naive.predict(Test_TfIdf)

array([3])

In [132]:
# SVM
SVM.predict(Test_TfIdf)

array([3])