In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim.models.lsimodel import LsiModel
from gensim import corpora
from pprint import pprint
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raisaurabh04/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Location of the training CSV. NOTE: this is an absolute path on the original
# author's machine; change it before running the notebook anywhere else.
path_train = '/Users/raisaurabh04/OneDrive/GreyAtom/Practice Dataset/domain_classification_haptik_train.csv'

In [4]:
# Location of the test CSV. NOTE: this is an absolute path on the original
# author's machine; change it before running the notebook anywhere else.
path_test = '/Users/raisaurabh04/OneDrive/GreyAtom/Practice Dataset/domain_classification_haptik_test.csv'

In [5]:
# Read the training CSV into a dataframe
df = pd.read_csv(path_train)

In [6]:
# Preview the first rows of the freshly loaded training frame
df.head()

Unnamed: 0,message,food,recharge,support,reminders,travel,nearby,movies,casual,other
0,7am everyday,F,F,F,T,F,F,F,F,F
1,chocolate cake,T,F,F,F,F,F,F,F,F
2,closed mortice and tenon joint door dimentions,F,F,T,F,F,F,F,F,F
3,train eppo kelambum,F,F,F,F,T,F,F,F,F
4,yesterday i have cancelled the flight ticket,F,F,F,F,T,F,F,F,F





Action
- We need to do an informal reverse of 'one hot encoding'.

- Define a function label_race() which takes an argument row as input. This function should check every row for a category that is marked as T and return the name of the category.

- Create a new column category which contains the values obtained by applying the above written function to all the rows of the dataframe. (Hint: use df.apply())

- Drop the columns of food, recharge, support, reminders, nearby, movies, casual, other and travel from the dataframe df

In [7]:
# Creating a new column called category which has the column marked as true for that particular message. 
def label_race(row):
    """Return the name of the category flagged 'T' in a one-hot encoded row.

    Only the category flag columns are inspected. The free-text 'message'
    column is deliberately skipped so that a message whose text happens to be
    exactly 'T' is never mistaken for a category.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one dataframe row)
        Must be indexable by each category column name.

    Returns
    -------
    str or None
        The first category (in the fixed order below) whose value is 'T',
        or None when no category is flagged.
    """
    category_columns = ('food', 'recharge', 'support', 'reminders', 'travel',
                        'nearby', 'movies', 'casual', 'other')
    for col in category_columns:
        if row[col] == 'T':
            return col
    # No flag set: keep the implicit behaviour explicit
    return None

# Collapse the one-hot flags into a single label per message (row-wise apply)
df['category'] = df.apply(label_race, axis='columns')

# The individual flag columns are redundant once 'category' exists
df.drop(
    columns=[
        'food', 'recharge', 'support', 'reminders', 'travel',
        'nearby', 'movies', 'casual', 'other',
    ],
    inplace=True,
)

In [8]:
# Preview the first rows of df again to inspect the new 'category' column
df.head()

Unnamed: 0,message,category
0,7am everyday,reminders
1,chocolate cake,food
2,closed mortice and tenon joint door dimentions,support
3,train eppo kelambum,travel
4,yesterday i have cancelled the flight ticket,travel





<h3>Data Processing</h3>

As we have seen in the Text Analytics concepts we need to convert this textual data into vectors so that we can apply machine learning algorithms to them. In this task we will now employ a normal TF-IDF vectorizer to vectorize the message column and label encode the category column, essentially making it a classification problem.

Instructions
- Since working on large data(40000 rows) will be time consuming, we have sampled 'df' to contain 1000 values of each category(Code already given)

- Create a variable all_text and save all the values of column message converted in lower case into it

- Instantiate "TfidfVectorizer()" with argument stop_words="english"

- Fit the above instantiated Tfidf Vectorizer object on all_text

- Transform all_text using the above fitted model, convert it to an array and save the transformed array to a variable `X`

- Instantiate a LabelEncoder() object.

- fit the above instantiated model on category column of dataframe df

- transform the 'category' column using the above fitted model and save the same to variable 'y'

In [9]:
# Draw a reproducible sample of 1000 rows from every category
df = df.groupby('category').apply(lambda group: group.sample(n=1000, random_state=0))

# Lower-cased copy of every sampled message
all_text = df['message'].apply(lambda message: message.lower())

# Feature extractor and target encoder
tf_idf = TfidfVectorizer(stop_words='english')
label_encoder = LabelEncoder()

# Learn the TF-IDF vocabulary on the messages and vectorize them in one step
X = tf_idf.fit_transform(all_text)

# Learn the category -> integer mapping and encode the targets in one step
y = label_encoder.fit_transform(df['category'])





<h3>Classification implementation</h3>

In the previous tasks we cleaned the data and converted the textual data into numbers so that we can apply machine learning models. In this task we will apply Logistic Regression, Naive Bayes and Linear SVM models to the data.

Instructions

- Split the feature array X and target variable y into training and validation sets using train_test_split() method and save the variables in X_train,X_val,y_train and y_val. Pass the parameter test_size=0.3, random_state=42 for the same

- Instantiate a LogisticRegression(random_state=0) model and save it to variable log_reg

- Fit the log_reg model on X_train and y_train

- Predict the values using the above fitted model for X_val and store the same in y_pred

- Calculate the accuracy score, store the same in variable log_accuracy and print the same

- Repeat the above steps for MultinomialNB() model instantiated as nb, save the accuracy score in nb_accuracy

- Similarly, repeat the above steps for LinearSVC(random_state=0) model instantiated as lsvm, save the accuracy score in lsvm_accuracy

In [10]:
# Hold out 30% of the samples as a validation set (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Logistic regression: construct and fit on the training split in one expression
log_reg = LogisticRegression(random_state=0).fit(X_train, y_train)

# Validation predictions and mean accuracy on the validation split
y_pred = log_reg.predict(X_val)
log_accuracy = log_reg.score(X_val, y_val)



In [12]:
# Multinomial naive Bayes: construct and fit on the training split in one expression
nb = MultinomialNB().fit(X_train, y_train)

# Validation predictions and mean accuracy on the validation split
y_pred = nb.predict(X_val)
nb_accuracy = nb.score(X_val, y_val)

In [13]:
# Linear SVM: construct and fit on the training split in one expression
lsvm = LinearSVC(random_state=0).fit(X_train, y_train)

# Validation predictions and mean accuracy on the validation split
y_pred = lsvm.predict(X_val)
lsvm_accuracy = lsvm.score(X_val, y_val)

In [14]:
# Validation accuracies, in order: logistic regression, naive Bayes, linear SVM
log_accuracy, nb_accuracy, lsvm_accuracy

(0.7085185185185185, 0.7114814814814815, 0.7125925925925926)






<h3>Validation of test data</h3>
Let's now see how well our models run on test set.

Instructions
- The dataframe 'df_test' containing the test set has already been loaded and cleaned for you(Code given).

- Create a variable all_text and save all the values of column message of 'df_test' converted in lower case into it.

- Transform all_text using the previously fitted TF-IDF vectorizer `tf_idf`, convert it into an array and save the transformed array to a variable `X_test`

- Transform the category column of 'df_test' using the previously fitted label encoder `label_encoder` and save the result to variable y_test

- Predict the values using the previously fitted model of logistic regression log_reg for X_test and store the same in y_pred

- Calculate the accuracy score, store the same in variable log_accuracy_2 and print the same

- Repeat the last two steps for MultinomialNB() model fitted as nb, save the accuracy score in nb_accuracy_2

In [15]:
# Load the test set
df_test = pd.read_csv(path_test)

# Collapse the one-hot flag columns into a single 'category' label per row
# (label_race is passed directly; wrapping it in a lambda adds nothing)
df_test["category"] = df_test.apply(label_race, axis=1)

# Remove the one-hot flag columns now that 'category' carries the label.
# `columns=` is used instead of the positional axis argument (`drop(labels, 1)`),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
drop = ["food", "recharge", "support", "reminders", "nearby", "movies", "casual", "other", "travel"]
df_test = df_test.drop(columns=drop)

In [16]:
# Preview the first rows of the prepared test frame
df_test.head()

Unnamed: 0,message,category
0,Nearest metro station,nearby
1,Pick up n drop service trough cab,travel
2,I wants to buy a bick,other
3,Show me pizza,food
4,What is the cheapest package to andaman and ni...,travel


In [17]:
# Lower-case the test messages
all_text = df_test['message'].str.lower()

# Encode the test labels with the already-fitted encoder (label_encoder)
y_test = label_encoder.transform(df_test['category'])

# Vectorize the test messages with the already-fitted vectorizer (tf_idf), as a dense array
X_test = tf_idf.transform(all_text).toarray()

# Logistic regression (log_reg): test accuracy and predictions
log_accuracy_2 = log_reg.score(X_test, y_test)
y_pred = log_reg.predict(X_test)

# Naive Bayes (nb): test accuracy and predictions
nb_accuracy_2 = nb.score(X_test, y_test)
y_pred = nb.predict(X_test)

# Linear SVM (lsvm): test accuracy and predictions
lsvm_accuracy_2 = lsvm.score(X_test, y_test)
y_pred = lsvm.predict(X_test)








<h3>LSA Modeling</h3>
In this task, we will try to see how to use LSI on the entire dataset.

Instructions
- A cleaned list called doc_clean containing data of message column is already given.

- Create a dictionary from 'doc_clean' using "corpora.Dictionary()" and store it in a variable called 'dictionary'

- Create a word corpus of 'dictionary' using "doc2bow()" method and store the result in a variable called 'doc_term_matrix'

- Initialise the LSI model "LsiModel()" with the parameters corpus=doc_term_matrix, num_topics=5, id2word=dictionary and store it in lsimodel.

- Print(Use pprint()) the 5 topics using "print_topics()" method of lsimodel and have a look at it.

In [18]:
# Shared resources used by clean(): lemmatizer, punctuation characters, English stopwords
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))
# Normalise one document: drop stopwords, strip punctuation, lemmatize
def clean(doc):
    """Return `doc` lower-cased, without stopwords or punctuation, lemmatized.

    Relies on the module-level `stop` (stopword set), `exclude` (punctuation
    characters) and `lemma` (object exposing `lemmatize(word)`).
    The result is a single space-separated string.
    """
    # Stopwords are matched on whitespace-separated, lower-cased tokens
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    # Punctuation is removed character by character from the re-joined text
    without_punct = "".join(filter(lambda ch: ch not in exclude, " ".join(kept_tokens)))
    # Re-split, since removing punctuation can leave empty tokens behind
    return " ".join(map(lemma.lemmatize, without_punct.split()))

# Pull the raw messages out of the dataframe as a plain Python list
list_of_docs = df["message"].tolist()

# Clean every message, then split the cleaned text into a list of tokens
doc_clean = [cleaned.split() for cleaned in map(clean, list_of_docs)]

# Code starts here

In [19]:
# Token <-> integer-id mapping built from the cleaned, tokenized messages
dictionary = corpora.Dictionary(documents=doc_clean)

# Bag-of-words representation of every cleaned message
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

# Fit a 5-topic LSI model on the bag-of-words corpus and show its topics
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

[(0,
  '0.347*"reminder" + 0.267*"like" + 0.267*"cancel" + 0.266*"would" + '
  '0.256*"userid" + 0.256*"apiname" + 0.256*"offset" + 0.256*"exotel" + '
  '0.255*"reminderlist" + 0.255*"taskname"'),
 (1,
  '0.831*"want" + 0.221*"u" + 0.187*"know" + 0.181*"movie" + 0.135*"book" + '
  '0.128*"ticket" + 0.114*"need" + 0.107*"hi" + 0.095*"please" + '
  '0.092*"service"'),
 (2,
  '0.451*"reminder" + -0.328*"call" + -0.316*"u" + -0.233*"wake" + '
  '0.205*"water" + -0.197*"march" + -0.192*"wakeup" + 0.185*"every" + '
  '0.181*"drink" + 0.168*"want"'),
 (3,
  '0.611*"u" + -0.419*"want" + 0.244*"need" + 0.238*"reminder" + '
  '0.197*"please" + 0.143*"movie" + 0.117*"service" + -0.102*"wake" + '
  '0.101*"near" + 0.101*"help"'),
 (4,
  '-0.621*"need" + 0.510*"u" + -0.492*"movie" + -0.190*"offer" + 0.137*"want" '
  '+ -0.115*"ticket" + -0.058*"know" + -0.051*"today" + 0.051*"find" + '
  '-0.049*"book"')]






<h3>LDA Modeling</h3>

Next let's try topic modeling using LDA. We will first find the optimum no. of topics using the coherence score and then build a model with that optimum no. of topics.

- A function called "compute_coherence_values" that computes and returns different coherence values along with the no. of topics is already defined

- Call the function "compute_coherence_values" with the parameters passed dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean, start=1, limit=41, step=5. (We want to check the coherence score for topic num ranging from 1 to 40 with a gap of 5). Store the return of the function in variables 'topic_list' & 'coherence_value_list'

- Find the no. of topics from 'topic_list' associated with the maximum coherence score of 'coherence_value_list'(Hint: They will have same index) and store that no. in a variable called 'opt_topic'.

- Print 'opt_topic' to take a look at the optimum no. of topics.

- Initialize LdaModel() with the parameters corpus=doc_term_matrix, num_topics=opt_topic, id2word=dictionary, iterations=10, passes=30 & random_state=0 and store it in 'lda_model'.

- Print(use pprint() instead of print()) five of the topics using "print_topics(5)" method of lda_model and have a look at it.

In [20]:
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for LDA models over a range of topic counts

    One LDA model is trained per value in range(start, limit, step) and
    scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus that every LDA model is trained on
    texts : List of input texts (passed to the coherence model)
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    topic_list : No. of topics chosen
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    topic_list = []
    for num_topics in range(start, limit, step):
        # Train on the caller-supplied corpus rather than a notebook global,
        # so the function works for whatever corpus is passed in.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, random_state=0, num_topics=num_topics, id2word=dictionary, iterations=10)
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values

In [21]:
# Score LDA models for topic counts 1, 6, 11, ..., 36
topic_list, coherence_value_list = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean, start=1, limit=41, step=5)

# The two lists are index-aligned, so the position of the best coherence
# score also selects the matching number of topics
opt_topic = topic_list[np.argmax(coherence_value_list)]
print(opt_topic)

# Train the final LDA model with the selected number of topics
lda_model = LdaModel(corpus=doc_term_matrix, num_topics=opt_topic, id2word=dictionary, iterations=10, passes=30, random_state=0)

# Show five of its topics
pprint(lda_model.print_topics(5))

  if not i.flags.writeable or i.dtype not in (np.int32, np.int64):
  if not j.flags.writeable or j.dtype not in (np.int32, np.int64):


36
[(12,
  '0.221*"show" + 0.123*"center" + 0.112*"service" + 0.069*"discount" + '
  '0.028*"cake" + 0.026*"message" + 0.024*"paytm" + 0.014*"surat" + '
  '0.012*"whether" + 0.011*"previous"'),
 (8,
  '0.190*"find" + 0.136*"give" + 0.118*"booking" + 0.073*"u" + 0.033*"nearest" '
  '+ 0.032*"full" + 0.026*"okay" + 0.021*"take" + 0.018*"star" + '
  '0.015*"receipt"'),
 (14,
  '0.174*"detail" + 0.048*"received" + 0.047*"note" + 0.042*"mean" + '
  '0.034*"ask" + 0.026*"branch" + 0.025*"people" + 0.022*"u" + 0.018*"parlour" '
  '+ 0.016*"gud"'),
 (28,
  '0.351*"please" + 0.090*"fare" + 0.083*"good" + 0.044*"suggest" + '
  '0.033*"review" + 0.033*"provide" + 0.031*"u" + 0.030*"see" + 0.029*"office" '
  '+ 0.016*"abt"'),
 (4,
  '0.226*"number" + 0.081*"shop" + 0.064*"store" + 0.060*"hyderabad" + '
  '0.032*"sunday" + 0.025*"great" + 0.022*"3rd" + 0.020*"u" + 0.017*"window" + '
  '0.015*"door"')]
