In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib as mpl
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import gensim
from gensim import corpora, models, similarities
import pyLDAvis.gensim
from collections import defaultdict
import string

In [2]:
# Read data
df=pd.read_csv('sample.csv')

#preview data
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [3]:
# join request and response into the same row: 'text_x' is the response from customer services; 'text_y' is the request
def pre(df,author):
    a=df[df.author_id==author]
    a=a.merge(df.loc[:,['tweet_id','text']],left_on='in_response_to_tweet_id',right_on='tweet_id')
    a=a[a.response_tweet_id.isnull()]    
    a['text_x']=a.text_x.apply(lambda x: x[:x.find('@')]+x[x.find('@')+8:])    
    return a

In [4]:
amazon = []
amazon.append(pre(df,'AppleSupport'))
amazon

[    tweet_id_x     author_id  inbound                      created_at  \
 0       119248  AppleSupport    False  Wed Oct 11 13:38:29 +0000 2017   
 1       119252  AppleSupport    False  Wed Oct 11 13:40:27 +0000 2017   
 2       119262  AppleSupport    False  Wed Oct 11 13:30:39 +0000 2017   
 3       119267  AppleSupport    False  Wed Oct 11 13:30:38 +0000 2017   
 4       119269  AppleSupport    False  Wed Oct 11 13:30:12 +0000 2017   
 6       119279  AppleSupport    False  Wed Oct 11 13:35:01 +0000 2017   
 8       119293  AppleSupport    False  Wed Oct 11 13:30:00 +0000 2017   
 9       119298  AppleSupport    False  Wed Oct 11 13:34:00 +0000 2017   
 10      119300  AppleSupport    False  Wed Oct 11 13:31:27 +0000 2017   
 11      119323  AppleSupport    False  Wed Oct 11 13:55:31 +0000 2017   
 
                                                text_x response_tweet_id  \
 0   We can help. Which version of iOS are you on? ...               NaN   
 1   Thanks for reaching out to 

In [5]:
# append all the questions about amazon to a list
q = amazon[-1]['text_y'].to_list()
q

['@105838 @AppleSupport Me too am suffering , hope the can find a solution',
 'I just updated my phone and suddenly everything takes ages to load wtf @76099 this update sux I hate it fix it bye',
 '@AppleSupport after the 11.0.2 my phone just sucks most of the apps are broken, wifi disconnects frequently #apple #ios1102 #painfulupdate',
 'Okay @76099 I used my fucking phone for 2 minutes and it drains it down 8 fucking percent',
 '@AppleSupport Can you get my iPhone 7plus back on the old iOS please?  Battery runs out in half the time, apps now frequently crash.',
 'So the new @76099 update does not let me listen to music and go on whatsapp at the same time?!?',
 'Took my phone off charge at 7:20am.\n\n8:03am - 60% battery remaining.\n\n@76099 plz I beg you, sort your battery life out😩',
 '@AppleSupport I need a new code for my I-store. I haven’t recd any but msg is too many sent. Help!',
 '@76099 @AppleSupport fix this update. It’s horrible',
 '@AppleSupport I have the latest version i

In [6]:
#remove '\r' and punctuations 
new_text = []
for tweet in q: 
    for i in tweet:
        if i in string.punctuation or i == '@':
            tweet = tweet.replace(i,'') #replace punctuation with nothing
        if i == '\r':
            tweet = tweet.replace(i,' ') #replace \r with space
    new_text.append(tweet) 
new_text

['105838 AppleSupport Me too am suffering  hope the can find a solution',
 'I just updated my phone and suddenly everything takes ages to load wtf 76099 this update sux I hate it fix it bye',
 'AppleSupport after the 1102 my phone just sucks most of the apps are broken wifi disconnects frequently apple ios1102 painfulupdate',
 'Okay 76099 I used my fucking phone for 2 minutes and it drains it down 8 fucking percent',
 'AppleSupport Can you get my iPhone 7plus back on the old iOS please  Battery runs out in half the time apps now frequently crash',
 'So the new 76099 update does not let me listen to music and go on whatsapp at the same time',
 'Took my phone off charge at 720am\n\n803am  60 battery remaining\n\n76099 plz I beg you sort your battery life out😩',
 'AppleSupport I need a new code for my Istore I haven’t recd any but msg is too many sent Help',
 '76099 AppleSupport fix this update It’s horrible',
 'AppleSupport I have the latest version iOS It started immediately after I upd

In [7]:
#remove stop words, to lowercase and tokenize
mystopwords = stopwords.words()
tokens_list = [[word for word in tweet.lower().split(' ') if word not in mystopwords and word.isalpha() and word != 'amazon' and word != 'amazonhelp']
         for tweet in new_text]
tokens_list

[['applesupport', 'suffering', 'hope', 'find', 'solution'],
 ['updated',
  'phone',
  'suddenly',
  'takes',
  'ages',
  'load',
  'wtf',
  'update',
  'sux',
  'hate',
  'fix',
  'bye'],
 ['applesupport',
  'phone',
  'sucks',
  'apps',
  'broken',
  'wifi',
  'disconnects',
  'frequently',
  'apple',
  'painfulupdate'],
 ['fucking', 'phone', 'minutes', 'drains', 'fucking', 'percent'],
 ['applesupport',
  'iphone',
  'back',
  'ios',
  'battery',
  'runs',
  'half',
  'time',
  'apps',
  'frequently',
  'crash'],
 ['update', 'listen', 'music', 'whatsapp', 'time'],
 ['phone', 'charge', 'battery', 'plz', 'beg', 'sort', 'battery', 'life'],
 ['applesupport', 'code', 'istore', 'recd', 'msg'],
 ['applesupport', 'fix', 'update', 'horrible'],
 ['applesupport',
  'latest',
  'version',
  'ios',
  'started',
  'immediately',
  'updated',
  'phone']]

In [8]:
#remove words that appear only once
frequency = defaultdict(int)

for tokens in tokens_list:
    for token in tokens:
        frequency[token] += 1      
tokens_list = [[token for token in tokens if frequency[token]>1]
              for tokens in tokens_list]
tokens_list

[['applesupport'],
 ['updated', 'phone', 'update', 'fix'],
 ['applesupport', 'phone', 'apps', 'frequently'],
 ['fucking', 'phone', 'fucking'],
 ['applesupport', 'ios', 'battery', 'time', 'apps', 'frequently'],
 ['update', 'time'],
 ['phone', 'battery', 'battery'],
 ['applesupport'],
 ['applesupport', 'fix', 'update'],
 ['applesupport', 'ios', 'updated', 'phone']]

In [9]:
#Generate Term Document Matrix
# generate token dictionary class
dictionary = corpora.Dictionary(tokens_list) 
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f69c103fdf0>

In [10]:
# generate a unique token list 
sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token for (ID,token) in sort_token]

# build a corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]
corpus

[[(0, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (2, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 2)],
 [(0, 1), (5, 1), (6, 1), (8, 1), (9, 1), (10, 1)],
 [(3, 1), (10, 1)],
 [(2, 1), (8, 2)],
 [(0, 1)],
 [(0, 1), (1, 1), (3, 1)],
 [(0, 1), (2, 1), (4, 1), (9, 1)]]

In [11]:
# Save a Term Document Matrix
matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype = 'int')
matrix = matrix.T 
#transpose the matrix 

#convert the numpy matrix into pandas dataframe
matrix_df = pd.DataFrame(matrix, columns=unique_token)
matrix_df

Unnamed: 0,applesupport,fix,phone,update,updated,apps,frequently,fucking,battery,ios,time
0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,1,0,0,0,0,0,0
2,1,0,1,0,0,1,1,0,0,0,0
3,0,0,1,0,0,0,0,2,0,0,0
4,1,0,0,0,0,1,1,0,1,1,1
5,0,0,0,1,0,0,0,0,0,0,1
6,0,0,1,0,0,0,0,0,2,0,0
7,1,0,0,0,0,0,0,0,0,0,0
8,1,1,0,1,0,0,0,0,0,0,0
9,1,0,1,0,1,0,0,0,0,1,0


In [12]:
#fit to LDA model
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10) 

#Topic matrix (V matrix)
lda.print_topics(10) 

[(0,
  '0.091*"applesupport" + 0.091*"phone" + 0.091*"update" + 0.091*"battery" + 0.091*"time" + 0.091*"fucking" + 0.091*"updated" + 0.091*"frequently" + 0.091*"apps" + 0.091*"fix"'),
 (1,
  '0.155*"applesupport" + 0.155*"battery" + 0.155*"time" + 0.155*"frequently" + 0.155*"apps" + 0.155*"ios" + 0.014*"phone" + 0.014*"update" + 0.014*"fucking" + 0.014*"fix"'),
 (2,
  '0.524*"applesupport" + 0.048*"phone" + 0.048*"update" + 0.048*"battery" + 0.048*"fucking" + 0.048*"time" + 0.048*"fix" + 0.048*"updated" + 0.048*"apps" + 0.048*"frequently"'),
 (3,
  '0.412*"fucking" + 0.216*"phone" + 0.216*"applesupport" + 0.020*"update" + 0.020*"battery" + 0.020*"fix" + 0.020*"apps" + 0.020*"ios" + 0.020*"updated" + 0.020*"time"'),
 (4,
  '0.091*"applesupport" + 0.091*"phone" + 0.091*"update" + 0.091*"fix" + 0.091*"fucking" + 0.091*"time" + 0.091*"frequently" + 0.091*"battery" + 0.091*"ios" + 0.091*"apps"'),
 (5,
  '0.296*"update" + 0.155*"phone" + 0.155*"fix" + 0.155*"updated" + 0.155*"time" + 0.014*"

In [13]:
# Generate U Matrix for LDA model
corpus_lda = lda[corpus] 
#transform lda model

#convert corpus_lda to numpy matrix
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda,num_terms=10).T

#write U_matrix into pandas dataframe and output
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
U_matrix_lda_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.05,0.050006,0.549968,0.050008,0.05,0.05,0.050008,0.05,0.05001,0.05
1,0.02,0.02,0.02,0.020002,0.02,0.819988,0.020005,0.02,0.020005,0.02
2,0.02,0.020007,0.020005,0.020003,0.02,0.020001,0.819982,0.02,0.020002,0.02
3,0.025,0.025,0.025,0.774993,0.025,0.025002,0.025005,0.025,0.025,0.025
4,0.014286,0.871419,0.014289,0.014287,0.014286,0.014286,0.014289,0.014286,0.014287,0.014286
5,0.033333,0.033341,0.033333,0.033333,0.033333,0.699988,0.033333,0.033333,0.033338,0.033333
6,0.025,0.025004,0.025,0.025002,0.025,0.025001,0.774993,0.025,0.025,0.025
7,0.05,0.050006,0.549968,0.050008,0.05,0.05,0.050008,0.05,0.05001,0.05
8,0.025,0.025002,0.025006,0.025002,0.025,0.025006,0.025002,0.025,0.774982,0.025
9,0.02,0.020004,0.020005,0.020003,0.02,0.020004,0.819982,0.02,0.020002,0.02


In [14]:
print (matrix_df.shape)
print (U_matrix_lda_df.shape)

(10, 11)
(10, 10)


In [15]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, dictionary)

In [16]:
pip install jieba

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [17]:
import pandas as pd
import matplotlib as mpl
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import time
import jieba
import codecs
import gc
import tqdm
import gensim
from gensim import corpora, models, similarities
import pyLDAvis.gensim
from collections import defaultdict
import string

In [18]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
#generate class for segmentation
class Seg(object):
#     stopword_filepath = "stopword.txt"

    def __init__(self):
        self.stopwords = set()
#         self.read_in_stopword()

    def read_in_stopword(self):
        file_obj = codecs.open(self.stopword_filepath, 'r', 'utf-8')
        while True:
            line = file_obj.readline()
            line=line.strip('\r\n')
            if not line:
                break
            self.stopwords.add(line)
        file_obj.close()

    #tokenize, remove stop words, and stemming using Porter Stemmer  
    def cut(self, sentence, stopword= False, stemming = True):
        seg_list = nltk.word_tokenize(sentence)
        results = []
        if stopword:
            for seg in seg_list:
                if seg in self.stopwords:
                    continue
                if seg.isalpha():
                    results.append(seg)
        else:
            results=[token for token in seg_list if token.isalpha()]
        if stemming:
            porter = nltk.PorterStemmer()
            results=[porter.stem(token.lower()) for token in results]
        return results

In [20]:
#generate class for sentences
class Sentence(object):
    def __init__(self, sentence, seg, id=0):
        self.id = id
        self.origin_sentence = sentence
        self.cuted_sentence = self.cut(seg)

    # sentence segmentation
    def cut(self, seg):
        return seg.cut(self.origin_sentence)

    # get words after sentence segmentation
    def get_cuted_sentence(self):
        return self.cuted_sentence

    def get_origin_sentence(self):
        return self.origin_sentence

    # set scores for sentences
    def set_score(self, score):
        self.score = score

In [31]:
#generate class for calculating similarity
class SentenceSimilarity():
    def __init__(self, seg, csName):
        self.seg = seg
        self.csName = csName.lower()

    def set_sentences(self, sentences):
        self.sentences = []
        for i in range(0, len(sentences)):
            self.sentences.append(Sentence(sentences[i], self.seg, i))

    # get words after sentence segmentation
    def get_cuted_sentences(self):
        cuted_sentences = []
        for sentence in self.sentences:
            cuted_sentences.append(sentence.get_cuted_sentence())
        return cuted_sentences

    # using basic model to build complicated models
    def simple_model(self, min_frequency = 1):
        self.texts = self.get_cuted_sentences()

        # remove words with lowest frequency
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1        
        self.texts = [[token for token in text if (frequency[token] > min_frequency) and (token != self.csName)] for text in self.texts]
        # generate dictionary class
        self.dictionary = corpora.Dictionary(self.texts)
        # build a corpus
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]
    
    def average_word_vectors(self,words, model, vocabulary, num_features):    
        feature_vector = np.zeros((num_features,),dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector 
   
    # averaged word vector features 
    def averaged_word_vectorizer(self,corpus, model, num_features):
        vocabulary = set(model.wv.index2word)
        if type(corpus[0])==list:
            features = [self.average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                            for tokenized_sentence in corpus]
        else:
            features=self.average_word_vectors(corpus, model, vocabulary, num_features)
        return np.array(features)
        
    # build word2vec model      
    def w2vModel(self):
        self.simple_model()
        # switch from simple model to comprehensive
        self.model = models.Word2Vec(self.texts,size=200, min_count=5)
        self.features = self.averaged_word_vectorizer(corpus=self.texts,
                                                 model=self.model,
                                                 num_features=200)           
    # build tfidf model
    def TfidfModel(self):
        self.simple_model()
        # switch from simple model to comprehensive
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Generate Similarity Matrix for TFIDF model
        self.index = similarities.MatrixSimilarity(self.corpus)

    # lsi model
    def LsiModel(self):
        self.simple_model()
        # switch from simple model to comprehensive
        self.model = models.LsiModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Generate Similarity Matrix for LSI model
        self.index = similarities.MatrixSimilarity(self.corpus)

    # lda model
    def LdaModel(self):
        self.simple_model()
        # switch from simple model to comprehensive
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Generate Similarity Matrix for LDA model
        self.index = similarities.MatrixSimilarity(self.corpus)

    # preliminary steps for input sentences
    def sentence2vec(self, sentence):
        sentence = Sentence(sentence, self.seg).get_cuted_sentence()
        vec_bow = self.dictionary.doc2bow(sentence)
        return self.model[vec_bow]
    
    def bow2vec(self):
        vec = []
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for co in content:
                sentence_vectors[co[0]] = co[1]  # 将句子出现的单词的tf-idf表示放入矩阵中
            vec.append(sentence_vectors)
        return vec

    # look for the most similar sentences
    # input: test sentence    
    def cosine_similarity(self,x,y):
        num = x.dot(y.T)
        denom = np.linalg.norm(x) * np.linalg.norm(y)
        return num / denom
    
    def similarity_k(self, sentence, k):        
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores
    
    def similarity_v(self, sentence, k):       
        cuts=Sentence(sentence, self.seg).get_cuted_sentence()
        sentence_vec=self.averaged_word_vectorizer(corpus=cuts,
                                      model=self.model,
                                     num_features=200)
        d=[]
        for i in range(len(self.features)):
            score=self.cosine_similarity(self.features[i],sentence_vec)
            if score >=0 or score <=0:
                d.append([i,score]) 
        sim_k = sorted(d, key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores

In [32]:
def read_corpus(df,seg):
    qList = []
    # list of keywords in tweets
    qList_kw = []
    aList = []
    data = df[['text_y','text_x']]
    data_ls = np.array(data).tolist()
    for t in data_ls:
        qList.append(t[0])
        qList_kw.append(seg.cut(t[0]))
        aList.append(t[1])
    return qList_kw, qList, aList

# define function for frequency distribution plot
def plot_words(wordList):
    fDist = FreqDist(wordList)
    #print(fDist.most_common())
    print("Total number of words: ",fDist.N())
    print("Total number of distinct words: ",fDist.B())
    fDist.plot(10)

In [33]:
# Read data
df=pd.read_csv('sample.csv')

In [24]:
#preview data
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [25]:
#sort by companies responded the most
df[df.inbound==False].groupby('author_id').count().sort_values('text',ascending=False)[:10]

Unnamed: 0_level_0,tweet_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AppleSupport,13,13,13,13,3,13
SpotifyCares,8,8,8,8,6,8
Tesco,8,8,8,8,6,8
VirginTrains,4,4,4,4,3,4
British_Airways,3,3,3,3,1,3
Ask_Spectrum,1,1,1,1,1,1
ChaseSupport,1,1,1,1,0,1
HPSupport,1,1,1,1,0,1
O2,1,1,1,1,0,1
SouthwestAir,1,1,1,1,1,1


In [26]:
#selected the three most responsive companies
clist=df[df.inbound==False].groupby('author_id').count().sort_values('text',ascending=False)[:3].index.to_list()

In [27]:
clist

['AppleSupport', 'SpotifyCares', 'Tesco']

In [28]:
# join request and response into the same row: 'text_x' is the response from customer services; 'text_y' is the request
def pre(df,author):
    a=df[df.author_id==author]
    a=a.merge(df.loc[:,['tweet_id','text']],left_on='in_response_to_tweet_id',right_on='tweet_id')
    a=a[a.response_tweet_id.isnull()]    
    a['text_x']=a.text_x.apply(lambda x: x[:x.find('@')]+x[x.find('@')+8:])    
    return a

In [29]:
# a list of tweets responded by 'AmazonHelp', 'AppleSupport', and 'Uber_Support'
dataset=[]
for i in clist:
    dataset.append(pre(df,i))
dataset

[    tweet_id_x     author_id  inbound                      created_at  \
 0       119248  AppleSupport    False  Wed Oct 11 13:38:29 +0000 2017   
 1       119252  AppleSupport    False  Wed Oct 11 13:40:27 +0000 2017   
 2       119262  AppleSupport    False  Wed Oct 11 13:30:39 +0000 2017   
 3       119267  AppleSupport    False  Wed Oct 11 13:30:38 +0000 2017   
 4       119269  AppleSupport    False  Wed Oct 11 13:30:12 +0000 2017   
 6       119279  AppleSupport    False  Wed Oct 11 13:35:01 +0000 2017   
 8       119293  AppleSupport    False  Wed Oct 11 13:30:00 +0000 2017   
 9       119298  AppleSupport    False  Wed Oct 11 13:34:00 +0000 2017   
 10      119300  AppleSupport    False  Wed Oct 11 13:31:27 +0000 2017   
 11      119323  AppleSupport    False  Wed Oct 11 13:55:31 +0000 2017   
 
                                                text_x response_tweet_id  \
 0   We can help. Which version of iOS are you on? ...               NaN   
 1   Thanks for reaching out to 

In [40]:
while True:
    way=input('Select a model for encoding: w2v, tfidf, lsi, or lda? ')
    #Set up for specific model
    w2v=False
    if way == 'w2v':
        w2v=True
        print('Please wait for the system to set up.')
        time1=time.time()
        seg0 = Seg()
        List_kw0, questionList0, answerList0 = read_corpus(dataset[0],seg0)
        ss0 = SentenceSimilarity(seg0,clist[0])
        ss0.set_sentences(questionList0)
        ss0.w2vModel()
#         ss0.TfidfModel() 
        #     ss.LsiModel()
        #     ss.LdaModel()        
        seg1 = Seg()
        List_kw1, questionList1, answerList1 = read_corpus(dataset[1],seg1)
        ss1 = SentenceSimilarity(seg1,clist[1])
        ss1.set_sentences(questionList1)
        ss1.w2vModel()
#         ss1.TfidfModel() 
        seg2 = Seg()
        List_kw2, questionList2, answerList2 = read_corpus(dataset[2],seg2)
        ss2 = SentenceSimilarity(seg2,clist[2])
        ss2.set_sentences(questionList2)
        ss2.w2vModel()
#         ss2.TfidfModel() 
        time2=time.time()
        print('The setup is now complete, which took {} s.'.format(time2-time1))
        print('---------------------------------------------------------------------------------------------------')
        
        #Start chatting with customer
        print('Hey there! This is Auto Customer Service. First, please choose the company you would like to chat with:')
        while True:
            company=input("0 for Amazon, 1 for Apple, 2 for Uber, 'q' to quit: ")
            if company in ['0','1','2']:
                while True:
                    question = input("Please type your question here or press 'q' to quit: ")
                    if question == 'q':
                        break
                    time1 = time.time()

                    # chats for Amazon
                    if company=='0':
                        question_k = ss0.similarity_v(question, 5)
                        time2 = time.time()
                        for i in range(5):
                            if question_k[1][i]>=0.5:
                                print('Thanks for asking, here is the',i+1,'most likely answer(s) from AmazonHelp:')
                                print(answerList0[question_k[0][i]])
                                while True:
                                    solved=input('Is your problem solved? (input y/n)')
                                    if solved=='n' or solved=='y':
                                        print()
                                        break
                                    elif solved!='y':
                                        input('Please enter y/n')
                                        continue
                                if solved=='y':
                                    print('Thank you for using Auto Customer Service. It\'s my pleasure to help.')
                                    break

                            else:
                                if i == 0:
                                    print('Unfortunately, I can\'t find any answer in our database system, please contact human services.' )
                                else:
                                    print('Unfortunately, I can\'t find more answers in the system, please contact human services if you still have questions.')
                                break


                        for idx, score in zip(*question_k):
                            print("Similar questions： {},                score： {}".format(questionList0[idx], score))

                        cost = time2 - time1
                        print('It took {} s to look for the answers'.format(cost))
                        print('---------------------------------------------------------------------------------------------------')
                        print('Is there anything else I can help?')# chats for apple
                    elif company=='1':
                        question_k = ss1.similarity_v(question, 5)
                        time2 = time.time()
                        for i in range(5):
                            if question_k[1][i]>=0.5:
                                print('Thanks for asking, here is the',i+1,'most likely answer from AppleSupport')
                                print(answerList1[question_k[0][i]])
                                while True:
                                    solved=input('Is this problem solved? (input y/n)')
                                    if solved=='n' or solved=='y':
                                        print()
                                        break
                                    elif solved!='y':
                                        input('Please enter y/n')
                                        continue
                                if solved=='y':
                                    print('Thank you for using Auto Customer Service. It\'s my pleasure to solve your problem.')
                                    break

                            else:
                                if i == 0:
                                    print('Unfortunately, I can\'t find any answer in our database, please contact human services.' )
                                else:
                                    print('Unfortunately, I can\'t find more answers in our database, please contact human services if you still have questions.')
                                break


                        for idx, score in zip(*question_k):
                            print("Similar questions： {},                score： {}".format(questionList1[idx], score))

                        cost = time2 - time1
                        print('It took {} s to look for the answers'.format(cost))
                        print('---------------------------------------------------------------------------------------------------')
                        print('Is there anything else I can help?')

                    # chats for Uber
                    else:
                        question_k = ss2.similarity_v(question, 5)
                        time2 = time.time()
                        for i in range(5):
                            if question_k[1][i]>=0.5:
                                print('Thanks for asking, here is the',i+1,'most likely answer(s) from Uber_Support')
                                print(answerList2[question_k[0][i]])
                                while True:
                                    solved=input('Is this problem solved? (input y/n)')
                                    if solved=='n' or solved=='y':
                                        print()
                                        break
                                    elif solved!='y':
                                        input('Please enter y/n')
                                        continue
                                if solved=='y':
                                    print('Thank you for using Auto Customer Service. It\'s my pleasure to solve your problem.')
                                    break

                            else:
                                if i == 0:
                                    print('Unfortunately, I can\'t find any answer in our database, please contact human services.' )
                                else:
                                    print('Unfortunately, I can\'t find more answers in our database, please contact human services if you still have questions.')
                                break

                        for idx, score in zip(*question_k):
                            print("Similar questions： {},                score： {}".format(questionList2[idx], score))

                        cost = time2 - time1
                        print('It took {} s to look for the answers'.format(cost))
                        print('---------------------------------------------------------------------------------------------------')
                        print('Is there anything else I can help?')
                print('Thank you for asking. Would you like to ask questions about other companies?')
                
                    
            elif company=='q':
                print('Thank you. Say safe and have a good one!')
                break
            else:
                print('Please input 0, 1, 2 or q')
            
        break
            
            
        
    elif way == 'tfidf':
        print('Please wait for system to set up')
        time1=time.time()
        seg0 = Seg()
        List_kw0, questionList0, answerList0 = read_corpus(dataset[0],seg0)
        ss0 = SentenceSimilarity(seg0,clist[0])
        ss0.set_sentences(questionList0)
        #ss0.w2vModel()
        ss0.TfidfModel() 
        #     ss.LsiModel()
        #     ss.LdaModel()        
        seg1 = Seg()
        List_kw1, questionList1, answerList1 = read_corpus(dataset[1],seg1)
        ss1 = SentenceSimilarity(seg1,clist[1])
        ss1.set_sentences(questionList1)
#         ss1.w2vModel()
        ss1.TfidfModel() 
        seg2 = Seg()
        List_kw2, questionList2, answerList2 = read_corpus(dataset[2],seg2)
        ss2 = SentenceSimilarity(seg2,clist[2])
        ss2.set_sentences(questionList2)
#         ss2.w2vModel
        ss2.TfidfModel() 
        time2=time.time()
        print('Finished! Time cost for setting up: {} s'.format(time2-time1))
        print('---------------------------------------------------------------------------------------------------')
        break
        
    elif way == 'lsi':
        print('Please wait for system to set up')
        time1=time.time()
        seg0 = Seg()
        List_kw0, questionList0, answerList0 = read_corpus(dataset[0],seg0)
        ss0 = SentenceSimilarity(seg0,clist[0])
        ss0.set_sentences(questionList0)
        #ss0.w2vModel()
#         ss0.TfidfModel() 
        ss0.LsiModel()
        #     ss.LdaModel()        
        seg1 = Seg()
        List_kw1, questionList1, answerList1 = read_corpus(dataset[1],seg1)
        ss1 = SentenceSimilarity(seg1,clist[1])
        ss1.set_sentences(questionList1)
#         ss1.w2vModel()
        ss1.LsiModel() 
        seg2 = Seg()
        List_kw2, questionList2, answerList2 = read_corpus(dataset[2],seg2)
        ss2 = SentenceSimilarity(seg2,clist[2])
        ss2.set_sentences(questionList2)
#         ss2.w2vModel
        ss2.LsiModel() 
        time2=time.time()
        print('Finished! Time cost for setting up: {} s'.format(time2-time1))
        print('---------------------------------------------------------------------------------------------------')
        break
        
    elif way == 'lda':
        print('Please wait for system to set up')
        time1=time.time()
        seg0 = Seg()
        List_kw0, questionList0, answerList0 = read_corpus(df,seg0)
        ss0 = SentenceSimilarity(seg0,clist[0])
        ss0.set_sentences(questionList0)
        #ss0.w2vModel()
#         ss0.TfidfModel() 
        #     ss.LsiModel()
        ss0.LdaModel()        
        seg1 = Seg()
        List_kw1, questionList1, answerList1 = read_corpus(dataset[1],seg1)
        ss1 = SentenceSimilarity(seg1,clist[1])
        ss1.set_sentences(questionList1)
#         ss1.w2vModel()
        ss1.LdaModel() 
        seg2 = Seg()
        List_kw2, questionList2, answerList2 = read_corpus(dataset[2],seg2)
        ss2 = SentenceSimilarity(seg2,clist[2])
        ss2.set_sentences(questionList2)
#         ss2.w2vModel
        ss2.LdaModel() 
        time2=time.time()
        print('Finished! Time cost for setting up: {} s'.format(time2-time1))
        print('---------------------------------------------------------------------------------------------------')
        break
    else:
        print('Please input right')
        continue
        
if w2v==False:
    print('Hey there! This is Auto Customer Service. First please choose the company:')
    while True:
        company=input('0 for Amazon, 1 for Apple, 2 for Uber: ')
        if company in ['0','1','2']:
            while True:        
                question = input("Please type your question here ('q' to quit): ")
                if question == 'q':
                    break
                time1 = time.time()
                if company=='0':           
                    question_k = ss0.similarity_k(question, 5)
                    time2 = time.time()
                    for i in range(5):
                        if question_k[1][i]>=0.5:
                            print('Thanks for asking, here is the',i+1,'most likely answer(s) from AmazonHelp')
                            print(answerList0[question_k[0][i]])
                            while True:
                                solved=input('Is this problem solved? (input y/n)')
                                if solved=='n' or solved=='y':
                                    print()
                                    break
                                elif solved!='y':
                                    input('Please enter y/n')
                                    continue
                            if solved=='y':
                                print('Thank you for using Auto Customer Service. It\'s my pleasure to solve your problem.')
                                break                    
                        else:
                            if i == 0:
                                print('Unfortunately, I can\'t find any answer in our database, please contact human services.' )
                            else:
                                print('Unfortunately, I can\'t find more answers in our database, please contact human services if you still have questions.')
                            break


                    for idx, score in zip(*question_k):
                        print("Similar questions： {},                score： {}".format(questionList0[idx], score))

                    cost = time2 - time1
                    print('It took {} s to look for the answers'.format(cost))
                    print('---------------------------------------------------------------------------------------------------')
                    print('What else can I help')
                elif company=='1':
                    question_k = ss1.similarity_k(question, 5)
                    time2 = time.time()
                    for i in range(5):
                        if question_k[1][i]>=0.5:
                            print('Thanks for asking, here is the',i+1,'most likely answer(s) from AppleSupport')
                            print(answerList1[question_k[0][i]])
                            while True:
                                solved=input('Is this problem solved? (input y/n)')
                                if solved=='n' or solved=='y':
                                    print()
                                    break
                                elif solved!='y':
                                    input('Please enter y/n')
                                    continue
                            if solved=='y':
                                print('Thank you for using Auto Customer Service. It\'s my pleasure to solve your problem.')
                                break

                        else:
                            if i == 0:
                                print('Unfortunately, I can\'t find any answer in our database, please contact human services.' )
                            else:
                                print('Unfortunately, I can\'t find more answers in our database, please contact human services if you still have questions.')
                            break


                    for idx, score in zip(*question_k):
                        print("Similar questions： {},                score： {}".format(questionList1[idx], score))

                    cost = time2 - time1
                    print('It took {} s to look for the answers'.format(cost))
                    print('---------------------------------------------------------------------------------------------------')
                    print('Is there anything else I can help?')
                else:
                    question_k = ss2.similarity_k(question, 5)
                    time2 = time.time()
                    for i in range(5):
                        if question_k[1][i]>=0.5:
                            print('Thanks for asking, here is the',i+1,'most likely answer(s) from Uber_Support')
                            print(answerList2[question_k[0][i]])
                            while True:
                                solved=input('Is this problem solved? (input y/n)')
                                if solved=='n' or solved=='y':
                                    print()
                                    break
                                elif solved!='y':
                                    input('Please enter y/n')
                                    continue
                            if solved=='y':
                                print('Thank you for using Auto Customer Service. It\'s my pleasure to solve your problem.')
                                break

                        else:
                            if i == 0:
                                print('Unfortunately, I can\'t find any answer in the systen, please contact human services.' )
                            else:
                                print('Unfortunately, I can\'t find more answers in the systen, please contact human services if you still have questions.')
                            break


                    for idx, score in zip(*question_k):
                        print("Similar questions： {},                score： {}".format(questionList2[idx], score))

                    cost = time2 - time1
                    print('It took {} s to look for the answers?'.format(cost))
                    print('---------------------------------------------------------------------------------------------------')
                    print('Is there anything else I can help')
            
            print('Thank you for asking. Do you want to ask questions about other companies?')
                
                    
        elif company=='q':
            print('Thank you. Stay safe and have a good one!')
            break
        else:
            print('Please enter 0, 1, 2 or q')

Select a model for encoding: w2v, tfidf, lsi, or lda? lda
Please wait for system to set up


KeyError: "None of [Index(['text_y', 'text_x'], dtype='object')] are in the [columns]"