&#x1F534;
**Author** : Manish Bhandari (Smart Dubai Govt.)

**Project** : Extract Log

**Created on** : 22nd-Mar-2018



----

In [1]:
from cloudant.client import Cloudant
from cloudant.error import CloudantException
from cloudant.result import Result, ResultByKey
from scipy.stats import stats
import cloudant as cld
from dateutil import tz
import datetime
import time
import json
import csv
import pandas as pd
import numpy as np
from stemming.porter2 import stem
from nltk.corpus import stopwords

#### &#x1F534; Variables Setup

In [2]:
# Target environment selector. UAT: 1, Dev : 0
# NOTE(review): the value 2 is not listed above -- presumably production; confirm.
environment = 2
language = 'EN'

# Extraction window, time in GMT. Constructor arguments are YYYY, MM, DD, HH, MM, SS.
fromDateTime = datetime.datetime(2018, 1, 1, 0, 0, 0)
toDateTime = datetime.datetime(2018, 2, 28, 23, 59, 0)

# CSV name: RASHID_LOG_<language>_<from>_<to>.csv, both bounds rendered as YYYYMMDDHHMM.
output_file = ''.join([
    'RASHID_LOG_',
    language,
    '_',
    fromDateTime.strftime("%Y%m%d%H%M"),
    '_',
    toDateTime.strftime("%Y%m%d%H%M"),
    '.csv',
])

#### &#x1F534; Cloudant DB Details

In [3]:
# The code was removed by DSX for sharing.

#### &#x1F534;Convert Date to UNIX Date format

In [4]:
# Express both bounds of the extraction window as milliseconds elapsed since
# 1970-01-01 (floats); these are the values compared against in the query selector.
epoch = datetime.datetime(1970, 1, 1)
from_datetime, to_datetime = (
    (bound - epoch).total_seconds() * 1000
    for bound in (fromDateTime, toDateTime)
)

In [5]:
# Zone objects for the UTC -> local conversion below.
# to_zone is read by convertUtcToLocal; from_zone is not referenced by the
# visible cells and is kept only in case other code relies on it.
to_zone = tz.gettz('Asia/Dubai')
from_zone = tz.gettz('UTC')
def convertToDateTime(df, colnames):
    """Parse the listed columns of ``df`` as datetimes, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their ``pd.to_datetime`` result.
    colnames : list
        Names of the columns to convert; must be a list, even for one column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in colnames:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
    return df

def convertUtcToLocal(df, colnames, target_zone=None):
    """Shift naive UTC datetime columns to local wall-clock time, in place.

    Each listed column is interpreted as UTC, converted to the target zone,
    and stored back as a timezone-naive datetime truncated to whole seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns must hold timezone-naive datetimes.
    colnames : list
        Names of the columns to convert.
    target_zone : tzinfo or str, optional
        Zone to convert to. Defaults to the module-level ``to_zone``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    zone = to_zone if target_zone is None else target_zone
    for col in colnames:
        localized = df[col].dt.tz_localize('utc').dt.tz_convert(zone)
        # Drop the tz information directly instead of round-tripping through
        # strings (which relied on a hard-coded '+04:00' offset). Flooring to
        # seconds keeps the 19-character truncation the string version applied.
        df[col] = localized.dt.tz_localize(None).dt.floor('s')
    return df

#### &#x1F534;Connect and extract data

In [6]:
def _doc_to_row(doc):
    """Flatten one log document returned by the query into a flat dict.

    Raises whatever lookup error occurs when a mandatory key is missing
    (``language``, ``conversationId``, ``request``, ``response`` and the
    nested keys read below); the caller decides how to handle it.
    """
    row = {}
    row['Language'] = doc['language']
    row['conversation_id'] = doc['conversationId']
    row['workspace_id'] = doc['request']['workspace_id']
    # requestTime is divided by 1000 before gmtime, i.e. it is treated as
    # epoch milliseconds; the formatted string is therefore in UTC.
    row['request TS'] = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(doc['request']['requestTime'] / 1000.0))

    response = doc['response']

    # Only the first entry of the intents list is kept.
    if len(response['intents']) > 0:
        row['Confidence'] = response['intents'][0]['confidence']
        row['Intent'] = response['intents'][0]['intent']

    if 'text' in response['input']:
        row['User Input'] = response['input']['text']

    # Each non-empty entry overwrites the previous one, so the LAST non-empty
    # output text / visited node is what ends up in the row.
    for outtext in response['output']['text']:
        if len(outtext) > 0:
            row['Output'] = outtext

    for nodes_visit in response['output']['nodes_visited']:
        if len(nodes_visit) > 0:
            row['nodes_visit'] = nodes_visit

    return row


client = Cloudant(cloudant_db_credential['serviceUsername'], cloudant_db_credential['servicePassword'], url=cloudant_db_credential['serviceURL'])
client.connect()
try:
    # Create an instance of the database.
    myDatabase = client[cloudant_db]   # client.create_database(databaseName)
    # exists is a method: without the call the bound method is always truthy
    # and the check never fails.
    if myDatabase.exists():
        rows = []
        skipped_docs = 0
        # NOTE(review): result_collection is not used by any visible cell;
        # kept in case other code relies on it.
        result_collection = Result(myDatabase.all_docs, include_docs=True)
        # NOTE(review): the selector filters on a top-level "requestTime" while
        # the rows read "request.requestTime" -- confirm both exist in the documents.
        query = cld.query.Query(myDatabase, sort =[{"_id": "asc"}],
            selector= {"$and": [ {"requestTime": {"$gte": from_datetime,"$lte": to_datetime}},{"language": language}]},
            fields= ["language","conversationId","request.workspace_id","request.requestTime"
                    ,"response.intents", "response.input.text","response.output.text","response.output.nodes_visited"]
            )
        for doc in query()['docs']:   # for doc in query(limit=100, skip=100)['docs']:
            try:
                rows.append(_doc_to_row(doc))
            except Exception:
                # Best effort: a malformed document is dropped as a whole, but
                # it is counted so the loss is visible instead of silent.
                skipped_docs += 1

        if skipped_docs:
            print("Skipped {} documents that could not be flattened.".format(skipped_docs))

        log_df = pd.DataFrame(rows,columns=['Language','workspace_id','conversation_id','request TS','User Input','Output','nodes_visit','Intent','Confidence'])
        # Drop the rows whose user input is exactly 'start'.
        log_df = log_df[log_df['User Input'] != 'start']
        log_df = convertToDateTime(log_df , ['request TS'])
        log_df = convertUtcToLocal(log_df , ['request TS'])
        log_df = log_df.sort_values(['conversation_id', 'request TS'], ascending=[False, False])
        print ("The dataset has {} samples with {} features each.".format(*log_df.shape))
    else:
        print("Database {} does not exist; nothing was extracted.".format(cloudant_db))
finally:
    # Always release the session, even when the query or the post-processing fails.
    client.disconnect()

The dataset has 117663 samples with 9 features each.


#### &#x1F534; Filter out unwanted data

In [7]:
# Substrings that mark an input as unwanted. They are joined with '|' and
# matched as a regular expression, so any new entry containing regex
# metacharacters must be escaped first.
searchfor = ['!!', 'select top']
# Workspace ids whose intents are labelled as chit-chat below.
chitchat_ws =['f8877088-a64d-479d-938e-4d22f1c19217','418dbe94-9735-4272-a18d-237bc843c570']

# Keep rows with a non-empty user input (missing inputs are dropped as well).
# .copy() makes the filtered frame independent, so the column assignment that
# follows does not write to a slice (SettingWithCopyWarning).
log_df = log_df[log_df['User Input'].str.len() > 0].copy()  # & (log_df['Output'].str.len() >0)

# Lower-cased, space-split, word-by-word stemmed form of the input; used only
# as a key for de-duplication.
log_df['User input stem'] = (
    log_df['User Input']
    .str.lower()
    .apply(lambda sentence: " ".join([stem(word) for word in sentence.split(" ")]))
    .str.strip()
)
# In case a loop was created in the conversation: naive approach to break it.
log_df = log_df.drop_duplicates(subset=['User input stem','Output','Confidence','Intent'], keep='first')
log_df = log_df[~log_df['User Input'].str.contains('|'.join(searchfor), na=False)].copy()

# Label every row; the first matching rule wins.
has_intent = log_df['Intent'].str.len() > 0
in_chitchat_ws = log_df['workspace_id'].isin(chitchat_ws)
log_df['relevance'] = np.where(log_df['nodes_visit'].isin(['Anything else']), 'Not Trained Input',
                               np.where(has_intent & ~in_chitchat_ws, 'Trained Input',
                                        np.where(has_intent & in_chitchat_ws, 'ChitChat Input', 'Confirmation Input')))

log_df['request TS'] = pd.to_datetime(log_df['request TS'])
log_df = log_df.sort_values(by=['conversation_id','request TS'], ascending=[True,True])
print ("The dataset has {} samples with {} features each.".format(*log_df.shape))

The dataset has 11536 samples with 11 features each.


#### &#x1F534; Calculate conversation duration

In [8]:
aggr_func = {'request TS': ['min', 'max']}

# Remove the columns added by a previous run of this cell, so re-executing it
# does not produce start_datetime_x / start_datetime_y style duplicates.
log_df = log_df.drop(columns=['start_datetime', 'end_datetime', 'conversation_duration'], errors='ignore')

# First and last request timestamp per conversation.
duration_df = log_df.groupby('conversation_id').agg(aggr_func).reset_index()
duration_df.columns = ["conversation_id", "start_datetime", "end_datetime"]

# Duration in seconds as a float. total_seconds() gives the same result on
# every pandas version, whereas astype('timedelta64[s]') returns floats on old
# releases and a timedelta dtype on pandas 2.x.
duration_df['conversation_duration'] = (duration_df['end_datetime'] - duration_df['start_datetime']).dt.total_seconds()

# The merge key is the conversation's start timestamp, so only the row(s) at
# the start of each conversation receive the duration columns; the other rows
# get NaN/NaT. NOTE(review): presumably intentional -- confirm.
duration_df['request TS'] = duration_df['start_datetime']
log_df = pd.merge(log_df, duration_df, how='left', on=['conversation_id', 'request TS'])
#stats.describe(log_df['conversation_duration'])

#### &#x1F534; Split Timestamp into Date and Time

In [9]:
# Break the request timestamp into separate calendar/time columns.
request_ts = log_df['request TS']

log_df['conversation_date'] = request_ts.dt.date
log_df['conversation_time'] = request_ts.dt.time
log_df['conversation_year'] = request_ts.dt.year
log_df['conversation_month'] = request_ts.dt.month
# Despite its name this column holds the weekday name (e.g. 'Monday').
# weekday_name was removed in pandas 1.0; day_name() is its replacement, and
# the fallback keeps the cell working on releases that predate day_name().
if hasattr(request_ts.dt, 'day_name'):
    log_df['conversation_week'] = request_ts.dt.day_name()
else:
    log_df['conversation_week'] = request_ts.dt.weekday_name
log_df['conversation_day'] = request_ts.dt.day
log_df['conversation_hours'] = request_ts.dt.hour

#### &#x1F534; Store data to IBM Cloud Object Storage (COS)

IBM Cloud Object Storage (COS) provides a flexible storage solution that can be accessed over HTTP using a REST API. In this notebook, we will learn how to access IBM Cloud Object Storage from Python.


https://www.ibm.com/cloud-computing/bluemix/node/4481


In Watson Studio, we use projects to organize resources such as data, notebooks, models, and connections. To interact with these assets easily, we now have project-lib alongside the object storage APIs. Project-lib is a programmatic interface to the data stored in object storage. It allows you to easily access all your project assets, including files, connections, and metadata.

In [10]:
# The code was removed by DSX for sharing.

#### &#x1F534; Upload Data to Object Storage

In [11]:
#project.save_data(data=log_df.to_csv(index=False),file_name=output_file,overwrite=True)
#log_df = pd.read_csv(project.get_file(output_file))
