In [41]:
import pandas as pd
import numpy as np
from ast import literal_eval
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\samyj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [42]:
docs = ['ayurveda', 'cardiologist', 'dentist', 'dermatologist', 'ent', 'gastroentreologist', 'gyanecologist-obstetrecian', 'homoeopath', 'implantologist', 'neurologist', 'ophthalmologist', 'pediatric-dentist', 'urologist']
specialization_mapping = {docs[i]:i for i in range(len(docs))}

In [43]:
ayurveda_data = pd.read_csv("../Data_final/ayurveda_new.csv", converters={'feedbacks': literal_eval})
cardiologist_data = pd.read_csv("../Data_final/cardiologist_new.csv", converters={'feedbacks': literal_eval})
dentist_data = pd.read_csv("../Data_final/dentist_new.csv", converters={'feedbacks': literal_eval})
dermatologist_data = pd.read_csv("../Data_final/dermatologist_new.csv", converters={'feedbacks': literal_eval})
ent_data = pd.read_csv("../Data_final/ent_new.csv", converters={'feedbacks': literal_eval})
gastroentreologist_data = pd.read_csv("../Data_final/gastroenterologist_new.csv", converters={'feedbacks': literal_eval})
gyanecologist_obstetrecian_data = pd.read_csv("../Data_final/gynecologist-obstetrician_new.csv", converters={'feedbacks': literal_eval})
homoeopath_data = pd.read_csv("../Data_final/homoeopath_new.csv", converters={'feedbacks': literal_eval})
implantologist_data = pd.read_csv("../Data_final/implantologist_new.csv", converters={'feedbacks': literal_eval})
neurologist_data = pd.read_csv("../Data_final/neurologist_new.csv", converters={'feedbacks': literal_eval})
ophthalmologist_data = pd.read_csv("../Data_final/ophthalmologist_new.csv", converters={'feedbacks': literal_eval})
pediatric_dentist_data = pd.read_csv("../Data_final/pediatric-dentist_new.csv", converters={'feedbacks': literal_eval})
urologist_data = pd.read_csv("../Data_final/urologist_new.csv", converters={'feedbacks': literal_eval})

domain_to_df = {
    'ayurveda': ayurveda_data,
    'cardiologist': cardiologist_data,
    'dentist': dentist_data,
    'dermatologist': dermatologist_data,
    'ent': ent_data,
    'gastroentreologist': gastroentreologist_data,
    'gyanecologist-obstetrecian': gyanecologist_obstetrecian_data,
    'homoeopath': homoeopath_data,
    'implantologist': implantologist_data,
    'neurologist': neurologist_data,
    'ophthalmologist': ophthalmologist_data,
    'pediatric-dentist': pediatric_dentist_data,
    'urologist': urologist_data
}

domain_to_cleaned_df = {}

In [44]:
def convert_fees(fee_str):
    if(fee_str[0] == '₹'):
        return fee_str[1:]

def extract_exp(sp_cd):
    sub1 = "\r"
    sub2 = "Years"
    idx1 = sp_cd.index(sub1)
    idx2 = sp_cd.index(sub2)
    res = ''
    # getting elements in between
    for idx in range(idx1 + len(sub1) + 1, idx2):
        res = res + sp_cd[idx]
    # print(sp_cd[:idx1])
    # print("hey")
    return int(float(res))

def calculate_num_days(timing_days):
    intervals = timing_days.split(',')
    num_days = 0
    for interval in intervals:
        num_days += interval.count('-') + 1
    return num_days

def calculate_num_hours(timing_session):
    intervals = timing_session.split('\r\n')
    format = '%H:%M %p'
    num_hours = 0
    for interval in intervals:
        l = interval.split(' - ')
        start = datetime.strptime(l[0], format)
        end = datetime.strptime(l[1], format)
        duration = (end - start).total_seconds() / 3600.0
        if(duration < 0):
            duration = duration + 12
        num_hours += duration
    return num_hours

def clean_review_score(review_score):
    elements = review_score.split('% ')
    score = int(elements[0]) / 100
    votes_idx = elements[1].index('votes')
    num_votes = int(elements[1][1:votes_idx])
    return score, num_votes

def clean_feedbacks(feedbacks):
    overall_score_index_doc_list = []
    overall_reviews_score_index_list = []
    for reviews in feedbacks:
        # print(reviews)
        scores_index_list = []
        for review in reviews:
            sid = SentimentIntensityAnalyzer()
            scores_dict = sid.polarity_scores(review)
            scores = [scores_dict['neg'], scores_dict['pos']]
            score_index = scores.index(max(scores))
            scores_index_list.append(score_index)
        max_score_index = max(set(scores_index_list), key = scores_index_list.count)
        overall_reviews_score_index_list.append(scores_index_list)
        overall_score_index_doc_list.append(max_score_index)
    return overall_reviews_score_index_list, overall_score_index_doc_list

def clean_domain_df(df, domain_num):
    
    rows_to_drop = []
    for i in range(len(df)):
        if(df.iloc[i]['feedbacks'] == []):
            rows_to_drop.append(i)
        
    df.drop(rows_to_drop, axis=0, inplace=True)
    
    for column in list(df.columns):
        if(column == 'current_url' or column == 'feedbacks'):
            pass
        else:
            df[column].fillna(df[column].mode()[0], inplace=True)
    
    
    
    
    df['consultation_fee'] = df['consultation_fee'].apply(lambda x: convert_fees(x))
    df['num_days'] = df['timing_days'].apply(lambda x: calculate_num_days(x))
    df['num_hours'] = df['timing_session'].apply(lambda x: calculate_num_hours(x))
    
    scores_list = []
    num_votes_list = []
    for i in range(len(df)):
        doc_score, doc_votes = clean_review_score(df.iloc[i]['review_score'])
        scores_list.append(doc_score)
        num_votes_list.append(doc_votes)
    df['score'] = scores_list
    df['num_votes'] = num_votes_list
    
    exp_list = []
    specialization = df['specialization']
    for sp in specialization:
        exp_val = extract_exp(sp)
        exp_list.append(exp_val)
    df['years_experience'] = exp_list
    df['domain'] = [domain_num for i in range(len(df))]
    
    # print(df['feedbacks'])
    overall_reviews_score_index_list, overall_score_index_doc_list = clean_feedbacks(list(df['feedbacks']))
    df['reviews_score_index'] = overall_reviews_score_index_list
    df['doc_review_score_index'] = overall_score_index_doc_list
    
    print(f"Final Columns : {df.columns}")
    
    df.drop(['specialization', 'timing_days', 'timing_session', 'review_score', 'Unnamed: 0'], axis=1, inplace=True)
    
    return df

In [45]:
domain_num = 1
for domain in docs:
    cleaned_df = clean_domain_df(domain_to_df[domain], domain_num)
    domain_num += 1
    domain_to_cleaned_df[domain] = cleaned_df

Final Columns : Index(['Unnamed: 0', 'name', 'qualification', 'specialization',
       'clinic_address', 'timing_days', 'timing_session', 'review_score',
       'consultation_fee', 'current_url', 'feedbacks', 'num_days', 'num_hours',
       'score', 'num_votes', 'years_experience', 'domain',
       'reviews_score_index', 'doc_review_score_index'],
      dtype='object')
Final Columns : Index(['Unnamed: 0', 'name', 'qualification', 'specialization',
       'clinic_address', 'timing_days', 'timing_session', 'review_score',
       'consultation_fee', 'current_url', 'feedbacks', 'num_days', 'num_hours',
       'score', 'num_votes', 'years_experience', 'domain',
       'reviews_score_index', 'doc_review_score_index'],
      dtype='object')
Final Columns : Index(['Unnamed: 0', 'name', 'qualification', 'specialization',
       'clinic_address', 'timing_days', 'timing_session', 'review_score',
       'consultation_fee', 'current_url', 'feedbacks', 'num_days', 'num_hours',
       'score', 'num_v

In [46]:
final_data = pd.concat([domain_to_cleaned_df[dom] for dom in docs], axis=0)

In [47]:
final_data.columns

Index(['name', 'qualification', 'clinic_address', 'consultation_fee',
       'current_url', 'feedbacks', 'num_days', 'num_hours', 'score',
       'num_votes', 'years_experience', 'domain', 'reviews_score_index',
       'doc_review_score_index'],
      dtype='object')

In [48]:
domain_dummies = pd.get_dummies(final_data.domain)
final_data = pd.concat([final_data, domain_dummies], axis=1)

In [58]:
for dom in docs:
    domain_to_cleaned_df[dom].to_csv(f"../Data_final/cleaned_data/domain_wise/{dom}.csv", index=False)

In [59]:
final_data.to_csv('../Data_final/cleaned_data/final_data.csv', index=False)