In [865]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
from collections import Counter
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [866]:
# Directory that holds the input file and receives every output written below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after it is changed.
path = '/Users/stevenfelix/Dropbox/DataScience/Projects/PsychologyToday/data/'

# The export is a tab-separated text file.
data = pd.read_csv(
    path + 'psychologytoday2017-02-17.txt',
    sep='\t',
)

In [867]:
# Drop columns that are not used further on.
# The original cell began with a bare `data.reset_index`, which only named the
# method without calling it and therefore did nothing; it has been removed.
data = data.drop(
    [
        'Unnamed: 0',      # presumably the row index saved with the file -- TODO confirm
        'sexualitynum',    # few therapists include this
        'specialtiesnum',  # PT limits to 3 specialities, so most everyone lists 3
        'file',
    ],
    axis=1,
)

## Get rid of treatment facilities

In [868]:
# Remove every row whose title is 'treatment facility', then renumber the rows.
# reset_index(drop=True) discards the old index directly, instead of first
# turning it into a temporary 'index' column that has to be deleted afterwards.
data = data[data.title != 'treatment facility'].reset_index(drop=True)

In [869]:
# Substrings that mark a listing name for removal (presumably because they
# indicate a practice, organisation or place rather than a person -- confirm).
# Written as an explicit list: the previous version stripped *all* spaces from
# each term, which turned "New England" into "NewEngland" so that it could
# never match a name.
exclusions = [
    'Group', 'Associates', 'Professionals', 'Boston', 'Center', 'New England',
    'Recovery', 'Massachusetts', 'Treatment', 'Network', 'Newton', 'Brighton',
    'Arlington', 'Watertown', 'Coaching', 'Consulting', 'Cambridge',
    'Somerville', 'Lexington', 'Waltham',
]

# One boolean per row of `data`: True when the name contains any excluded term
# (case-sensitive substring match).
to_remove = [any(term in title for term in exclusions) for title in data.name]

In [870]:
# Invert the removal flags to get a row-wise keep mask.
to_keep = [not flagged for flagged in to_remove]

In [871]:
# Keep only the rows flagged in `to_keep` and index the table by provider name.
# set_index is chained (not in-place) so the filtered frame is never mutated.
data = data[to_keep].set_index('name')

# Clear the index label so no 'name' header is shown above the index.
# `del data.index.name` raises AttributeError on pandas >= 1.0; assigning None
# has the same effect and works on old and new versions alike.
data.index.name = None

data.head()

Unnamed: 0,title,degrees,city,profile,years,school,statelicense,graduated,fee,insurance,specialties,issues,issuesnum,mentalhealth,mentalhealthnum,sexuality,treatmentorientation,treatmentorientationnum,url
Kevin Ketchum,counselor,"ms, ma",Brookline,Are you bright and intelligent but struggle wi...,10.0,Boston Graduate School of Psychoanalysis,,2012.0,60.0,,"Anxiety, Depression, Addiction","ADHD, Bipolar Disorder, Developmental Disorder...",11.0,"Dissociative Disorders, Impulse Control Disord...",6.0,"Bisexual, Gay, Lesbian","Family / Marital, Psychoanalytic, Psychodynamic",3.0,https://therapists.psychologytoday.com/rms/pro...
Kaynaz Mehta,counselor,"ma, lmhc",Reading,Therapy is about understanding you. Your stren...,9.0,Boston College,7463 Massachusetts,2008.0,110.0,Yes,"Women, Parenting, Anxiety","Addiction, Alcohol Abuse, Career Counseling, C...",20.0,"Mood Disorders, Thinking Disorders",2.0,,"Culturally Sensitive, Eclectic, Emotionally Fo...",11.0,https://therapists.psychologytoday.com/rms/pro...
Jan Bergstrom,counselor,lmhc,Arlington,My practice with couples and individuals is no...,20.0,Cambridge College,4548 Massachusetts,1995.0,165.0,,"Relationship Issues, Trauma and PTSD, Parenting","Behavioral Issues, Depression, Divorce, Dual D...",9.0,"Mood Disorders, Thinking Disorders",2.0,"Gay, Lesbian","Co-Dependence Tx, Cognitive Behavioral (CBT), ...",12.0,https://therapists.psychologytoday.com/rms/pro...
Peter Guthrie,clinical social work/therapist,licsw,Belmont,Few of us get through life without hitting a b...,20.0,Simmons College School of Social Work,1026571 Massachusetts,1993.0,135.0,Yes,"College/Graduate Students, Relationship Issues...","Anxiety, Coping Skills, Depression, Life Coach...",7.0,Mood Disorders,1.0,,"Cognitive Behavioral (CBT), Eclectic, Psychody...",5.0,https://therapists.psychologytoday.com/rms/pro...
Felix Treitler,clinical social work/therapist,"licsw, usta, uspta",Arlington,I make HOME VISITS to clients who cannot trave...,20.0,Simmons School of Social Work,1024337 Massachusetts,1992.0,100.0,Yes,"Depression, Substance Abuse, Life Coaching","12-Step Programs, ADHD, Addiction, Alcohol Abu...",50.0,"Elderly Persons Disorders, Impulse Control Dis...",5.0,"Bisexual, Gay, Lesbian","Activities Combined with Therapy, Coaching, Co...",23.0,https://therapists.psychologytoday.com/rms/pro...


## Pickle data

In [889]:
# Save the current `data` table as a pickle file in the data directory.
data.to_pickle(path + 'psychologytoday.pkl')

## Expanding out list variables: issues, treatment orientations, mental health

In [872]:
def series_info(df, varname):
    """Summarise one comma-separated list column of ``df``.

    Every cell of ``df[varname]`` is treated as a string of options separated
    by ", ". Missing values are replaced by the string 'None' first, and that
    replacement is written back into ``df`` (the caller's frame is modified).
    Options are lower-cased before they are counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per provider; its index supplies the provider keys.
    varname : str
        Name of the column to summarise.

    Returns
    -------
    dict
        'prov_lists' : dict mapping each index label of ``df`` to the list of
                       options found in that row.
        'counts'     : Counter giving, for each option, the number of times it
                       occurs across all rows.
    """
    # Assign the filled column back explicitly. The chained form
    # `df[varname].fillna(..., inplace=True)` operates on a temporary Series
    # and no longer updates `df` under pandas Copy-on-Write.
    df[varname] = df[varname].fillna('None')

    list_of_lists = [list_str.lower().split(", ") for list_str in df[varname]]

    counts = Counter()
    for options in list_of_lists:
        counts.update(options)

    return {
        # Use the index of the frame that was passed in, not the notebook-level
        # `data` variable, so the function works on any DataFrame.
        'prov_lists': dict(zip(df.index, list_of_lists)),
        'counts': counts,
    }

In [873]:
def decompose(df, varlist):
    """Run ``series_info`` on each column named in ``varlist``.

    Returns a dict keyed by column name whose values are the corresponding
    ``series_info(df, name)`` results.
    """
    return {column: series_info(df, column) for column in varlist}

In [874]:
# Columns to break out into per-option counts and per-provider option lists.
names = [
    'issues',
    'specialties',
    'treatmentorientation',
    'mentalhealth',
]
varinfo = decompose(data, names)

In [None]:
import json

# Write `varinfo` to JSON. The context manager closes the file even if
# json.dump raises part-way through, which the manual open()/close() pair
# did not guarantee.
with open(path + "profiledict.json", "w") as fd:
    json.dump(varinfo, fd)

"profiledict.json" will contain a dictionary of dictionaries, one per variable: each one holds "counts" (the count for each response option) and "prov_lists" (a dict mapping each provider to a list of their responses).

### Below we will create a dictionary of DataFrames. Each df will consist of a column for each response option, and the rows will correspond to each provider. Values 0/1 refer to whether or not a provider endorsed a response option for that category (e.g., "cbt" for "treatment orientations")

In [877]:
def booldfs(variables, info=None, min_count=10):
    """Build one 0/1 indicator DataFrame per variable.

    For each variable the resulting frame has one row per provider and one
    column per response option whose count is strictly greater than
    ``min_count``. A cell is 1 when that provider listed that option, else 0.
    Options at or below the threshold get no column and are ignored.

    Parameters
    ----------
    variables : iterable of str
        Keys of ``info`` to convert.
    info : dict, optional
        Mapping ``variable -> {'counts': ..., 'prov_lists': ...}`` as produced
        by ``decompose``. Defaults to the notebook-level ``varinfo``, which is
        what the function always used before this parameter existed.
    min_count : int, optional
        An option needs more than this many occurrences to get a column.
        Defaults to 10, the previously hard-coded threshold.

    Returns
    -------
    dict
        Mapping ``variable -> pandas.DataFrame`` of integer 0/1 values.
    """
    if info is None:
        info = varinfo

    frames = {}
    for variable in variables:
        counts = info[variable]['counts']
        prov_lists = info[variable]['prov_lists']

        common = [option for option in counts if counts[option] > min_count]
        common_set = set(common)

        # Start from an all-zero integer frame instead of an empty (object
        # dtype) frame that is filled afterwards.
        df = pd.DataFrame(0, index=list(prov_lists), columns=common)

        for name, items in prov_lists.items():
            # Restrict to existing columns so .loc never creates a new one.
            endorsed = list(common_set.intersection(items))
            if endorsed:
                df.loc[name, endorsed] = 1

        frames[variable] = df
    return frames

In [878]:
# Indicator frames for three of the list variables.
TFdfs = booldfs([
    'issues',
    'specialties',
    'treatmentorientation',
])

In [880]:
# Preview the first five rows of the 'issues' indicator frame.
TFdfs['issues'].head(5)

Unnamed: 0,weight loss,family conflict,chronic pain,life transitions,obsessive-compulsive (ocd),obesity,depression,bipolar disorder,men,coping skills,sexual abuse,grief,women,none,testing and evaluation,dual diagnosis,substance abuse,relationship issues,stress,addiction,emotional disturbance,alcohol abuse,racial identity,teen violence,trauma and ptsd,school issues,medication management,alzheimer,divorce,self esteem,eating disorders,sports performance,anxiety,domestic abuse,chronic relapse,learning disabilities,adhd,domestic violence,parenting,internet addiction,spirituality,child or adolescent,chronic impulsivity,anger management,video game addiction,narcissistic personality,sexual addiction,oppositional defiance,career counseling,suicidal ideation,sleep or insomnia,pregnancy,intellectual disability,sex therapy,self-harming,antisocial personality,drug abuse,chronic illness,peer relationships,infertility,adoption,traumatic brain injury,behavioral issues,asperger,developmental disorders,codependency,borderline personality,infidelity,marital and premarital,life coaching,autism,gambling,transgender
Stanley M Cole,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Greg Kaufman,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,1,1,1,0,1,0,1,1
Linda Sacks,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Susan Stahl,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0
Tal Astrachan,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [881]:
# Convert every indicator frame to a nested dict (column -> {row label -> value}).
TFdicts = dict((key, frame.to_dict()) for key, frame in TFdfs.items())

In [883]:
# Write the nested 0/1 dictionaries to a JSON file in the data directory.
with open(path + "profilefeaturesdict_bool_dict.json", mode="w") as fd:
    json.dump(TFdicts, fd)

"profilefeaturesdict_bool_dict.json" will contain the dictionary form of each of these DataFrames