UCSanDiegoX: DSE200x Python for Data Science

Rustan, 7/29/2018

# Final Project: Predictors of attitudes towards opioids _(WORKING)_

## Objective: Explore survey responses focusing on the relationship between demographics and attitudes towards opioids.

Method: Conduct k-means cluster analysis to determine whether there are clusters of demographic and experiential variables that predict attitudes towards opioids, including urban or rural geography.

Dataset source: 2018 national survey (proprietary), 2010 US Decennial Census https://factfinder.census.gov/faces/nav/jsf/pages/searchresults.xhtml?refresh=t 

Import libraries: pandas, numpy, matplotlib.pyplot, sklearn.preprocessing, sklearn.cluster, utils, itertools, islice, pandas.tools.plotting

### Create urban/rural designation variable

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read Census urgan designation by zip code file
df = pd.read_csv('DEC_10_SF1_H2_with_ann.csv')
df.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,D001,D002,D003,D004,D005,D006
0,Id,Id2,Geography,Total:,Urban:,Urban: - Inside urbanized areas,Urban: - Inside urban clusters,Rural,Not defined for this file
1,8600000US00601,00601,ZCTA5 00601,7744,4457,0,4457,3287,0
2,8600000US00602,00602,ZCTA5 00602,18073,18073,18073,0,0,0
3,8600000US00603,00603,ZCTA5 00603,25653,25635,25635,0,18,0
4,8600000US00606,00606,ZCTA5 00606,2877,1251,0,1251,1626,0


In [4]:
# Create a new column designating rural '0' or urban '1'
df['urban'] = np.where(df['D002']>=df['D005'], 1, 0)

In [5]:
df

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,D001,D002,D003,D004,D005,D006,urban
0,Id,Id2,Geography,Total:,Urban:,Urban: - Inside urbanized areas,Urban: - Inside urban clusters,Rural,Not defined for this file,1
1,8600000US00601,00601,ZCTA5 00601,7744,4457,0,4457,3287,0,1
2,8600000US00602,00602,ZCTA5 00602,18073,18073,18073,0,0,0,1
3,8600000US00603,00603,ZCTA5 00603,25653,25635,25635,0,18,0,1
4,8600000US00606,00606,ZCTA5 00606,2877,1251,0,1251,1626,0,0
5,8600000US00610,00610,ZCTA5 00610,12618,11134,11134,0,1484,0,0
6,8600000US00612,00612,ZCTA5 00612,30992,29001,29001,0,1991,0,1
7,8600000US00616,00616,ZCTA5 00616,4896,4635,4635,0,261,0,1
8,8600000US00617,00617,ZCTA5 00617,10594,10395,10395,0,199,0,0
9,8600000US00622,00622,ZCTA5 00622,8714,4908,4908,0,3806,0,1


In [6]:
df.head()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,D001,D002,D003,D004,D005,D006,urban
0,Id,Id2,Geography,Total:,Urban:,Urban: - Inside urbanized areas,Urban: - Inside urban clusters,Rural,Not defined for this file,1
1,8600000US00601,00601,ZCTA5 00601,7744,4457,0,4457,3287,0,1
2,8600000US00602,00602,ZCTA5 00602,18073,18073,18073,0,0,0,1
3,8600000US00603,00603,ZCTA5 00603,25653,25635,25635,0,18,0,1
4,8600000US00606,00606,ZCTA5 00606,2877,1251,0,1251,1626,0,0


In [7]:
df.tail()

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,D001,D002,D003,D004,D005,D006,urban
33116,8600000US99923,99923,ZCTA5 99923,90,0,0,0,90,0,0
33117,8600000US99925,99925,ZCTA5 99925,400,0,0,0,400,0,0
33118,8600000US99926,99926,ZCTA5 99926,544,0,0,0,544,0,0
33119,8600000US99927,99927,ZCTA5 99927,101,0,0,0,101,0,0
33120,8600000US99929,99929,ZCTA5 99929,1339,0,0,0,1339,0,0


In [8]:
# Rename GEO.id2 to id2
df.rename(columns = {'GEO.id2':'id2'}, inplace=True)
df.head()

Unnamed: 0,GEO.id,id2,GEO.display-label,D001,D002,D003,D004,D005,D006,urban
0,Id,Id2,Geography,Total:,Urban:,Urban: - Inside urbanized areas,Urban: - Inside urban clusters,Rural,Not defined for this file,1
1,8600000US00601,00601,ZCTA5 00601,7744,4457,0,4457,3287,0,1
2,8600000US00602,00602,ZCTA5 00602,18073,18073,18073,0,0,0,1
3,8600000US00603,00603,ZCTA5 00603,25653,25635,25635,0,18,0,1
4,8600000US00606,00606,ZCTA5 00606,2877,1251,0,1251,1626,0,0


In [9]:
# Select features id2 and urban
features = ['id2', 'urban']

In [10]:
# Save features as new data frame
desig = df[features].copy()

In [11]:
desig.head()

Unnamed: 0,id2,urban
0,Id2,1
1,00601,1
2,00602,1
3,00603,1
4,00606,0


In [12]:
desig.tail()

Unnamed: 0,id2,urban
33116,99923,0
33117,99925,0
33118,99926,0
33119,99927,0
33120,99929,0


In [13]:
desig.shape

(33121, 2)

In [14]:
desig.urban.value_counts()

0    22309
1    10812
Name: urban, dtype: int64

In [15]:
desig.urban.value_counts(normalize=True)

0    0.673561
1    0.326439
Name: urban, dtype: float64

In [16]:
desig.describe()

Unnamed: 0,urban
count,33121.0
mean,0.326439
std,0.468917
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [17]:
# Read Opioids CSV file
df2 = pd.read_csv('OpioidsWKNG.csv')
df2.head()

Unnamed: 0,Vrid,Vdatesub,Vstatus,Vcid,Vcomment,Vlanguage,Vreferer,Vsessionid,Vuseragent,Vip,...,var245rec,var247rec,var248rec,var249rec,var250rec,var251rec,var252rec,var253rec,var254rec,var255rec
0,17,3/20/2018,Complete,,,English,,1521528719_5ab0af8fe318c0.83350522,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,47.40.144.98,...,2,2,2,1,1,2,1,1,1,1
1,18,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/9ed49688-a2...,1521528831_5ab0afff9dad82.74757954,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,24.99.168.150,...,1,1,2,1,2,1,1,2,1,1
2,22,3/20/2018,Complete,,,English,,1521528941_5ab0b06d95d276.07495051,Mozilla/5.0 (Linux; Android 7.0; Moto G (4) Bu...,47.151.21.204,...,1,2,1,1,1,1,1,2,2,2
3,23,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/ed2140dc-95...,1521528964_5ab0b084821f35.43447059,Mozilla/5.0 (X11; CrOS x86_64 8872.73.0) Apple...,98.200.10.6,...,1,2,1,1,2,1,1,1,2,2
4,24,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/0529624f-1a...,1521528989_5ab0b09d6dc659.59506533,Mozilla/5.0 (iPhone; CPU iPhone OS 8_4 like Ma...,174.210.7.12,...,1,2,1,1,2,1,1,2,2,2


In [35]:
df2.Vpostal.value_counts()

         99
10006    18
33132    10
20019     7
80238     6
77450     4
33018     4
33074     4
90009     4
98103     4
20011     4
80209     4
80203     4
75211     4
02740     4
90028     4
80206     4
77063     3
80634     3
94541     3
01420     3
14009     3
60402     3
22314     3
77077     3
80014     3
76028     3
60628     3
08816     3
80218     3
         ..
98208     1
33511     1
76132     1
21044     1
10021     1
11235     1
20016     1
94086     1
80249     1
77099     1
77429     1
60410     1
80123     1
77566     1
40291     1
98422     1
08232     1
23320     1
92880     1
10463     1
30087     1
21223     1
20852     1
77591     1
77007     1
77459     1
77386     1
21228     1
02333     1
95121     1
Name: Vpostal, Length: 830, dtype: int64

In [25]:
df2.Vpostal

0       49048
1       30022
2       92683
3       77036
4       90026
5       60073
6       98501
7       77040
8       10463
9       90013
10      95124
11      76033
12      10016
13      01463
14      75134
15      90004
16      75115
17      33025
18      94127
19      75270
20      92126
21      97403
22      91741
23      92415
24      30039
25      90038
26      60438
27      95132
28      92882
29      98405
        ...  
1170    20003
1171    20170
1172    22310
1173    22554
1174    33837
1175    84604
1176    20004
1177    20874
1178    20110
1179    21223
1180    20171
1181    20175
1182    20191
1183    20003
1184    22310
1185    83001
1186    20871
1187    79845
1188    20176
1189    20170
1190    22060
1191         
1192    11768
1193    30152
1194         
1195    33065
1196         
1197    30032
1198    44313
1199    72712
Name: Vpostal, Length: 1200, dtype: object

In [None]:
# Merge urban/rural designation (desig) into Opioids dataset (df2) using zipcode as the key
