# Drug Consumption Final Project

This notebook is used to clean all of the data and do some exploratory data analysis. It also compiles each of the models generated to provide a solution to our research question. Finally, all the solutions are compared and a final best solution to the problem is chosen and discussed.

## Step 1: Import and Clean the Data

In [2]:
# Import Libraries
import pandas as pd
import numpy as np

In [3]:
# Import Dataset
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data', sep=",", names=['Age','Gender','Education','Country','Ethnicity','Nscore','Escore','Oscore','Ascore','Cscore','Impulsive','SS','Alcohol','Amphet','Amyl','Benzos','Caff','Cannabis','Choc','Coke','Crack','Ecstasy','Heroin','Ketamine','Legalh','LSD','Meth','Mushrooms','Nicotine','Semer','VSA'])
data.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [4]:
data['Gender'] = np.where((data['Gender'] == 0.48246), 1, 0) # 1 is female, 0 male
data['Age'] = np.where(data['Age'] == -0.95197, '18-24', np.where(data['Age'] == -0.07854, '25-34', np.where(data['Age'] == 0.49788, '35-44', np.where(data['Age'] == 1.09449, '45-54', np.where(data['Age'] == 1.82213, '55-64', np.where(data['Age'] == 2.59171, '65+', np.NAN))))))

edu = np.array([[-2.43591, 'Left school before 16 years'],
                [-1.73790, 'Left school at 16 years'],
                [-1.43719, 'Left school at 17 years'],
                [-1.22751, 'Left school at 18 years'],
                [-0.61113, 'Some college or university, no certificate or degree'],
                [-0.05921, 'Professional certificate/ diploma'],
                [0.45468, 'University degree'],
                [1.16365, 'Masters degree'],
                [1.98437, 'Doctorate degree']])
for i in range(edu.shape[0]):
    data['Education'] = np.where(data['Education'] == edu[i,0], edu[i,1], data['Education'])

country = np.array([[-0.09765, 'Australia'],
                    [0.24923, 'Canada'],
                    [-0.46841, 'New Zealand'],
                    [-0.28519, 'Other'],
                    [0.21128, 'Republic of Ireland'],
                    [0.96082, 'UK'],
                    [-0.57009, 'USA']])
for i in range(country.shape[0]):
    data['Country'] = np.where(data['Country'] == country[i,0], country[i,1], data['Country'])

ethnic = np.array([[-0.50212, 'Asian'],
                    [-1.10702, 'Black'],
                    [1.90725, 'Mixed-Black/Asian'],
                    [0.12600, 'Mixed-White/Asian'],
                    [-0.22166, 'Mixed-White/Black'],
                    [0.11440, 'Other'],
                    [-0.31685, 'White']])
for i in range(ethnic.shape[0]):
    data['Ethnicity'] = np.where(data['Ethnicity'] == ethnic[i,0], ethnic[i,1], data['Ethnicity'])

nscore = np.array([[12, -3.46436], [29, -0.67825], [46, 1.02119],
                    [13, -3.15735], [30, -0.58016], [47, 1.13281],
                    [14, -2.75696], [31, -0.46725], [48, 1.23461],
                    [15, -2.52197], [32, -0.34799], [49, 1.37297],
                    [16, -2.42317], [33, -0.24649], [50, 1.49158],
                    [17, -2.34360], [34, -0.14882], [51, 1.60383],
                    [18, -2.21844], [35, -0.05188], [52, 1.72012],
                    [19, -2.05048], [36, 0.04257], [53, 1.83990],
                    [20, -1.86962], [37, 0.13606], [54, 1.98437],
                    [21, -1.69163], [38, 0.22393], [55, 2.12700],
                    [22, -1.55078], [39, 0.31287], [56, 2.28554],
                    [23, -1.43907], [40, 0.41667], [57, 2.46262],
                    [24, -1.32828], [41, 0.52135], [58, 2.61139],
                    [25, -1.19430], [42, 0.62967], [59, 2.82196],
                    [26, -1.05308], [43, 0.73545], [60, 3.27393],
                    [27, -0.92104], [44, 0.82562],
                    [28, -0.79151], [45, 0.91093]])
for i in range(nscore.shape[0]):
    data['Nscore'] = np.where(data['Nscore'] == float(nscore[i,1]), nscore[i,0], data['Nscore'])

escore = np.array([[16, -3.27393], [31, -1.23177], [45, 0.80523],
                    [18, -3.00537], [32, -1.09207], [46, 0.96248],
                    [19, -2.72827], [33, -0.94779], [47, 1.11406],
                    [20, -2.53830], [34, -0.80615], [48, 1.28610],
                    [21, -2.44904], [35, -0.69509], [49, 1.45421],
                    [22, -2.32338], [36, -0.57545], [50, 1.58487],
                    [23, -2.21069], [37, -0.43999], [51, 1.74091],
                    [24, -2.11437], [38, -0.30033], [52, 1.93886],
                    [25, -2.03972], [39, -0.15487], [53, 2.12700],
                    [26, -1.92173], [40, 0.00332], [54, 2.32338],
                    [27, -1.76250], [41, 0.16767], [55, 2.57309],
                    [28, -1.63340], [42, 0.32197], [56, 2.85950],
                    [29, -1.50796], [43, 0.47617], [58, 3.00537],
                    [30, -1.37639], [44, 0.63779], [59, 3.27393]])
for i in range(escore.shape[0]):
    data['Escore'] = np.where(data['Escore'] == float(escore[i,1]), escore[i,0], data['Escore'])

oscore = np.array([[24, -3.27393], [38, -1.11902], [50, 0.58331],
                    [26, -2.85950], [39, -0.97631], [51, 0.72330],
                    [28, -2.63199], [40, -0.84732], [52, 0.88309],
                    [29, -2.39883], [41, -0.71727], [53, 1.06238],
                    [30, 2.21069], [42, -0.58331], [54, 1.24033],
                    [31, 2.09015], [43, -0.45174], [55, 1.43533],
                    [32, -1.97495], [44, -0.31776], [56, 1.65653],
                    [33, -1.82919], [45, -0.17779], [57, 1.88511],
                    [34, -1.68062], [46, -0.01928], [58, 2.15324],
                    [35, -1.55521], [47, 0.14143], [59, 2.44904],
                    [36, -1.42424], [48, 0.29338], [60, 2.90161],
                    [37, -1.27553], [49, 0.44585]])
for i in range(oscore.shape[0]):
    data['Oscore'] = np.where(data['Oscore'] == float(oscore[i,1]), oscore[i,0], data['Oscore'])

ascore = np.array([[12, -3.46436], [34, -1.34289], [48, 0.76096],
                    [16, -3.15735], [35, -1.21213], [49, 0.94156],
                    [18, -3.00537], [36, -1.07533], [50, 1.11406],
                    [23, -2.90161], [37, -0.91699], [51, 1.2861],
                    [24, -2.78793], [38, -0.76096], [52, 1.45039],
                    [25, -2.70172], [39, -0.60633], [53, 1.61108],
                    [26, -2.53830], [40, -0.45321], [54, 1.81866],
                    [27, -2.35413], [41, -0.30172], [55, 2.03972],
                    [28, -2.21844], [42, -0.15487], [56, 2.23427],
                    [29, -2.07848], [43, -0.01729], [57, 2.46262],
                    [30, -1.92595], [44, 0.13136], [58, 2.75696],
                    [31, -1.77200], [45, 0.28783], [59, 3.15735],
                    [32, -1.62090], [46, 0.43852], [60, 3.46436],
                    [33, -1.47955], [47, 0.59042]])
for i in range(ascore.shape[0]):
    data['Ascore'] = np.where(data['Ascore'] == float(ascore[i,1]), ascore[i,0], data['Ascore'])

cscore = np.array([[17, -3.46436], [32, -1.25773], [46, 0.58489],
                    [19, -3.15735], [33, -1.13788], [47, 0.7583],
                    [20, -2.90161], [34, -1.01450], [48, 0.93949],
                    [21, -2.72827], [35, -0.89891], [49, 1.13407],
                    [22, -2.57309], [36, -0.78155], [50, 1.30612],
                    [23, -2.42317], [37, -0.65253], [51, 1.46191],
                    [24, -2.30408], [38, -0.52745], [52, 1.63088],
                    [25, -2.18109], [39, -0.40581], [53, 1.81175],
                    [26, -2.04506], [40, -0.27607], [54, 2.04506],
                    [27, -1.92173], [41, -0.14277], [55, 2.33337],
                    [28, -1.78169], [42, -0.00665], [56, 2.63199],
                    [29, -1.64101], [43, 0.12331], [57, 3.00537],
                    [30, -1.51840], [44, 0.25953], [59, 3.46436],
                    [31, -1.38502], [45, 0.41594]])
for i in range(cscore.shape[0]):
    data['Cscore'] = np.where(data['Cscore'] == float(cscore[i,1]), cscore[i,0], data['Cscore'])

# impulsiveness, SS are already clean

data.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
1,35-44,1,Professional certificate/ diploma,UK,Mixed-White/Asian,39.0,36.0,42.0,37.0,42.0,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,25-34,0,Doctorate degree,UK,White,29.0,52.0,55.0,48.0,41.0,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,35-44,0,Professional certificate/ diploma,UK,White,31.0,45.0,40.0,32.0,34.0,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,18-24,1,Masters degree,UK,White,34.0,34.0,46.0,47.0,46.0,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,35-44,1,Doctorate degree,UK,White,43.0,28.0,43.0,41.0,50.0,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [5]:
data = data.drop('Semer', 1)

  data = data.drop('Semer', 1)


In [7]:
drug_labels = ['Alcohol','Amphet','Amyl','Benzos','Caff','Choc','Coke','Crack','Ecstasy','Heroin','Ketamine','Legalh','LSD','Meth','Mushrooms','Nicotine','VSA']
for i in data.columns:
    print(data[i].isna().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [8]:
for drug in drug_labels:
    print(data.groupby(drug).size())

Alcohol
CL0     34
CL1     34
CL2     68
CL3    198
CL4    287
CL5    759
CL6    505
dtype: int64
Amphet
CL0    976
CL1    230
CL2    243
CL3    198
CL4     75
CL5     61
CL6    102
dtype: int64
Amyl
CL0    1305
CL1     210
CL2     237
CL3      92
CL4      24
CL5      14
CL6       3
dtype: int64
Benzos
CL0    1000
CL1     116
CL2     234
CL3     236
CL4     120
CL5      84
CL6      95
dtype: int64
Caff
CL0      27
CL1      10
CL2      24
CL3      60
CL4     106
CL5     273
CL6    1385
dtype: int64
Choc
CL0     32
CL1      3
CL2     10
CL3     54
CL4    296
CL5    683
CL6    807
dtype: int64
Coke
CL0    1038
CL1     160
CL2     270
CL3     258
CL4      99
CL5      41
CL6      19
dtype: int64
Crack
CL0    1627
CL1      67
CL2     112
CL3      59
CL4       9
CL5       9
CL6       2
dtype: int64
Ecstasy
CL0    1021
CL1     113
CL2     234
CL3     277
CL4     156
CL5      63
CL6      21
dtype: int64
Heroin
CL0    1605
CL1      68
CL2      94
CL3      65
CL4      24
CL5      16
CL6      13
d

In [9]:
data.groupby('Ethnicity').size()

Ethnicity
-0.50212               26
Black                  33
Mixed-Black/Asian       3
Mixed-White/Asian      20
Mixed-White/Black      20
Other                  63
White                1720
dtype: int64

In [10]:
for drug in drug_labels:
    data[drug] = np.where(data[drug] == 'CL0', 0, 1)
    # not a user = 0, user = 1
data[drug_labels]

Unnamed: 0,Alcohol,Amphet,Amyl,Benzos,Caff,Choc,Coke,Crack,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,VSA
1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0
2,1,1,1,0,1,1,1,0,1,0,1,0,1,1,0,1,0
3,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
4,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0
5,1,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1884,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,1
1885,1,0,0,0,1,1,0,0,1,0,0,1,1,1,1,1,0
1886,1,1,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0
1887,1,0,0,0,1,1,0,0,1,0,0,1,1,0,1,1,0


In [11]:
drugs = pd.DataFrame()
for drug in drug_labels:
    drugs[drug] = data.groupby(drug).size()
drugs.index.name = 'Drug User'
drugs

Unnamed: 0_level_0,Alcohol,Amphet,Amyl,Benzos,Caff,Choc,Coke,Crack,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,VSA
Drug User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,34,976,1305,1000,27,32,1038,1627,1021,1605,1490,1094,1069,1429,982,428,1455
1,1851,909,580,885,1858,1853,847,258,864,280,395,791,816,456,903,1457,430


In [13]:
data.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Crack,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,VSA
1,35-44,1,Professional certificate/ diploma,UK,Mixed-White/Asian,39.0,36.0,42.0,37.0,42.0,...,0,0,0,0,0,0,0,0,1,0
2,25-34,0,Doctorate degree,UK,White,29.0,52.0,55.0,48.0,41.0,...,0,1,0,1,0,1,1,0,1,0
3,35-44,0,Professional certificate/ diploma,UK,White,31.0,45.0,40.0,32.0,34.0,...,0,0,0,0,0,0,0,1,0,0
4,18-24,1,Masters degree,UK,White,34.0,34.0,46.0,47.0,46.0,...,0,0,0,1,0,0,0,0,1,0
5,35-44,1,Doctorate degree,UK,White,43.0,28.0,43.0,41.0,50.0,...,0,1,0,0,1,0,0,1,1,0


In [21]:
datatest = data.copy()
datatest['new'] = np.where((datatest['Heroin'] == 1) | (datatest['Ketamine'] == 1), 1, 0)
datatest

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,VSA,new
1,35-44,1,Professional certificate/ diploma,UK,Mixed-White/Asian,39.0,36.0,42.0,37.0,42.0,...,0,0,0,0,0,0,0,1,0,0
2,25-34,0,Doctorate degree,UK,White,29.0,52.0,55.0,48.0,41.0,...,1,0,1,0,1,1,0,1,0,1
3,35-44,0,Professional certificate/ diploma,UK,White,31.0,45.0,40.0,32.0,34.0,...,0,0,0,0,0,0,1,0,0,0
4,18-24,1,Masters degree,UK,White,34.0,34.0,46.0,47.0,46.0,...,0,0,1,0,0,0,0,1,0,1
5,35-44,1,Doctorate degree,UK,White,43.0,28.0,43.0,41.0,50.0,...,1,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1884,18-24,1,"Some college or university, no certificate or ...",USA,White,25.0,51.0,57.0,48.0,33.0,...,0,0,0,1,1,0,0,0,1,0
1885,18-24,0,"Some college or university, no certificate or ...",USA,White,33.0,51.0,50.0,48.0,30.0,...,1,0,0,1,1,1,1,1,0,0
1886,25-34,1,University degree,USA,White,47.0,30.0,37.0,31.0,31.0,...,1,0,1,0,1,0,1,1,0,1
1887,18-24,1,"Some college or university, no certificate or ...",USA,White,45.0,26.0,48.0,32.0,22.0,...,1,0,0,1,1,0,1,1,0,0


In [23]:
data['Sch1'] = np.where((datatest['Heroin'] == 1) | (datatest['LSD'] == 1) | (datatest['Ecstasy'] == 1) | (datatest['Mushrooms'] == 1), 1, 0)
data['Sch2'] = np.where((datatest['Amphet'] == 1) | (datatest['Coke'] == 1) | (datatest['Crack'] == 1) | (datatest['Meth'] == 1), 1, 0)
data['Sch3'] = np.where((datatest['Ketamine'] == 1), 1, 0)
data['Sch4'] = np.where((datatest['Benzos'] == 1), 1, 0)
data['SchNA'] = np.where((datatest['Alcohol'] == 1) | (datatest['Amyl'] == 1) | (datatest['Caff'] == 1) | (datatest['Choc'] == 1) | (datatest['Legalh'] == 1) | (datatest['Nicotine'] == 1) | (datatest['VSA'] == 1), 1, 0)

In [24]:
data.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,LSD,Meth,Mushrooms,Nicotine,VSA,Sch1,Sch2,Sch3,Sch4,SchNA
1,35-44,1,Professional certificate/ diploma,UK,Mixed-White/Asian,39.0,36.0,42.0,37.0,42.0,...,0,0,0,1,0,0,1,0,1,1
2,25-34,0,Doctorate degree,UK,White,29.0,52.0,55.0,48.0,41.0,...,1,1,0,1,0,1,1,1,0,1
3,35-44,0,Professional certificate/ diploma,UK,White,31.0,45.0,40.0,32.0,34.0,...,0,0,1,0,0,1,0,0,0,1
4,18-24,1,Masters degree,UK,White,34.0,34.0,46.0,47.0,46.0,...,0,0,0,1,0,0,1,1,1,1
5,35-44,1,Doctorate degree,UK,White,43.0,28.0,43.0,41.0,50.0,...,0,0,1,1,0,1,1,0,0,1


In [29]:
data.shape

(1885, 35)

In [27]:
data.groupby('Country').size()

Country
-0.09765                 54
Canada                   87
New Zealand               5
Other                   118
Republic of Ireland      20
UK                     1044
USA                     557
dtype: int64

In [26]:
#data.to_csv("drugs.csv", index=False)

In [14]:
drug_labels = ['Alcohol','Amphet','Amyl','Benzos','Caff','Choc','Coke','Crack','Ecstasy','Heroin','Ketamine','Legalh','LSD','Meth','Mushrooms','Nicotine','VSA']
sched_1 = ['Heroin','LSD','Ecstasy','Mushrooms']
sched_2 = ['Amphet','Coke','Crack','Meth']
sched_3 = ['Ketamine']
sched_4 = ['Benzos']
not_controlled_substance = ['Alcohol','Amyl','Caff','Choc','Legalh','Nicotine','VSA']
target_label = ['Cannabis']

## Step 2: Models

### Tarini

### Priya

### Zach

### Sam

## Step 3: Discussion of Solutions

## Step 4: Best Solution

## Step 5: Conclusion