In [13]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch dataset
drug_consumption_quantified = fetch_ucirepo(id=373)

# Load data as Pandas DataFrames
X = drug_consumption_quantified.data.features
y = drug_consumption_quantified.data.targets

In [14]:
# Display variable information
print("\nVariable Information:")
print(drug_consumption_quantified.variables)



Variable Information:
         name     role         type      demographic description units  \
0          id       ID      Integer             None        None  None   
1         age  Feature   Continuous              Age        None  None   
2      gender  Feature   Continuous           Gender        None  None   
3   education  Feature   Continuous  Education Level        None  None   
4     country  Feature   Continuous      Nationality        None  None   
5   ethnicity  Feature   Continuous        Ethnicity        None  None   
6      nscore  Feature   Continuous             None        None  None   
7      escore  Feature   Continuous             None        None  None   
8      oscore  Feature   Continuous             None        None  None   
9      ascore  Feature   Continuous             None        None  None   
10     cscore  Feature   Continuous             None        None  None   
11  impuslive  Feature   Continuous             None        None  None   
12         ss  

In [15]:
# Merge features and targets into a single DataFrame
df = pd.concat([X, y], axis=1)

# Explore the data
print("\nHead of the Dataset:")
print(df.head())


Head of the Dataset:
       age   gender  education  country  ethnicity   nscore   escore   oscore  \
0  0.49788  0.48246   -0.05921  0.96082    0.12600  0.31287 -0.57545 -0.58331   
1 -0.07854 -0.48246    1.98437  0.96082   -0.31685 -0.67825  1.93886  1.43533   
2  0.49788 -0.48246   -0.05921  0.96082   -0.31685 -0.46725  0.80523 -0.84732   
3 -0.95197  0.48246    1.16365  0.96082   -0.31685 -0.14882 -0.80615 -0.01928   
4  0.49788  0.48246    1.98437  0.96082   -0.31685  0.73545 -1.63340 -0.45174   

    ascore   cscore  impuslive       ss alcohol amphet amyl benzos caff  \
0 -0.91699 -0.00665   -0.21712 -1.18084     CL5    CL2  CL0    CL2  CL6   
1  0.76096 -0.14277   -0.71126 -0.21575     CL5    CL2  CL2    CL0  CL6   
2 -1.62090 -1.01450   -1.37983  0.40148     CL6    CL0  CL0    CL0  CL6   
3  0.59042  0.58489   -1.37983 -1.18084     CL4    CL0  CL0    CL3  CL5   
4 -0.30172  1.30612   -0.21712 -0.21575     CL4    CL1  CL1    CL0  CL6   

  cannabis choc coke crack ecstasy heroi

In [16]:
# Show all columns 
pd.set_option('display.max_columns', None)
df.columns


Index(['age', 'gender', 'education', 'country', 'ethnicity', 'nscore',
       'escore', 'oscore', 'ascore', 'cscore', 'impuslive', 'ss', 'alcohol',
       'amphet', 'amyl', 'benzos', 'caff', 'cannabis', 'choc', 'coke', 'crack',
       'ecstasy', 'heroin', 'ketamine', 'legalh', 'lsd', 'meth', 'mushrooms',
       'nicotine', 'semer', 'vsa'],
      dtype='object')

In [17]:
print(df.dtypes)

age          float64
gender       float64
education    float64
country      float64
ethnicity    float64
nscore       float64
escore       float64
oscore       float64
ascore       float64
cscore       float64
impuslive    float64
ss           float64
alcohol       object
amphet        object
amyl          object
benzos        object
caff          object
cannabis      object
choc          object
coke          object
crack         object
ecstasy       object
heroin        object
ketamine      object
legalh        object
lsd           object
meth          object
mushrooms     object
nicotine      object
semer         object
vsa           object
dtype: object


In [18]:
# Statistical summary of features
print("\nStatistical Summary of Features:")
print(X.describe())


Statistical Summary of Features:
              age       gender    education      country    ethnicity  \
count  1885.00000  1885.000000  1885.000000  1885.000000  1885.000000   
mean      0.03461    -0.000256    -0.003806     0.355542    -0.309577   
std       0.87836     0.482588     0.950078     0.700335     0.166226   
min      -0.95197    -0.482460    -2.435910    -0.570090    -1.107020   
25%      -0.95197    -0.482460    -0.611130    -0.570090    -0.316850   
50%      -0.07854    -0.482460    -0.059210     0.960820    -0.316850   
75%       0.49788     0.482460     0.454680     0.960820    -0.316850   
max       2.59171     0.482460     1.984370     0.960820     1.907250   

            nscore       escore       oscore       ascore       cscore  \
count  1885.000000  1885.000000  1885.000000  1885.000000  1885.000000   
mean      0.000047    -0.000163    -0.000534    -0.000245    -0.000386   
std       0.998106     0.997448     0.996229     0.997440     0.997523   
min      -3.

In [19]:
# Binary encoding for drug usage
for col in X.columns[12:]:
    X[col] = X[col].map({'CL0': 0, 'CL1': 1, 'CL2': 1, 'CL3': 1, 'CL4': 1, 'CL5': 1, 'CL6': 1})

# Map demographic variables for interpretability
age_mapping = {-0.95197: "18-24", -0.07854: "25-34", 0.49788: "35-44", 1.09449: "45-54", 1.82213: "55-64", 2.59171: "65+"}
gender_mapping = {0.48246: "Female", -0.48246: "Male"}
education_mapping = {
    -2.43591: "Left before 16", -1.73790: "Left at 16", -1.43719: "Left at 17", -1.22751: "Left at 18",
    -0.61113: "Some college", -0.05921: "Professional cert", 0.45468: "University degree",
    1.16365: "Masters degree", 1.98437: "Doctorate"
}

X['age'] = X['age'].map(age_mapping)
X['gender'] = X['gender'].map(gender_mapping)
X['education'] = X['education'].map(education_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = X['age'].map(age_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['gender'] = X['gender'].map(gender_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['education'] = X['education'].map(education_mapping)


In [20]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar plot for age vs. drug usage
sns.countplot(data=X, x='age', hue='cannabis')  # Replace 'cannabis' with any drug variable
plt.title("Age vs. Cannabis Usage")
plt.show()


ValueError: Could not interpret value `cannabis` for `hue`. An entry with this name does not appear in `data`.

In [None]:
# Correlation heatmap for numeric features
plt.figure(figsize=(12, 8))
sns.heatmap(X.iloc[:, 6:12].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix for Personality Traits")
plt.show()


In [None]:
X.iloc[:, 12:].sum().sort_values().plot(kind='barh', figsize=(8, 8))
plt.title("Frequency of Drug Usage")
plt.xlabel("Number of Users")
plt.ylabel("Drugs")
plt.show()


In [None]:
selected_features = ['nscore', 'escore', 'oscore', 'ascore', 'cscore', 'impulsive', 'ss', 'cannabis']
sns.pairplot(X[selected_features], hue='cannabis', diag_kind='kde')
plt.show()


In [None]:
# Box plot for sensation-seeking scores by drug usage
sns.boxplot(data=X, x='cannabis', y='ss')
plt.title("Sensation-Seeking vs. Cannabis Usage")
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation Matrix
plt.figure(figsize=(12, 8))
corr_matrix = X.corr()
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Pair Plot
# Including only a few key features for clarity
selected_features = ['Age', 'Education', 'Neuroticism', 'Openness']
sns.pairplot(data=pd.concat([X[selected_features], y], axis=1), hue=y.columns[0])
plt.show()

In [None]:
# Distribution of Target Variable
y.value_counts().plot(kind='bar')
plt.title("Distribution of Target Variable")
plt.show()