In [88]:
# Import required modules
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [89]:
# Read the music CSV from disk into a pandas DataFrame
data_path = "../data/music.csv"
music_data = pd.read_csv(filepath_or_buffer=data_path)

In [90]:
# Explore the data file - shape is reported as (rows, columns)
# 18 rows, 3 columns
music_data.shape

(18, 3)

In [91]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Only the numeric columns 'age' and 'gender' appear in the result.
music_data.describe()

Unnamed: 0,age,gender
count,18.0,18.0
mean,27.944444,0.5
std,5.12746,0.514496
min,20.0,0.0
25%,25.0,0.0
50%,28.0,0.5
75%,31.0,1.0
max,37.0,1.0


In [92]:
# By default describe() summarises numeric columns only.
# Passing include="all" also reports the non-numeric 'genre' column
# (count / unique / top / freq).

music_data_all = music_data.describe(include="all")
print(music_data_all)

              age     gender      genre
count   18.000000  18.000000         18
unique        NaN        NaN          5
top           NaN        NaN  Classical
freq          NaN        NaN          6
mean    27.944444   0.500000        NaN
std      5.127460   0.514496        NaN
min     20.000000   0.000000        NaN
25%     25.000000   0.000000        NaN
50%     28.000000   0.500000        NaN
75%     31.000000   1.000000        NaN
max     37.000000   1.000000        NaN


In [93]:
# Convert the DataFrame to a NumPy array.
# to_numpy() is the accessor pandas recommends over the older .values attribute;
# because the columns mix integers and strings, the result has dtype=object.
music_data.to_numpy()

array([[20, 1, 'HipHop'],
       [23, 1, 'HipHop'],
       [25, 1, 'HipHop'],
       [26, 1, 'Jazz'],
       [29, 1, 'Jazz'],
       [30, 1, 'Jazz'],
       [31, 1, 'Classical'],
       [33, 1, 'Classical'],
       [37, 1, 'Classical'],
       [20, 0, 'Dance'],
       [21, 0, 'Dance'],
       [25, 0, 'Dance'],
       [26, 0, 'Acoustic'],
       [27, 0, 'Acoustic'],
       [30, 0, 'Acoustic'],
       [31, 0, 'Classical'],
       [34, 0, 'Classical'],
       [35, 0, 'Classical']], dtype=object)

In [94]:
# Separate the features (X) from the target (y)

# Features: every column except 'genre'
X = music_data.drop("genre", axis="columns")
# Target: the 'genre' column on its own
y = music_data["genre"]

print("Input Data Set: \n", X)
print("Output Data Set or Prediction: \n", y)

Input Data Set: 
     age  gender
0    20       1
1    23       1
2    25       1
3    26       1
4    29       1
5    30       1
6    31       1
7    33       1
8    37       1
9    20       0
10   21       0
11   25       0
12   26       0
13   27       0
14   30       0
15   31       0
16   34       0
17   35       0
Output Data Set or Prediction: 
 0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object


In [95]:
# Build the model: an untrained decision-tree classifier.
# scikit-learn permutes the candidate features randomly at each split, so
# random_state is pinned to make the fitted tree identical on every run.
model = DecisionTreeClassifier(random_state=42)

In [96]:
# Train the model: learn the mapping from the features (X) to the genre (y)
model.fit(X=X, y=y)
# Leave the DataFrame as the final expression so the notebook renders it
music_data


Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [97]:
# Predict the genre for a 21-year-old male and a 22-year-old female
# (gender is encoded as 1 = male, 0 = female).
# The model was fitted on a DataFrame, so the query rows are wrapped in a
# DataFrame with the same column names; passing a bare list of lists makes
# scikit-learn warn that X does not have valid feature names.
predictions = model.predict(
    pd.DataFrame([[21, 1], [22, 0]], columns=["age", "gender"])  # 2D input: one row per sample
)
print(predictions)

['HipHop' 'Dance']




In [224]:
# Split the dataset into training and testing sets
X = music_data.drop(columns=["genre"])
y = music_data["genre"]
# train_test_split returns a 4-tuple; test_size=0.3 holds out 30% of the rows
# for testing. random_state fixes the shuffle so the split (and therefore the
# score below) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Measure model accuracy on rows the model did not see during training
model = DecisionTreeClassifier(random_state=42)  # seeded so the fitted tree is reproducible
model.fit(X_train, y_train)  # train on the training split only
predictions = model.predict(X_test)  # predict genres for the held-out rows

score = accuracy_score(y_test, predictions)  # fraction of predictions matching the actual genre
print("Accuray: ", score)




Accuray:  1.0
