In [8]:
import pandas as pd
# Load the video-game sales data set from a CSV file in the working directory.
df = pd.read_csv('vgsales.csv')
df.shape # (rows, columns) tuple; a DataFrame is two-dimensional

(16598, 11)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# A count below the number of rows (as for Year here) points to missing values in that column.
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [13]:
# to_numpy() is the accessor pandas recommends over the older `.values` attribute;
# it returns the same two-dimensional NumPy array of the frame's values.
# Because the columns mix integers, strings and floats, the array has dtype=object.
df.to_numpy()

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

In [14]:
# df.<Tab> lists all methods and attributes of the DataFrame
# with the cursor inside df.describe(), Shift+Tab shows a tooltip with more details about the describe method
# Ctrl+/ toggles the selected lines between code and comment

In [15]:
# gender encoding used in music.csv: 1 is male, 0 is female

In [16]:
# Load the music-preference samples (columns: age, gender, genre) from the working directory.
music_data = pd.read_csv('music.csv')

In [17]:
# Display the frame to inspect its columns: age, gender (1/0) and genre.
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


## Cleaning up the data

In [18]:
# we should prepare our data:
# we should split this data set into two separate data sets:
# one with the first two columns, which we refer to as the input set,
# and the other with the last column, which we refer to as the output set

In [19]:
# Input set: every column except the target 'genre' (i.e. age and gender).
# drop() returns a new DataFrame, so music_data itself is left unchanged.
X = music_data.drop(columns=['genre'])
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [22]:
# Output set: the 'genre' column as a Series; these are the labels the model learns to predict.
y = music_data['genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

## Create/Build a Model Using a Machine Learning Algorithm (i.e., a decision tree)

In [27]:
# ./pip.exe install scikit-learn
# 'DecisionTreeClassifier' class implements the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Create an instance of the 'DecisionTreeClassifier' class with its default settings.
# NOTE(review): no random_state is passed, so the fitted tree may differ between runs - confirm whether that matters here.
model = DecisionTreeClassifier()

In [29]:
# Next we train the model on the full input set X and output set y,
# so it learns the patterns in the data.
model.fit(X, y)

In [31]:
# Finally, ask the model to make a prediction, e.g. "what kind of music does a 21-year-old male like?"
# predict() takes two-dimensional input: one row per sample, one column per feature.
# Here we ask for two predictions at the same time:
# a 21-year-old male and a 22-year-old female.
# The model was fitted on a DataFrame, so the samples are passed as a DataFrame with the
# same column names; a bare nested list makes scikit-learn warn that X lacks feature names.
samples = pd.DataFrame([[21, 1], [22, 0]], columns=X.columns)
# Store the results in the variable 'predictions'.
predictions = model.predict(samples)
predictions



array(['HipHop', 'Dance'], dtype=object)

In [32]:
# Building a model that makes accurate predictions is not always easy
# after we build a model, we should measure its accuracy
# and if it is NOT accurate enough, we should fine-tune it, or build a model with a different algorithm

## Measure the Accuracy of your Models

In [34]:
# first we should split our data sets into two sets
# one for training and the other for testing
# general rule of thumb, allocate 70% to 80% of our data for training and 30% to 20% for testing
from sklearn.model_selection import train_test_split
# 'train_test_split' is a function that splits a data set into two sets (one for training and one for testing)

In [37]:
# The result is only displayed here, not stored. It is a list of four objects in the order
# [X_train, X_test, y_train, y_test]: two DataFrames followed by two Series.
train_test_split(X, y, test_size=0.2) # we allocate 20% of our data for testing; returns a list, not a tuple

[    age  gender
 5    30       1
 12   26       0
 7    33       1
 13   27       0
 16   34       0
 1    23       1
 15   31       0
 0    20       1
 14   30       0
 6    31       1
 8    37       1
 11   25       0
 3    26       1
 10   21       0,
     age  gender
 4    29       1
 9    20       0
 2    25       1
 17   35       0,
 5          Jazz
 12     Acoustic
 7     Classical
 13     Acoustic
 16    Classical
 1        HipHop
 15    Classical
 0        HipHop
 14     Acoustic
 6     Classical
 8     Classical
 11        Dance
 3          Jazz
 10        Dance
 Name: genre, dtype: object,
 4          Jazz
 9         Dance
 2        HipHop
 17    Classical
 Name: genre, dtype: object]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # we allocate 20% of our data for testing; the returned list is unpacked into four names
# X_train and X_test hold the input sets for training and testing respectively
# y_train and y_test hold the output sets for training and testing respectively
# the rows are shuffled and no random_state is set, so each run produces a different split

In [42]:
# When training our model, we pass only the training sets instead of the entire data set,
# so the held-out test rows stay unseen by the model.
model.fit(X_train, y_train)

In [43]:
# Likewise, when making predictions, we pass only the test inputs (X_test)
# instead of the entire data set.
predictions = model.predict(X_test)

In [44]:
# we get the predictions above; to calculate the accuracy, we compare these predictions with the actual values in the output set for testing, 'y_test'
from sklearn.metrics import accuracy_score

In [46]:
accuracy_score(y_test, predictions) # returns the fraction of correct predictions, between zero and one; 1.0 means 100%

1.0

In [47]:
# Same computation as the previous cell, but kept in 'score' so the value can be reused.
score = accuracy_score(y_test, predictions)
score

1.0

In [55]:
# Say we change the test size to 80%, which leaves only 20% of the data for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Compute the accuracy once and keep it; only the last expression of a cell is displayed.
score = accuracy_score(y_test, predictions)
score

0.3333333333333333

In [56]:
# the accuracy score is low because there is very little data for training the model

In [67]:
# Say we change the test size back to 20%, which leaves 80% of the data for training.
# The split is random, so the score can change from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Compute the accuracy once and keep it; only the last expression of a cell is displayed.
score = accuracy_score(y_test, predictions)
score

0.75

## Persisting Models

In [68]:
# in real life, a data set could have thousands or millions of samples
# training a model on that much data might take seconds, minutes, or hours
# that is why model persistence is important
# once in a while, we build and train a model and then save it to a file
# the next time we want to make predictions, we simply load the model from the file and ask it to make predictions; that model is already trained.

In [69]:
import joblib # joblib object has methods for saving and loading models

In [73]:
# Re-split the data, train on the training portion, then persist the fitted model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)

# dump() writes the model to the given file and returns the list of file names it wrote.
joblib.dump(model, 'music-recommender.joblib')  # music-recommender.joblib is the filename

['music-recommender.joblib']

In [74]:
# Load the saved model back from the file; the result is only displayed here, not stored.
joblib.load('music-recommender.joblib')

In [75]:
# Rebind 'model' to the already-trained model read from the file, so no retraining is needed.
# joblib files are pickle-based: only load files that come from a source you trust.
model = joblib.load('music-recommender.joblib')

In [76]:
# Ask the loaded model about a 21-year-old male. The sample is wrapped in a DataFrame that
# carries the training column names, so scikit-learn does not warn about missing feature names.
sample = pd.DataFrame([[21, 1]], columns=['age', 'gender'])
predictions = model.predict(sample)
predictions



array(['HipHop'], dtype=object)

## Visualizing Decision Trees

In [78]:
# we're going to export our model into a visual format, to see how our model makes predictions

In [79]:
from sklearn import tree

In [82]:
# Retrain on the full data set, then export the fitted tree in Graphviz DOT format
# so we can see how the model makes its predictions.
model = DecisionTreeClassifier()
model.fit(X, y)
tree.export_graphviz(model, out_file='music-recommender.dot',
                     feature_names=list(X.columns),     # taken from the training frame so the node rules always match the fitted columns
                     class_names=list(model.classes_),  # the model's own sorted labels, e.g. class = Classical, class = Acoustic
                     label='all',  # so every node has labels that we can read
                     rounded=True, # so that each node would have rounded corners
                     filled=True) # each box or node is filled with a color