# Machine Learning Tutorial
This notebook follows along with this tutorial: https://www.youtube.com/watch?v=7eh4d6sabA0

In [3]:
import pandas as pd

# Read the video-game sales CSV, then show its (rows, columns) dimensions.
data_frame = pd.read_csv(filepath_or_buffer='vgsales.csv')
data_frame.shape

(16598, 11)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# In the recorded output below, the Year count (16327) is lower than the
# row count (16598), so some rows have no Year value.
data_frame.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [6]:
# Show the frame's contents as a NumPy array. to_numpy() is the accessor that
# pandas recommends over the older .values attribute; the array is the same.
data_frame.to_numpy()

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

Okay, the above was just playing around with Jupyter to figure out what it can do.

Now for the real deal: training a model that predicts a music genre from age and gender.

In [7]:
# Load the music dataset (columns: age, gender, genre) and display it.
music_data = pd.read_csv(filepath_or_buffer='music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [8]:
# Input features: every column except the 'genre' label.
X = music_data.drop('genre', axis='columns')
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [9]:
# Target labels: the 'genre' column, one entry per row.
Y = music_data.loc[:, 'genre']
Y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so that the choice between equally good splits is
# repeatable and the fitted tree is the same on every run.
model = DecisionTreeClassifier(random_state=42)
# Train on the plain array of feature values together with the genre labels.
model.fit(X.values, Y)
# Each query row is [age, gender], matching the column order of X.
predictions = model.predict([[21, 1], [22, 0]])
predictions

array(['HipHop', 'Dance'], dtype=object)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the other 80% are used for training.
# A fixed random_state makes the shuffle, and therefore the reported accuracy,
# repeatable instead of changing every time this cell is executed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# Fraction of held-out rows whose predicted genre equals the true genre.
score = accuracy_score(Y_test, predictions)
score

0.5

In [36]:
import joblib

# Retrain on the complete dataset before saving the model to disk.
# Note: strictly speaking, only the training split should be used here.
model.fit(X.to_numpy(), Y)

joblib.dump(value=model, filename='music-recommender.joblib')

['music-recommender.joblib']

In [37]:
# Restore the classifier from the file written by joblib.dump and confirm it
# still predicts; the query row is [age, gender].
model = joblib.load(filename='music-recommender.joblib')
predictions = model.predict([[21, 1]])
predictions

array(['HipHop'], dtype=object)

In [38]:
from sklearn import tree

# Write the fitted decision tree to a Graphviz .dot file so it can be rendered.
# class_names are the distinct genres in sorted order.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],
    class_names=sorted(Y.unique()),
    label='all',
    filled=True,
    rounded=True,
)