# My Notebook from following along with [Python Machine Learning Tutorial](https://www.youtube.com/watch?v=7eh4d6sabA0)

## James Small
## CIS-579-002, Introduction to Artificial Intelligence

In [53]:
'''
Load data, split between testing and training, train model, make predictions, and score
'''
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the dataset, then separate the label column from the feature columns.
music_data = pd.read_csv('music.csv')
y = music_data['genre']  # Target: the genre to predict
X = music_data.drop(columns=['genre'])  # Features: every column except the genre

# Hold back 20% of the rows so the model is scored on data it was not fitted on.
# The split is random, so the resulting score can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the fitted estimator, so construction and training can be chained.
model = DecisionTreeClassifier().fit(X_train, y_train)

# To spot-check by hand instead of using the held-out rows:
# predictions = model.predict([[21, 1], [22, 0]])
predictions = model.predict(X_test)

# Fraction of held-out rows whose predicted genre matches the true genre.
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

1.0

In [60]:
'''
Load data, train model, serialize to disk
'''
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# Train on the complete dataset (no train/test split) because this model is
# only being persisted for later use, not scored.
music_data = pd.read_csv('music.csv')
y = music_data['genre']  # Target: the genre to predict
X = music_data.drop(columns=['genre'])  # Features: every column except the genre

# Fit on the plain NumPy array rather than the DataFrame so the model is trained
# without feature names.  Fitting on the DataFrame makes later predictions from
# plain lists emit this warning from sklearn.utils.validation:
# UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
model = DecisionTreeClassifier().fit(X.to_numpy(), y)

# Persist the fitted model; the list of written file names is shown as the cell output.
joblib.dump(model, 'music-recommender.joblib')

['music-recommender.joblib']

In [61]:
'''
Load model from serialized file and use to make predictions
'''
# Restore the classifier from the file written by joblib.dump instead of
# retraining.  Relies on `joblib` having been imported by an earlier cell.
model = joblib.load('music-recommender.joblib')
# Predict the genre for a single sample; presumably the values are
# [age, gender], matching the feature names used in the Graphviz cell.
predictions = model.predict([[21, 1]])
# Bare expression so the notebook displays the predicted label array.
predictions

array(['HipHop'], dtype=object)

In [62]:
'''
Load data, train, visualize model using Graphviz (create dot file)
Graphviz is a supplemental tool which renders the dot file for viewing
'''
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Load the dataset and separate the feature columns from the genre label.
music_data = pd.read_csv('music.csv')
X = music_data.drop(columns=['genre'])  # Input dataset
y = music_data['genre']  # Output dataset

# Train on the full dataset; the goal here is to inspect the tree, not score it.
model = DecisionTreeClassifier()
model.fit(X, y)

# Write the fitted tree to a DOT file that Graphviz can render.
# export_graphviz arguments:
# * out_file      - path of the DOT file to create
# * feature_names - labels shown for the split features; taken from the columns
#                   the model was fitted on so they always match the training data
# * class_names   - unique genres, sorted because export_graphviz expects the
#                   class names in ascending order
# * label         - 'all' puts informative labels on every node
# * rounded       - graphviz boxes use rounded corners
# * filled        - graphviz boxes are filled with color
tree.export_graphviz(model, out_file='music-recommender.dot', feature_names=list(X.columns),
                     class_names=sorted(y.unique()), label='all', rounded=True, filled=True)

# Rendered Graphviz dot file:

![Decision tree rendered from music-recommender.dot](tree.png "Decision Tree Visualization")

# Additional Questions

1) In AI we generally work with huge data sets. What other sources, like Kaggle, do you know for getting access to these collections of sample data sets?
  * In addition to Kaggle, some other good open dataset sources include:
    * [data.gov](https://data.gov)
    * [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu)
    * [Earth Data from NASA](https://www.earthdata.nasa.gov)
    * [Common Crawl web crawl data](https://commoncrawl.org)
    * [ImageNet](https://www.image-net.org)
    * [Hugging Face Datasets](https://huggingface.co/docs/datasets/en/index)

2) After the implementation, do you feel comfortable working with ML tools and technologies? What challenges did you face, and what did you learn from the implementation?
  * I am fluent in Python and comfortable with data analysis tools like numpy, pandas, and matplotlib.  I have never used scikit-learn before.  This is a good start; however, I feel I need to spend more time with scikit-learn to say I'm comfortable approaching machine learning with it.
