In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing the ML parts
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Load the CSV data

1. Clean data from duplicates
2. Clean data from null, or replce with values (Mean)


In [41]:
data = pd.read_csv('hobbies.csv')
data.head()

Unnamed: 0,id,gender,name,age,hobby
0,1.0,m,Olivia Brandon,60.0,Reading
1,2.0,m,Willie Adam,44.0,Travel
2,3.0,m,Vincent Arthur,34.0,Travel
3,4.0,f,Janet Roger,22.0,Fasion
4,5.0,f,Joe Stephen,26.0,Fasion


In [42]:
data.describe()

Unnamed: 0,id,age
count,493.0,497.0
mean,250.947262,55.557344
std,145.456617,20.340777
min,1.0,20.0
25%,124.0,38.0
50%,254.0,56.0
75%,377.0,72.0
max,500.0,90.0


In [43]:
data.isnull().sum()

id        7
gender    0
name      3
age       3
hobby     0
dtype: int64

In [44]:
data.age.fillna(data.age.mean(), inplace=True)
data.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.age.fillna(data.age.mean(), inplace=True)


id        7
gender    0
name      3
age       0
hobby     0
dtype: int64

In [45]:
data.head()

Unnamed: 0,id,gender,name,age,hobby
0,1.0,m,Olivia Brandon,60.0,Reading
1,2.0,m,Willie Adam,44.0,Travel
2,3.0,m,Vincent Arthur,34.0,Travel
3,4.0,f,Janet Roger,22.0,Fasion
4,5.0,f,Joe Stephen,26.0,Fasion


In [46]:
X = data.drop(['id', 'name'], axis=1, inplace=False)

X.head()

Unnamed: 0,gender,age,hobby
0,m,60.0,Reading
1,m,44.0,Travel
2,m,34.0,Travel
3,f,22.0,Fasion
4,f,26.0,Fasion


In [47]:
X = X.drop_duplicates()
X.shape

(140, 3)

In [48]:
y = X['hobby']
X.drop(['hobby'], axis=1, inplace=True)
y.head()

0    Reading
1     Travel
2     Travel
3     Fasion
4     Fasion
Name: hobby, dtype: object

In [49]:
X.head()

Unnamed: 0,gender,age
0,m,60.0
1,m,44.0
2,m,34.0
3,f,22.0
4,f,26.0


In [51]:
def gender_to_num(x):
    return 1 if x == 'm' else 0

In [52]:
X.gender = X.gender.apply(gender_to_num)
X.head()

Unnamed: 0,gender,age
0,1,60.0
1,1,44.0
2,1,34.0
3,0,22.0
4,0,26.0


### Model Building


In [55]:
y.shape

(140,)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=23)

In [67]:
model = DecisionTreeClassifier()

model.fit(X_train, y_train)

In [68]:
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

score

1.0

In [69]:
import joblib

joblib.dump(model, 'hobbies_model.pkl')

['hobbies_model.pkl']

In [70]:
export_graphviz(model,
                out_file='hobbies_model.dot',
                feature_names=X.columns,
                class_names=model.classes_,
                filled=True)