# An Introduction to scikit-learn with Quinlan's Dataset
Pierre Nugues

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from sklearn import svm

### Reading the dataset

In [None]:
data_file = 'weather-nominal.csv'
column_names = ['outlook', 'temperature', 'humidity', 'windy', 'play']
# Read the whole file inside a context manager so that the file handle is
# closed as soon as the content has been loaded
with open(data_file) as data_handle:
    f = data_handle.read()
# One string per line of the file; strip() removes the trailing newline so
# that the split does not produce an empty last row
rows = f.strip().split('\n')
rows

### Storing the dataset in a dictionary

The feature matrix needs to be represented as a list of dictionaries, one per observation, where each dictionary maps a column name to its value

In [None]:
def split_rows(rows, column_names):
    """
    Converts comma-separated lines into a list of dictionaries
    Each line is split on the commas and its values are paired, in order,
    with the column names. As the pairing uses zip, surplus values or
    surplus column names are silently dropped
    :param rows: the list of comma-separated lines
    :param column_names: the column names used as dictionary keys
    :return: a new list with one dictionary per line
    """
    records = []
    for line in rows:
        values = line.split(',')
        records.append(dict(zip(column_names, values)))
    return records
# Turn every line read from the file into a dictionary keyed by column name
dataset = split_rows(rows, column_names)
dataset

### Extracting the features 

We extract the features and the classes and we store them in `X_dict` and `y`

In [None]:
import copy

def extract_feats(dataset, class_name):
    """
    Separates the features from the class in a list of observations
    The dataset is deep-copied first so that the caller's dictionaries
    are left untouched
    :param dataset: a list of dictionaries, one per observation
    :param class_name: the key holding the class of an observation
    :return: a pair (features, classes); the features are the copied
        dictionaries without the class key and the classes are the removed
        values, None when an observation has no such key
    """
    features = copy.deepcopy(dataset)
    classes = []
    for observation in features:
        # pop removes the class from the copy and hands back its value
        classes.append(observation.pop(class_name, None))
    return features, classes


# 'play' is the class column; the remaining columns are kept as features
X_dict, y = extract_feats(dataset, 'play')
print(X_dict)
print(y)

### Vectorizing the Dataset

#### The Features

Vectorize the feature matrix and carry out a one-hot encoding

In [None]:
# sparse=False asks for a dense array, which can be displayed directly below.
# NOTE(review): the original comment said sparse "should be true" --
# presumably a sparse matrix is the better choice for larger datasets; confirm.
vec = DictVectorizer(sparse=False)
# vec = DictVectorizer(sparse=True)  # sparse alternative
# Fit the vectorizer on the feature dictionaries and encode them in one step
X = vec.fit_transform(X_dict)
X

#### The class vector

scikit-learn classifiers accept strings as class labels, so we can use `y` as it is, without converting it to numbers

In [None]:
# The class labels extracted above, left as they are
y

### The Classifier

#### Building the model

With a numerical dataset, we can use a linear classifier and fit a model

In [None]:
# Logistic regression is the linear classifier used here
classifier = linear_model.LogisticRegression()
# classifier = svm.SVC()  # alternative: a support vector classifier
# Fit on the vectorized features X and the class labels y.
# NOTE: scikit-learn documents fit as returning the estimator itself, so
# `model` should refer to the same object as `classifier`
model = classifier.fit(X, y)
model

#### Predicting the classes

We have trained a classifier and we predict the classes

In [None]:
# Predict the classes of the same matrix X that was used for training
y_hat = classifier.predict(X)
y_hat

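To predict on a test set (here, for illustration, we simply reuse the training set), we only need to call `transform`, not `fit_transform`, to vectorize it: this reuses the encoding already fitted on the training data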

In [None]:
# Reuse the training dictionaries as a stand-in test set (same object, not a copy)
X_test_dict = X_dict
# transform, not fit_transform: the vectorizer fitted on the training data is reused
X_test = vec.transform(X_test_dict)
y_hat = classifier.predict(X_test)
y_hat