# Linear Classification - Learning Scikit-Learn - Ch 1

In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd

## Save the dataset

In [11]:
iris=datasets.load_iris()

In [12]:
df=pd.DataFrame(iris.data,columns=iris.feature_names)

In [13]:
dico={key:value for key,value in enumerate(iris.target_names)}
df['target']=iris.target
df['target']=df['target'].map(dico)

In [14]:
filename='iris.csv'
df.to_csv(filename, index=False, encoding='utf-8')

## Load the DataSet

In [15]:
filename='iris.csv'
df = pd.read_csv(filename)
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [16]:
X_iris=np.array(df.iloc[:,:4])

In [18]:
dico={value:key for key,value in enumerate(list(set(df.iloc[:,4])))}
y_iris=np.array(df.iloc[:,4].map(dico))

## Train the Model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [20]:
X=X_iris[:,:2]
y=y_iris

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.25,random_state=33)

In [22]:
scaler = preprocessing.StandardScaler()

In [23]:
X_train=scaler.fit_transform(X_train)

In [24]:
X_test=scaler.transform(X_test)

In [25]:
from sklearn.linear_model import SGDClassifier

In [26]:
clf=SGDClassifier()



In [27]:
clf.fit(X_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

##  Evaluating the results

In [28]:
from sklearn import metrics

In [29]:
y_train_predict = clf.predict(X_train)
metrics.accuracy_score(y_train,y_train_predict)

0.8035714285714286

In [30]:
y_test_predict = clf.predict(X_test)
metrics.accuracy_score(y_test,y_test_predict)

0.73684210526315785

In [31]:
clf.score(X_test,y_test)

0.73684210526315785

In [32]:
invert_dico= {v: k for k, v in dico.items()}
print(metrics.classification_report(y_test,y_test_predict,target_names=[invert_dico[i] for i in range(len(invert_dico))]))
 

             precision    recall  f1-score   support

  virginica       0.80      0.63      0.71        19
 versicolor       0.53      0.73      0.62        11
     setosa       1.00      1.00      1.00         8

avg / total       0.76      0.74      0.74        38



In [33]:
print(metrics.confusion_matrix(y_test,y_test_predict))

[[12  7  0]
 [ 3  8  0]
 [ 0  0  8]]


## Validation Croisée

In [35]:
from sklearn.model_selection import KFold,cross_val_score

In [36]:
from sklearn.pipeline import Pipeline

In [37]:
clf=Pipeline([\
             ('scaler', preprocessing.StandardScaler()),\
             ('linear_model',SGDClassifier())])



In [38]:
#cv=KFold(X.shape[0],5,shuffle=True,random_state=33)

In [39]:
scores = cross_val_score(clf,X,y,cv=5)

In [40]:
from scipy.stats import sem

print('Accuracy: {:.2f} (+/- {:.2f}) '.format(scores.mean(), sem(scores) * 2))

Accuracy: 0.74 (+/- 0.05) 


## Verification

In [41]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.25)

In [42]:
scores = cross_val_score(clf,X_train,y_train,cv=5)

In [43]:
print('Accuracy: {:.2f} (+/- {:.2f}) '.format(scores.mean(), sem(scores) * 2))

Accuracy: 0.76 (+/- 0.05) 


In [44]:
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('linear_model', SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [45]:
clf.score(X_test,y_test)

0.71052631578947367

In [46]:
y_test_predict = clf.predict(X_test)
metrics.accuracy_score(y_test,y_test_predict)

0.71052631578947367