# Linear Classifiers in Python

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Using Logistic Regression with default parameters using Wine Dataset

In [2]:
# import wine datasets 
import sklearn.datasets

wine = sklearn.datasets.load_wine()

# seperate data and label
X, y = wine.data, wine.target

In [3]:
# import train_test_split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state = 42)

In [4]:
# import Logistic Regression
from sklearn.linear_model import LogisticRegression

# initialize LogReg
lr = LogisticRegression()

In [5]:
# fit data
lr.fit(X_train, y_train)

# prediction
y_pred = lr.predict(X_test)

# score
lr.score(X_test, y_test)

0.9814814814814815

## Using LinearSVC with default parameters Using Digits Dataset

In [6]:
digits = sklearn.datasets.load_digits()

X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state = 42)

In [7]:
# import LinearSVC
from sklearn.svm import LinearSVC

# initialize LinearSVC
lin_svc = LinearSVC()

In [8]:
# fit data
lin_svc.fit(X_train, y_train)

# prediction
y_pred = lin_svc.predict(X_test)

# score
lin_svc.score(X_test, y_test)

0.9333333333333333

## Breast Cancer Datasets

In [19]:
breast_cancer = sklearn.datasets.load_breast_cancer()

X, y = breast_cancer.data, breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state = 123)

In [20]:
# fit data
lin_svc.fit(X_train, y_train)

# prediction
y_pred = lin_svc.predict(X_test)

# score
lin_svc.score(X_test, y_test)

0.9473684210526315

# Multi Class Logistic regression

In [11]:
# fish datasets
fish = pd.read_csv('data/csv/fish.csv', header=None)
fish.columns = ['Species', 1, 2, 3, 4, 5, 6]

In [12]:
X = fish.drop(columns= 'Species')
y = fish['Species']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state = 42)

In [14]:
lr_ovr = LogisticRegression()
lr_ovr.fit(X_train, y_train)

lr_ovr.coef_.shape

(4, 6)

In [15]:
lr_ovr.intercept_.shape

(4,)

In [16]:
# train accuracy
print(lr_ovr.score(X_train, y_train))

# test accuracy
lr_ovr.score(X_test, y_test)

1.0


0.9615384615384616

## Support Vector Machine

In [17]:
from sklearn.svm import SVC
svm = SVC(kernel='linear')

svm.fit(X_train, y_train)

SVC(kernel='linear')

In [18]:
print(len(X))
print(len(svm.support_))

85
8


## Logistic Regression
- is a linear classifier
- can use with kernels but slow
- meaningful probabilities
- can be extended to multiclass
- All data points affect fit
- linear_model.LogisticRegression
- parameter : C (inverse regularization strength), penalty (type regularization), multi_class (type multi_class)

## Support Vector Machine
- is a linear classifier
- can use kernels and fast
- can be extended to multiclass
- only 'support Vector' affect fit
- svm.LinearSVC or svm.SVC
- parameter : C (inverse regularization strength), kernel (type of kernel), gamma (inverse RBF smoothness)

## SGDClassifier
- scales well to large datasets (very fast)
- linear_model.SGDClassifier
- logreg = SGDClassifier(loss='log'), linsvm = SGDClassifier(loss='hinge')
- parameter = alpha (1/C)