# Benchmark: Naive Logistic regression

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, confusion_matrix

In [3]:
from sklearn.linear_model import ElasticNet, LogisticRegression

In [4]:
red = pd.read_csv("data/red_normal.csv")
white = pd.read_csv("data/white_normal.csv")

In [5]:
red['type'] = 'red'
white['type'] = 'white'

In [6]:
wine = pd.concat([red, white], ignore_index=True).sample(frac=1).reset_index(drop=True)
#wine = white
#wine = red

In [7]:
temp = wine['quality']
wine.drop('quality', axis=1, inplace=True)
wine['quality'] = temp

In [8]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type,quality
0,0.907683,-1.87193,0.970345,-0.898276,-0.827124,1.052956,0.147714,-0.448611,-0.849253,1.013829,0.166109,red,6
1,-1.0314,0.655613,-1.108839,-0.905237,-0.959515,0.849514,0.485654,-0.711281,1.468427,0.351832,0.394706,white,6
2,0.563072,-0.72109,0.662339,0.296646,-0.350928,-0.386538,-0.535598,0.876009,0.187107,0.541872,-0.209243,red,6
3,0.093554,-1.411915,-0.282528,0.6326,0.835623,0.888414,1.097446,1.261364,1.335977,-0.962507,-0.742932,white,6
4,0.563469,0.839495,2.774823,0.376294,0.611505,-2.049052,-0.902645,0.626106,0.011479,0.088964,0.313446,white,5


In [9]:
wine.to_csv("data/wine_normal.csv", index=False)

In [10]:
wine = pd.get_dummies(wine)

In [11]:
temp = wine['quality']
wine.drop('quality', axis=1, inplace=True)
wine['quality'] = temp

In [12]:
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric,sugar,chlorides,free_SD,total_SD,density,pH,sulphates,alcohol,type_red,type_white,quality
0,0.907683,-1.87193,0.970345,-0.898276,-0.827124,1.052956,0.147714,-0.448611,-0.849253,1.013829,0.166109,1,0,6
1,-1.0314,0.655613,-1.108839,-0.905237,-0.959515,0.849514,0.485654,-0.711281,1.468427,0.351832,0.394706,0,1,6
2,0.563072,-0.72109,0.662339,0.296646,-0.350928,-0.386538,-0.535598,0.876009,0.187107,0.541872,-0.209243,1,0,6
3,0.093554,-1.411915,-0.282528,0.6326,0.835623,0.888414,1.097446,1.261364,1.335977,-0.962507,-0.742932,0,1,6
4,0.563469,0.839495,2.774823,0.376294,0.611505,-2.049052,-0.902645,0.626106,0.011479,0.088964,0.313446,0,1,5


In [13]:
X, y = wine.ix[:,:-1], wine['quality']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [15]:
clf = LogisticRegression(penalty='l2', 
                         C=10,
                         n_jobs=-1)

In [16]:
clf.fit(X_train, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
y_pred = clf.predict(X_test)

In [18]:
mean_absolute_error(y_test, y_pred)

0.50461538461538458

In [19]:
# accuracy
sum([yt == yp for yt, yp in zip(y_test, y_pred)])/len(y_test)

0.54538461538461536