## Importando os dados

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv', index_col=0)

In [3]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1169,7.6,0.500,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5,0
1510,6.4,0.360,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7,0
982,7.3,0.520,0.32,2.1,0.070,51.0,70.0,0.99418,3.34,0.82,12.9,0
1520,6.5,0.530,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3,0
283,8.9,0.400,0.32,5.6,0.087,10.0,47.0,0.99910,3.38,0.77,10.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
997,5.6,0.660,0.00,2.2,0.087,3.0,11.0,0.99378,3.71,0.63,12.8,1
915,8.6,0.315,0.40,2.2,0.079,3.0,6.0,0.99512,3.27,0.67,11.9,0
640,9.9,0.540,0.45,2.3,0.071,16.0,40.0,0.99910,3.39,0.62,9.4,0
206,12.8,0.300,0.74,2.6,0.095,9.0,28.0,0.99940,3.20,0.77,10.8,1


In [4]:
counts = df.quality.value_counts(normalize=True)
counts

0    0.864164
1    0.135836
Name: quality, dtype: float64

In [5]:
X = df.drop(columns='quality')
y = df.loc[:, 'quality'].to_numpy()

X.shape, y.shape

((1119, 11), (1119,))

In [6]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1169,7.6,0.5,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5
1510,6.4,0.36,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7
982,7.3,0.52,0.32,2.1,0.07,51.0,70.0,0.99418,3.34,0.82,12.9
1520,6.5,0.53,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3
283,8.9,0.4,0.32,5.6,0.087,10.0,47.0,0.9991,3.38,0.77,10.5


## Normalização

In [7]:
from sklearn.preprocessing import StandardScaler

colunas = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

X = pd.DataFrame(X, columns = colunas)
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.415365,-0.151631,0.100549,-0.167723,-0.026156,-1.043078,-0.979891,-0.913689,0.054161,-0.214418,1.023866
1,-1.100984,-0.899807,-0.311216,-0.235588,-0.867953,0.965139,0.059516,-0.064509,1.019223,0.659551,-0.69162
2,-0.586769,-0.044749,0.254961,-0.303454,-0.371509,3.355874,0.732074,-1.362313,0.182836,0.950874,2.358133
3,-1.043849,0.008692,-1.083275,-0.371319,-0.522601,1.252027,-0.062767,-0.983119,0.440186,1.009138,-0.119791
4,0.32739,-0.686043,0.254961,2.071836,-0.004572,-0.564931,0.028945,1.265339,0.440186,0.659551,0.070819


## Separando entre conjuntos de Treino, Teste

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
sss.get_n_splits(X, y)

5

In [9]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)

def eval_metrics(actual, predicted):
     return {
         'accuracy': accuracy_score(actual, predicted),
         'precision': precision_score(actual, predicted),
         'recall': recall_score(actual, predicted),
         'f1':f1_score(actual, predicted, average='micro')
     }

In [10]:
X.shape, y.shape

((1119, 11), (1119,))

In [11]:
from sklearn.linear_model import Perceptron
from tqdm import tqdm

perceptron_scores = []
perceptron_metrics = []

iteracao = 0
for train_index, test_index in tqdm(sss.split(X, y)):
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    model = Perceptron(tol=1e-3, random_state=0)
    model.fit(X_train, y_train)
    
    score = model.score(X_test, y_test)
    display(score)
    perceptron_scores.append(score)
    
    # Para mais metricas:
    y_pred = model.predict(X_test)
    metrics = eval_metrics(y_test, y_pred)
    perceptron_metrics.append({
        iteracao: metrics
    })
    
    iteracao+=1

0it [00:00, ?it/s]

0.8303571428571429

0.8303571428571429

0.7901785714285714

0.7901785714285714

0.7901785714285714

5it [00:00, 125.01it/s]


In [12]:
perceptron_metrics

[{0: {'accuracy': 0.8303571428571429,
   'precision': 0.2777777777777778,
   'recall': 0.16666666666666666,
   'f1': 0.8303571428571429}},
 {1: {'accuracy': 0.8303571428571429,
   'precision': 0.39473684210526316,
   'recall': 0.5,
   'f1': 0.8303571428571429}},
 {2: {'accuracy': 0.7901785714285714,
   'precision': 0.16,
   'recall': 0.13333333333333333,
   'f1': 0.7901785714285714}},
 {3: {'accuracy': 0.7901785714285714,
   'precision': 0.24242424242424243,
   'recall': 0.26666666666666666,
   'f1': 0.7901785714285714}},
 {4: {'accuracy': 0.7901785714285714,
   'precision': 0.24242424242424243,
   'recall': 0.26666666666666666,
   'f1': 0.7901785714285714}}]

## Preparando o arquivo de submissão

In [13]:
df_test = pd.read_csv('x_test.csv', index_col=0)

df_test.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
729,6.4,0.865,0.03,3.2,0.071,27.0,58.0,0.995,3.61,0.49,12.7
820,7.0,0.685,0.0,1.9,0.099,9.0,22.0,0.99606,3.34,0.6,9.7
1140,7.3,0.4,0.3,1.7,0.08,33.0,79.0,0.9969,3.41,0.65,9.5
1269,5.5,0.49,0.03,1.8,0.044,28.0,87.0,0.9908,3.5,0.82,14.0
656,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6


In [14]:
colunas = df_test.columns
indices = df_test.index

X = scaler.transform(df_test)
X = pd.DataFrame(X, columns = colunas)
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-1.107633,2.135931,-1.236864,0.545662,-0.350424,1.07124,0.317598,-0.92938,1.976724,-1.043036,2.072254
1,-0.758313,1.000028,-1.390089,-0.496557,0.224701,-0.650236,-0.764546,-0.377086,0.197549,-0.373219,-0.650416
2,-0.583654,-0.798484,0.142159,-0.656899,-0.165562,1.645066,0.948849,0.060581,0.658817,-0.068758,-0.831927
3,-1.631611,-0.230533,-1.236864,-0.576728,-0.905008,1.166878,1.189326,-3.117713,1.251875,0.966413,3.252078
4,1.395821,-0.609167,0.601833,-0.256045,0.368482,-0.745873,-0.463951,0.946335,-2.767743,-0.982143,-0.741171


In [15]:
y = model.predict(X)

In [16]:
solucao = {
    'Id':indices,
    'Predicted': y
}

df_solution = pd.DataFrame(solucao)
df_solution.head(5)

Unnamed: 0,Id,Predicted
0,729,0
1,820,0
2,1140,0
3,1269,1
4,656,0


In [17]:
df_solution.to_csv('submission.csv', index=False)