## Importando os dados

In [18]:
import pandas as pd

In [19]:
df = pd.read_csv('train.csv', index_col=0)

In [20]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
150,7.3,0.330,0.47,2.1,0.077,5.0,11.0,0.99580,3.33,0.53,10.30,0
1164,9.0,0.785,0.24,1.7,0.078,10.0,21.0,0.99692,3.29,0.67,10.00,0
285,9.9,0.590,0.07,3.4,0.102,32.0,71.0,1.00015,3.31,0.71,9.80,0
1456,6.0,0.540,0.06,1.8,0.050,38.0,89.0,0.99236,3.30,0.50,10.55,0
1389,6.7,0.480,0.02,2.2,0.080,36.0,111.0,0.99524,3.10,0.53,9.70,0
...,...,...,...,...,...,...,...,...,...,...,...,...
174,7.3,0.380,0.21,2.0,0.080,7.0,35.0,0.99610,3.33,0.47,9.50,0
915,8.6,0.315,0.40,2.2,0.079,3.0,6.0,0.99512,3.27,0.67,11.90,0
640,9.9,0.540,0.45,2.3,0.071,16.0,40.0,0.99910,3.39,0.62,9.40,0
661,7.5,0.420,0.31,1.6,0.080,15.0,42.0,0.99780,3.31,0.64,9.00,0


In [21]:
counts = df.quality.value_counts(normalize=True)
counts

0    0.864606
1    0.135394
Name: quality, dtype: float64

In [22]:
X = df.drop(columns='quality')
y = df.loc[:, 'quality'].to_numpy()

X.shape, y.shape

((1359, 11), (1359,))

In [23]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
150,7.3,0.33,0.47,2.1,0.077,5.0,11.0,0.9958,3.33,0.53,10.3
1164,9.0,0.785,0.24,1.7,0.078,10.0,21.0,0.99692,3.29,0.67,10.0
285,9.9,0.59,0.07,3.4,0.102,32.0,71.0,1.00015,3.31,0.71,9.8
1456,6.0,0.54,0.06,1.8,0.05,38.0,89.0,0.99236,3.3,0.5,10.55
1389,6.7,0.48,0.02,2.2,0.08,36.0,111.0,0.99524,3.1,0.53,9.7


## Normalização

In [24]:
from sklearn.preprocessing import StandardScaler

colunas = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

X = pd.DataFrame(X, columns = colunas)
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.596546,-1.092653,1.015845,-0.301099,-0.230762,-1.04957,-1.065063,-0.491006,0.137108,-0.75597,-0.127628
1,0.380624,1.410538,-0.163572,-0.587273,-0.209644,-0.564081,-0.763149,0.105867,-0.123038,0.070454,-0.407881
2,0.897949,0.337742,-1.035315,0.628967,0.297194,1.572068,0.746421,1.827207,0.007035,0.306575,-0.594717
3,-1.343793,0.062666,-1.086594,-0.515729,-0.800955,2.154655,1.289866,-2.32426,-0.058002,-0.933061,0.105917
4,-0.941429,-0.267425,-1.29171,-0.229555,-0.167407,1.960459,1.954077,-0.789443,-1.358733,-0.75597,-0.688135


## Separando entre conjuntos de Treino, Teste

In [25]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
sss.get_n_splits(X, y)

5

In [26]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)

def eval_metrics(actual, predicted):
     return {
         'accuracy': accuracy_score(actual, predicted),
         'precision': precision_score(actual, predicted),
         'recall': recall_score(actual, predicted),
         'f1':f1_score(actual, predicted, average='micro')
     }

In [27]:
X.shape, y.shape

((1359, 11), (1359,))

In [28]:
from sklearn.linear_model import Perceptron
from tqdm import tqdm

perceptron_scores = []
perceptron_metrics = []

iteracao = 0
for train_index, test_index in tqdm(sss.split(X, y)):
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    model = Perceptron(tol=1e-3, random_state=0)
    model.fit(X_train, y_train)
    
    score = model.score(X_test, y_test)
    display(score)
    perceptron_scores.append(score)
    
    # Para mais metricas:
    y_pred = model.predict(X_test)
    metrics = eval_metrics(y_test, y_pred)
    perceptron_metrics.append({
        iteracao: metrics
    })
    
    iteracao+=1

0it [00:00, ?it/s]

0.7867647058823529

0.8639705882352942

0.8529411764705882

0.8382352941176471

0.8602941176470589

5it [00:00, 119.05it/s]


In [29]:
perceptron_metrics

[{0: {'accuracy': 0.7867647058823529,
   'precision': 0.36,
   'recall': 0.7297297297297297,
   'f1': 0.7867647058823529}},
 {1: {'accuracy': 0.8639705882352942,
   'precision': 0.5,
   'recall': 0.10810810810810811,
   'f1': 0.8639705882352942}},
 {2: {'accuracy': 0.8529411764705882,
   'precision': 0.44,
   'recall': 0.2972972972972973,
   'f1': 0.8529411764705882}},
 {3: {'accuracy': 0.8382352941176471,
   'precision': 0.18181818181818182,
   'recall': 0.05405405405405406,
   'f1': 0.8382352941176471}},
 {4: {'accuracy': 0.8602941176470589,
   'precision': 0.47368421052631576,
   'recall': 0.24324324324324326,
   'f1': 0.8602941176470589}}]

## Preparando o arquivo de submissão

In [30]:
df_test = pd.read_csv('x_test.csv', index_col=0)

df_test.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
177,7.5,0.52,0.42,2.3,0.087,8.0,38.0,0.9972,3.58,0.61,10.5
384,7.7,0.51,0.28,2.1,0.087,23.0,54.0,0.998,3.42,0.74,9.2
1324,6.7,0.46,0.24,1.7,0.077,18.0,34.0,0.9948,3.39,0.6,10.6
1076,9.9,0.32,0.56,2.0,0.073,3.0,8.0,0.99534,3.15,0.73,11.4
927,8.4,0.67,0.19,2.2,0.093,11.0,75.0,0.99736,3.2,0.59,9.2


In [31]:
colunas = df_test.columns
indices = df_test.index

X = scaler.fit_transform(df_test)
X = pd.DataFrame(X, columns = colunas)
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.411479,-0.020655,0.798754,-0.231567,0.047366,-0.729542,-0.303226,0.159911,1.63353,-0.286479,0.149389
1,-0.296647,-0.082233,0.073791,-0.367616,0.047366,0.597569,0.204908,0.572751,0.614398,0.479429,-1.109755
2,-0.870803,-0.390127,-0.133341,-0.639713,-0.173799,0.155198,-0.43026,-1.07861,0.423311,-0.345395,0.246246
3,0.966496,-1.252228,1.523716,-0.43564,-0.262265,-1.171913,-1.255977,-0.799943,-1.105386,0.420513,1.021104
4,0.105262,0.903026,-0.392257,-0.299591,0.180066,-0.46412,0.871833,0.242479,-0.786908,-0.404311,-1.109755


In [32]:
y = model.predict(X)

In [33]:
solucao = {
    'Id':indices,
    'Predicted': y
}

df_solution = pd.DataFrame(solucao)
df_solution.head(5)

Unnamed: 0,Id,Predicted
0,177,0
1,384,0
2,1324,0
3,1076,0
4,927,0


In [34]:
df_solution.to_csv('submission.csv', index=False)