## Importando os dados

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv', index_col=0)

In [3]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1169,7.6,0.500,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5,0
1510,6.4,0.360,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7,0
982,7.3,0.520,0.32,2.1,0.070,51.0,70.0,0.99418,3.34,0.82,12.9,0
1520,6.5,0.530,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3,0
283,8.9,0.400,0.32,5.6,0.087,10.0,47.0,0.99910,3.38,0.77,10.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
997,5.6,0.660,0.00,2.2,0.087,3.0,11.0,0.99378,3.71,0.63,12.8,1
915,8.6,0.315,0.40,2.2,0.079,3.0,6.0,0.99512,3.27,0.67,11.9,0
640,9.9,0.540,0.45,2.3,0.071,16.0,40.0,0.99910,3.39,0.62,9.4,0
206,12.8,0.300,0.74,2.6,0.095,9.0,28.0,0.99940,3.20,0.77,10.8,1


In [4]:
counts = df.quality.value_counts(normalize=True)
counts

0    0.864164
1    0.135836
Name: quality, dtype: float64

In [5]:
X = df.drop(columns='quality')
y = df.loc[:, 'quality'].to_numpy()

X.shape, y.shape

((1119, 11), (1119,))

In [6]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1169,7.6,0.5,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5
1510,6.4,0.36,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7
982,7.3,0.52,0.32,2.1,0.07,51.0,70.0,0.99418,3.34,0.82,12.9
1520,6.5,0.53,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3
283,8.9,0.4,0.32,5.6,0.087,10.0,47.0,0.9991,3.38,0.77,10.5


## Normalização

In [7]:
from sklearn.preprocessing import StandardScaler

colunas = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

X = pd.DataFrame(X, columns = colunas)
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.415365,-0.151631,0.100549,-0.167723,-0.026156,-1.043078,-0.979891,-0.913689,0.054161,-0.214418,1.023866
1,-1.100984,-0.899807,-0.311216,-0.235588,-0.867953,0.965139,0.059516,-0.064509,1.019223,0.659551,-0.69162
2,-0.586769,-0.044749,0.254961,-0.303454,-0.371509,3.355874,0.732074,-1.362313,0.182836,0.950874,2.358133
3,-1.043849,0.008692,-1.083275,-0.371319,-0.522601,1.252027,-0.062767,-0.983119,0.440186,1.009138,-0.119791
4,0.32739,-0.686043,0.254961,2.071836,-0.004572,-0.564931,0.028945,1.265339,0.440186,0.659551,0.070819


## K-Fold Cross Validation

In [8]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X)

10

In [9]:
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from tqdm import tqdm
from sklearn.metrics import f1_score

scores = {
    'perceptron' : [],
    'random_forest' : [],
    'rfo_prob_0.40': [],
    'svm' : [],
    'rfo+svm': [],
    'rfo&svm': []
}

for train_index, test_index in tqdm(kf.split(X, y)):
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    # Perceptron
    perceptron = Perceptron(tol=1e-5)
    perceptron.fit(X_train, y_train)
    perceptron_y = perceptron.predict(X_test)
    p_f1 = f1_score(y_test, perceptron_y, average='macro')
    scores['perceptron'].append(p_f1)
    
    # Random Forest
    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    random_forest_y = random_forest.predict(X_test)
    rfo_f1 = f1_score(y_test, random_forest_y, average='macro')
    scores['random_forest'].append(rfo_f1)
    
    # Random Forest for Prob '1'>0.40
    rfo_probs = random_forest.predict_proba(X_test)
    y_by_prob = [ 1 if y[1]>0.40 else 0 for y in rfo_probs]
    f1_proba = f1_score(y_test, y_by_prob, average='macro')
    scores['rfo_prob_0.40'].append(f1_proba)
    
    # SVM
    svm = SVC(gamma='auto')
    svm.fit(X_train, y_train)
    svm_y = svm.predict(X_test)
    svm_f1 = f1_score(y_test, svm_y, average='macro')
    scores['svm'].append(svm_f1)
    
    # RFO OR SVM
    rfo_or_svm = []
    for i in range(len(X_test)):
        if svm_y[i]==1 or random_forest_y[i]==1:
            rfo_or_svm.append(1)
        else:
            rfo_or_svm.append(0)
    
    rfo_or_svm_f1 = f1_score(y_test, rfo_or_svm, average='macro')
    scores['rfo+svm'].append(rfo_or_svm_f1)
    
    # RFO AND SVM
    rfo_and_svm = []
    for i in range(len(X_test)):
        if svm_y[i]==1 and random_forest_y[i]==1:
            rfo_and_svm.append(1)
        else:
            rfo_and_svm.append(0)
    
    rfo_and_svm_f1 = f1_score(y_test, rfo_and_svm, average='macro')
    scores['rfo&svm'].append(rfo_and_svm_f1)

10it [00:01,  5.01it/s]


In [16]:
rfo_probs

array([[0.96, 0.04],
       [0.99, 0.01],
       [0.78, 0.22],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.97, 0.03],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.92, 0.08],
       [0.96, 0.04],
       [0.97, 0.03],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.79, 0.21],
       [0.9 , 0.1 ],
       [0.88, 0.12],
       [0.56, 0.44],
       [0.96, 0.04],
       [0.99, 0.01],
       [0.88, 0.12],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.62, 0.38],
       [1.  , 0.  ],
       [0.97, 0.03],
       [0.95, 0.05],
       [0.82, 0.18],
       [0.85, 0.15],
       [0.81, 0.19],
       [0.86, 0.14],
       [0.47, 0.53],
       [0.64, 0.36],
       [0.93, 0.07],
       [0.89, 0.11],
       [0.99, 0.01],
       [0.95, 0.05],
       [1.  , 0.  ],
       [0.25, 0.75],
       [0.33, 0.67],
       [0.2 , 0.8 ],
       [0.7 , 0.3 ],
       [0.42, 0.58],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.64, 0.36],
       [1.  , 0.  ],
       [0.72, 0.28],
       [0.99,

In [10]:
scores

{'perceptron': [0.6830786644029428,
  0.599924797894341,
  0.495949594959496,
  0.7142857142857142,
  0.7305841924398626,
  0.5697399527186762,
  0.44554455445544555,
  0.5648795648795649,
  0.6276918629859806,
  0.5222381635581061],
 'random_forest': [0.6836970474967907,
  0.7635467980295567,
  0.900972590627763,
  0.6836970474967907,
  0.7335063811377893,
  0.7983798379837984,
  0.6954156954156954,
  0.8460481099656358,
  0.6623115577889447,
  0.842233502538071],
 'rfo_prob_0.40': [0.7983798379837984,
  0.7523618090452262,
  0.9111534190068222,
  0.72,
  0.72,
  0.7973869346733669,
  0.7029177718832891,
  0.7798941798941799,
  0.6841511562323745,
  0.781847418259782],
 'svm': [0.6261874197689346,
  0.6705882352941176,
  0.7666666666666666,
  0.5225916453537938,
  0.6584564860426929,
  0.6375404530744337,
  0.6850529958901146,
  0.7973869346733669,
  0.617286432160804,
  0.6153465346534653],
 'rfo+svm': [0.7110016420361248,
  0.781959766385464,
  0.9064014708340298,
  0.67058823529411

In [11]:
scores_mean = {}
for k, v in scores.items():
    media = sum(v) / len(v)
    scores_mean[k] = media

In [12]:
scores_mean

{'perceptron': 0.595391706258013,
 'random_forest': 0.7609808568480836,
 'rfo_prob_0.40': 0.7648092526978839,
 'svm': 0.659710380357839,
 'rfo+svm': 0.7673566488197103,
 'rfo&svm': 0.6454324132880142}

## Preparando o arquivo de submissão

In [None]:
!where pip

In [None]:
df_test = pd.read_csv('x_test.csv', index_col=0)

df_test.head(5)

In [None]:
colunas = df_test.columns
indices = df_test.index

X = scaler.transform(df_test)
X = pd.DataFrame(X, columns = colunas)
X.head(5)

In [None]:
y = model.predict(X)

In [None]:
solucao = {
    'Id':indices,
    'Predicted': y
}

df_solution = pd.DataFrame(solucao)
df_solution.head(5)

In [None]:
df_solution.to_csv('submission.csv', index=False)