## Importando os dados

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv', index_col=0)

In [3]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1169,7.6,0.500,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5,0
1510,6.4,0.360,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7,0
982,7.3,0.520,0.32,2.1,0.070,51.0,70.0,0.99418,3.34,0.82,12.9,0
1520,6.5,0.530,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3,0
283,8.9,0.400,0.32,5.6,0.087,10.0,47.0,0.99910,3.38,0.77,10.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
997,5.6,0.660,0.00,2.2,0.087,3.0,11.0,0.99378,3.71,0.63,12.8,1
915,8.6,0.315,0.40,2.2,0.079,3.0,6.0,0.99512,3.27,0.67,11.9,0
640,9.9,0.540,0.45,2.3,0.071,16.0,40.0,0.99910,3.39,0.62,9.4,0
206,12.8,0.300,0.74,2.6,0.095,9.0,28.0,0.99940,3.20,0.77,10.8,1


In [4]:
counts = df.quality.value_counts(normalize=True)
counts

0    0.864164
1    0.135836
Name: quality, dtype: float64

In [5]:
X = df.drop(columns='quality')
y = df.loc[:, 'quality'].to_numpy()

X.shape, y.shape

((1119, 11), (1119,))

In [6]:
y_train_2 = y.copy()

In [7]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1169,7.6,0.5,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5
1510,6.4,0.36,0.21,2.2,0.047,26.0,48.0,0.99661,3.47,0.77,9.7
982,7.3,0.52,0.32,2.1,0.07,51.0,70.0,0.99418,3.34,0.82,12.9
1520,6.5,0.53,0.06,2.0,0.063,29.0,44.0,0.99489,3.38,0.83,10.3
283,8.9,0.4,0.32,5.6,0.087,10.0,47.0,0.9991,3.38,0.77,10.5


## K-Fold Cross Validation

In [8]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X)

10

In [9]:
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from sklearn.metrics import f1_score

scores = {
    'perceptron' : [],
    'random_forest' : [],
    'rfo_prob_0.40': [],
    'svm' : [],
    'rfo+svm': [],
    'rfo&svm': []
}

for train_index, test_index in tqdm(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Perceptron
    perceptron = Perceptron(tol=1e-5)
    perceptron.fit(X_train, y_train)
    perceptron_y = perceptron.predict(X_test)
    p_f1 = f1_score(y_test, perceptron_y, average='macro')
    scores['perceptron'].append(p_f1)
    
    # Random Forest
    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    random_forest_y = random_forest.predict(X_test)
    rfo_f1 = f1_score(y_test, random_forest_y, average='macro')
    scores['random_forest'].append(rfo_f1)
    
    # Random Forest for Prob '1'>0.40
    rfo_probs = random_forest.predict_proba(X_test)
    y_by_prob = [ 1 if y[1]>0.40 else 0 for y in rfo_probs]
    f1_proba = f1_score(y_test, y_by_prob, average='macro')
    scores['rfo_prob_0.40'].append(f1_proba)
    
    # SVM
    svm = SVC(gamma='auto')
    svm.fit(X_train, y_train)
    svm_y = svm.predict(X_test)
    svm_f1 = f1_score(y_test, svm_y, average='macro')
    scores['svm'].append(svm_f1)
    
    # RFO OR SVM
    rfo_or_svm = []
    for i in range(len(X_test)):
        if svm_y[i]==1 or random_forest_y[i]==1:
            rfo_or_svm.append(1)
        else:
            rfo_or_svm.append(0)
    
    rfo_or_svm_f1 = f1_score(y_test, rfo_or_svm, average='macro')
    scores['rfo+svm'].append(rfo_or_svm_f1)
    
    # RFO AND SVM
    rfo_and_svm = []
    for i in range(len(X_test)):
        if svm_y[i]==1 and random_forest_y[i]==1:
            rfo_and_svm.append(1)
        else:
            rfo_and_svm.append(0)
    
    rfo_and_svm_f1 = f1_score(y_test, rfo_and_svm, average='macro')
    scores['rfo&svm'].append(rfo_and_svm_f1)

10it [00:02,  3.78it/s]


In [10]:
rfo_probs

array([[0.99, 0.01],
       [0.99, 0.01],
       [0.75, 0.25],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.94, 0.06],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.9 , 0.1 ],
       [0.96, 0.04],
       [0.98, 0.02],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.78, 0.22],
       [0.94, 0.06],
       [0.79, 0.21],
       [0.63, 0.37],
       [0.93, 0.07],
       [0.99, 0.01],
       [0.84, 0.16],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.53, 0.47],
       [1.  , 0.  ],
       [0.95, 0.05],
       [0.93, 0.07],
       [0.87, 0.13],
       [0.89, 0.11],
       [0.72, 0.28],
       [0.86, 0.14],
       [0.55, 0.45],
       [0.75, 0.25],
       [0.91, 0.09],
       [0.93, 0.07],
       [1.  , 0.  ],
       [0.96, 0.04],
       [1.  , 0.  ],
       [0.2 , 0.8 ],
       [0.28, 0.72],
       [0.25, 0.75],
       [0.69, 0.31],
       [0.33, 0.67],
       [0.97, 0.03],
       [0.97, 0.03],
       [0.66, 0.34],
       [0.99, 0.01],
       [0.73, 0.27],
       [0.97,

In [11]:
scores

{'perceptron': [0.6830786644029428,
  0.599924797894341,
  0.495949594959496,
  0.7142857142857142,
  0.7305841924398626,
  0.5839009287925697,
  0.44554455445544555,
  0.5648795648795649,
  0.6276918629859806,
  0.5222381635581061],
 'random_forest': [0.7254901960784313,
  0.7635467980295567,
  0.900972590627763,
  0.6375404530744337,
  0.7335063811377893,
  0.7983798379837984,
  0.7666666666666666,
  0.7690721649484537,
  0.6623115577889447,
  0.782182103610675],
 'rfo_prob_0.40': [0.747974797479748,
  0.7073366834170854,
  0.9306501547987616,
  0.6975697569756976,
  0.6954156954156954,
  0.7824397824397824,
  0.7690721649484535,
  0.8045375218150088,
  0.6633068081343944,
  0.7956553755522828],
 'svm': [0.6261874197689346,
  0.6261874197689346,
  0.7666666666666666,
  0.5225916453537938,
  0.6584564860426929,
  0.6375404530744337,
  0.6850529958901146,
  0.7523618090452262,
  0.617286432160804,
  0.6153465346534653],
 'rfo+svm': [0.7110016420361248,
  0.747974797479748,
  0.88453608

In [12]:
scores_mean = {}
for k, v in scores.items():
    media = sum(v) / len(v)
    scores_mean[k] = media

In [13]:
scores_mean

{'perceptron': 0.5968078038654024,
 'random_forest': 0.7539668749946513,
 'rfo_prob_0.40': 0.7593958740976909,
 'svm': 0.6507677862425066,
 'rfo+svm': 0.7545997514810618,
 'rfo&svm': 0.6451688029791827}

## Preparando o arquivo de submissão

In [14]:
!where pip

C:\Users\samir.junior\Anaconda3\envs\curso_dados\Scripts\pip.exe
C:\Users\samir.junior\Anaconda3\Scripts\pip.exe
C:\Users\samir.junior\AppData\Local\Microsoft\WindowsApps\pip.exe


In [15]:
df_test = pd.read_csv('x_test.csv', index_col=0)

df_test.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
729,6.4,0.865,0.03,3.2,0.071,27.0,58.0,0.995,3.61,0.49,12.7
820,7.0,0.685,0.0,1.9,0.099,9.0,22.0,0.99606,3.34,0.6,9.7
1140,7.3,0.4,0.3,1.7,0.08,33.0,79.0,0.9969,3.41,0.65,9.5
1269,5.5,0.49,0.03,1.8,0.044,28.0,87.0,0.9908,3.5,0.82,14.0
656,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6


In [16]:
colunas = df_test.columns
indices = df_test.index

### Treinando o modelo com todos os dados de train

In [17]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X)
X_test = scaler.transform(df_test)

random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train_2)
rfo_y_submission = random_forest.predict(X_test)

In [18]:
y = rfo_y_submission

In [19]:
solucao = {
    'Id':indices,
    'Predicted': y
}

df_solution = pd.DataFrame(solucao)
df_solution.head(5)

Unnamed: 0,Id,Predicted
0,729,0
1,820,0
2,1140,0
3,1269,0
4,656,0


In [20]:
df_solution.to_csv('submission.csv', index=False)