# Aprendendo a usar o Kaggle

Olá, seja bem-vindo ao Workshop de classificadores de ML do Iris Data Science. Nós preparamos este notebook para que você possa aprender do zero até conquistar seus primeiros resultados com classificadores no Kaggle.

In [None]:
import matplotlib.pyplot as matplotlib
import seaborn
import pandas
import numpy
from sklearn.metrics import classification_report
%matplotlib inline

def reshape(list1D):
    """Turn a flat sequence into a single-column 2-D NumPy array.

    Args:
        list1D: Any 1-D sequence that ``numpy.array`` accepts.

    Returns:
        A ``numpy.ndarray`` with one column and as many rows as there are
        elements in ``list1D``.
    """
    as_array = numpy.array(list1D)
    # -1 lets NumPy infer the row count; the column count is fixed at 1.
    return numpy.reshape(as_array, (-1, 1))
    
def plot_ours(model):
    """Plot the predictions of a fitted model over 50 points in [0, 1].

    Note that ``matplotlib`` in this file is the ``matplotlib.pyplot``
    module (see the import alias at the top of the file).

    Args:
        model: Object exposing ``predict``; it is called with a
            single-column array of the sampled x values.
    """
    sample_points = numpy.linspace(0, 1, 50)
    predictions = model.predict(reshape(sample_points))

    matplotlib.figure(figsize=(4, 4))
    matplotlib.plot(sample_points, predictions, color="red")
    matplotlib.suptitle('Our Logistic model')
    matplotlib.xlabel('x')
    matplotlib.ylabel('y')
    
def plot_lr():
    """Plot the logistic (sigmoid) function sampled at 50 points in [-10, 10].

    Opens a new figure with ``figsize=(4, 4)`` and draws the curve in red.
    ``matplotlib`` in this file is the ``matplotlib.pyplot`` module (see the
    import alias at the top of the file). Returns nothing.
    """
    x = numpy.linspace(-10, 10, 50)
    # 1 / (1 + e^-x) is algebraically the same as e^x / (1 + e^x) but only
    # needs one exponential per point.
    y = 1 / (1 + numpy.exp(-x))
    matplotlib.figure(figsize=(4, 4))
    matplotlib.plot(x, y, color="red")
    # Title typo fixed: was 'Logisitc'.
    matplotlib.suptitle('Logistic Regression model')
    matplotlib.xlabel('x')
    matplotlib.ylabel('y')

# Draw the reference logistic curve as this cell's output.
plot_lr()
# Presenter note: introduce what a notebook is (a = 1, ...)

In [None]:
from sklearn.linear_model import LogisticRegression

# Hand-made toy data: 15 values in [0, 1] and their binary labels. Labels are
# 0 for the smaller values and 1 for the larger ones (the switch falls between
# 0.432 and 0.6457457). The two lists are aligned position by position.
x = [0.0, 0.01, 0.1, 0.123, 0.12345, 0.234, 0.432, 0.6535, 0.6457457, 0.7, 0.71, 0.07, 0.8, 0.9, 1]
y = [0  , 0   , 0  , 0    , 0      , 0    , 0    , 1     , 1        , 1  , 1   , 0   , 1  , 1  , 1]

# Fit a logistic regression on the single feature; reshape() turns the flat
# list into the one-column 2-D array passed to fit().
model = LogisticRegression()
model.fit(reshape(x),y)

In [None]:
# Probe values for the fitted toy model; the last one (-1.11242452) lies
# outside the [0, 1] range of the training data.
test = [0.001431256, 0.136882, 0.6345345345, 0.81234791874, -1.11242452]

# Predict a class for each probe value, print the labels, then plot the
# model's predictions across [0, 1].
result = model.predict(reshape(test))
print(result)
plot_ours(model)

**Complicando um pouco:** Um dataset de \[0,1\] com ~10% das respostas erradas!

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import random

def create_dataset(n_samples=10000, noise=0.1):
    """Build a noisy one-feature binary classification dataset.

    Each sample ``x`` is drawn with ``random.random()`` (so it lies in
    [0, 1)). The clean label is ``1`` when ``x > 0.5`` and ``0`` otherwise;
    each label is then flipped with probability of roughly ``noise``.

    Args:
        n_samples: Number of rows to generate. Defaults to 10000, the value
            that used to be hard-coded.
        noise: Approximate fraction of labels that are deliberately wrong.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        A ``pandas.DataFrame`` with columns ``'x'`` (the feature) and
        ``'y'`` (the possibly flipped 0/1 label).
    """
    def classify(value):
        clean_label = int(value > 0.5)
        # Keep the clean label unless this draw lands in the noisy fraction.
        if random.random() > noise:
            return clean_label
        return 1 - clean_label

    samples = [random.random() for _ in range(n_samples)]
    dataset = pandas.DataFrame(samples, columns=['x'])
    dataset['y'] = dataset['x'].apply(classify)
    return dataset
    
# Generate the noisy dataset once; later cells read it through this name.
dataset = create_dataset()
# A bare expression on the last line makes the notebook render the DataFrame.
dataset

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Feature and label columns of the noisy dataset.
x = dataset['x']
y = dataset['y']

# Hold out 20% of the rows for evaluation; the split is random on every run
# because no random_state is passed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = LogisticRegression()
model.fit(reshape(x_train), y_train)

result = model.predict(reshape(x_test))
# scikit-learn metrics take (y_true, y_pred). The previous (y_pred, y_true)
# order left the accuracy unchanged but transposed the confusion matrix.
print(accuracy_score(y_test, result))
print(confusion_matrix(y_test, result))

# Mexendo com dados reais


### MNIST - Digit Recognizer
Dataset com dígitos escritos à mão e seus respectivos valores

#### Estrutura

Cada linha dos datasets (tanto de treino quanto de teste) está estruturada da seguinte forma:


| Digito representado | pixel 1x1 | ... | pixel 28x28 |
|:-----------------:|:---------:|:---:|:----------:|
|5|0|...|0|

Como cada imagem é 28x28, temos 784 valores de pixel por linha, cada um um inteiro de 0 a 255 (intensidade em tons de cinza):

In [None]:
# Load the MNIST train and test splits from the CSV files under ../input.
mnist_train = pandas.read_csv("../input/mnist-in-csv/mnist_train.csv")
mnist_test = pandas.read_csv("../input/mnist-in-csv/mnist_test.csv")
# Show the first rows as the cell output.
mnist_train.head()

Conseguimos ver como é a imagem redimensionando a matriz `1x784` para uma `28x28`:

In [None]:
def plt_digit_from_row(row):
    """Display one MNIST training sample as a 28x28 heat-map image.

    Reads from the module-level ``mnist_train`` DataFrame: the first column
    of the selected row is used as the label, the remaining columns as the
    pixel values.

    Args:
        row: Positional index of the row to display.
    """
    record = mnist_train.values[row]
    label = record[0]
    pixels = record[1:]

    # The flat pixel vector is reshaped to the 28x28 image grid.
    matplotlib.imshow(pixels.reshape(28, 28), cmap='hot')
    matplotlib.title("Label: %s" % label)
    matplotlib.show()

In [None]:
# Show the first training sample.
plt_digit_from_row(0)

In [None]:
# Split each MNIST table into labels (first column) and pixel features
# (all remaining columns) as NumPy arrays.
mnist_train_labels, mnist_train_values = mnist_train.values[:,0], mnist_train.values[:,1:]
mnist_test_labels, mnist_test_values = mnist_test.values[:,0], mnist_test.values[:,1:]

In [None]:
from sklearn.preprocessing import StandardScaler
# Train a logistic-regression classifier on the raw MNIST pixel values.
# NOTE(review): StandardScaler is imported in this cell but never applied;
# presumably feature scaling was intended — confirm. Unscaled pixels may keep
# the solver from converging within its default iteration limit.
model = LogisticRegression()

model.fit(mnist_train_values, mnist_train_labels)

prediction = model.predict(mnist_test_values)

# classification_report takes (y_true, y_pred); the previous (y_pred, y_true)
# order swapped the per-class precision and recall columns.
print(classification_report(mnist_test_labels, prediction))

### Fashion MNIST
Dataset com imagens de tipos de roupa, classificadas com labels

#### Estrutura

Cada linha dos datasets (tanto de treino quanto de teste) está estruturada da seguinte forma:


| Label de cada roupa | pixel 1x1 | ... | pixel 28x28 |
|:-----------------:|:---------:|:---:|:----------:|
|5|0|...|0|

Como cada imagem é 28x28, temos 784 valores de pixel por linha, cada um um inteiro de 0 a 255 (intensidade em tons de cinza):

In [None]:
# Load the Fashion-MNIST train and test splits from the CSV files under
# ../input (both files are read before either name is assigned).
fashion_mnist_train, fashion_mnist_test = pandas.read_csv("../input/fashionmnist/fashion-mnist_train.csv"), pandas.read_csv("../input/fashionmnist/fashion-mnist_test.csv")
# Show the first rows as the cell output.
fashion_mnist_train.head()

In [None]:
def plt_clothes_from_row(row):
    """Display one Fashion-MNIST training sample as a 28x28 grayscale image.

    Reads from the module-level ``fashion_mnist_train`` DataFrame: the first
    column of the selected row is used as the label, the remaining columns
    as the pixel values.

    Args:
        row: Positional index of the row to display.
    """
    record = fashion_mnist_train.values[row]
    label = record[0]
    pixels = record[1:]

    # The flat pixel vector is reshaped to the 28x28 image grid.
    matplotlib.imshow(pixels.reshape(28, 28), cmap='gray')
    matplotlib.title("Label: %s" % label)
    matplotlib.show()

In [None]:
# Show the first training sample.
plt_clothes_from_row(0)

In [None]:
# Split each Fashion-MNIST table into labels (first column) and pixel
# features (all remaining columns) as NumPy arrays.
fashion_mnist_train_labels, fashion_mnist_train_values = fashion_mnist_train.values[:,0], fashion_mnist_train.values[:,1:]
fashion_mnist_test_labels, fashion_mnist_test_values = fashion_mnist_test.values[:,0], fashion_mnist_test.values[:,1:]

In [None]:
# Train a logistic-regression classifier on the raw Fashion-MNIST pixels.
model = LogisticRegression()

model.fit(fashion_mnist_train_values, fashion_mnist_train_labels)

prediction = model.predict(fashion_mnist_test_values)

# Score against the Fashion-MNIST test labels. The report previously used
# mnist_test_labels (the digit dataset), which made it meaningless.
# Arguments are also in scikit-learn's (y_true, y_pred) order now.
print(classification_report(fashion_mnist_test_labels, prediction))