### dane nieustrukturyzowane

Dane nieustrukturyzowane to dane, które nie są w żaden sposób uporządkowane, takie jak:

- obrazy,
- teksty,
- dźwięki,
- wideo.
  
Niezależnie od typu, wszystko przetwarzamy w tensorach (macierzach wielowymiarowych). To może prowadzić do chęci wykorzystania modeli ML i sieci neuronowych do analizy danych nieustrukturyzowanych.

![](data.png)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", palette="husl")


# Simulated 2-D "picture": a 28 x 28 grid of values drawn uniformly from [0, 1).
picture_2d = np.random.uniform(low=0.0, high=1.0, size=(28, 28))
# Peek at the top-left 5 x 5 corner of the grid.
picture_2d[:5, :5]

In [None]:
plt.imshow(picture_2d, interpolation='nearest')
plt.show()

# pretrenowane modele klasyfikujące

In [None]:
import urllib.request
url = 'https://pytorch.tips/coffee'
fpath = 'coffee.jpg'
# pobierz na dysk
urllib.request.urlretrieve(url, fpath)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image # pillow library

In [None]:
img = Image.open('coffee.jpg')
plt.imshow(img)

In [None]:
# !pip install torchvision==0.15.2

In [None]:
import torch
from torchvision import transforms

In [None]:
from torchvision import models

models.list_models()[:5]

Odrobinę zmienimy własności obrazka 

In [None]:
# Per-channel mean and standard deviation handed to `Normalize`.
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

# Preprocessing steps, applied to the image in the listed order.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=normalize_mean, std=normalize_std),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
img_tensor = transform(img)

Sprawdźmy rozmiary


In [None]:
type(img_tensor), img_tensor.shape

In [None]:
# utworzenie batch size - dodatkowego wymiaru (na inne obrazki)
batch = img_tensor.unsqueeze(0)
batch.shape

Załadujmy gotowy model 

In [None]:
# Load AlexNet with its default pretrained weights. The `weights=` enum
# replaces the deprecated `pretrained=True` flag and matches how the ResNet
# model is loaded further down in this notebook.
alexnet = models.alexnet(weights=models.AlexNet_Weights.DEFAULT)

In [None]:
# alexnet

In [None]:
# Put the network into evaluation mode before predicting.
alexnet.eval()
# This is a forward-only prediction, so skip autograd bookkeeping: it saves
# memory and yields a tensor that is not attached to a computation graph.
with torch.no_grad():
    predict = alexnet(batch)

In [None]:
_, idx = torch.max(predict,1)

In [None]:
print(idx)

In [None]:
url = 'https://pytorch.tips/imagenet-labels'
fpath = 'imagenet_class_labels.txt'
urllib.request.urlretrieve(url, fpath)

In [None]:
# Read one class label per line. The encoding is given explicitly so the
# result does not depend on the platform's default; the file object is
# iterated directly instead of materializing every line with readlines().
with open('imagenet_class_labels.txt', encoding='utf-8') as f:
    classes = [line.strip() for line in f]

In [None]:
classes[0:5]

In [None]:
# Softmax over the class dimension, take the first (only) item of the batch
# and rescale the result to percentages.
class_probabilities = torch.softmax(predict, dim=1)
prob = class_probabilities[0] * 100
# Show the percentages of the first ten classes.
prob[:10]

In [None]:
classes[idx.item()], prob[idx.item()].item()

In [None]:
resnet = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)

In [None]:
# resnet

In [None]:
# Put the network into evaluation mode before predicting.
resnet.eval()
# Forward-only prediction: no gradients are needed, so avoid building an
# autograd graph for the output.
with torch.no_grad():
    out = resnet(batch)

In [None]:
# Highest score along the class dimension and the position where it occurs.
top_scores = torch.max(out, dim=1)
_, index = top_scores.values, top_scores.indices
# Softmax percentages for the first (only) item of the batch.
prob = torch.softmax(out, dim=1)[0] * 100

In [None]:
classes[index.item()], prob[index.item()].item()

### jeszcze obrazki 

In [None]:
# 60000 images, 28x28

# Loading the Fashion-MNIST dataset
from torchvision import datasets, transforms

# Convert each image to a tensor, then normalize its single channel
# as (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Download (if necessary) and load the training and test splits.
trainset = datasets.FashionMNIST('MNIST_data/', download=True, train=True, transform=transform)
testset = datasets.FashionMNIST('MNIST_data/', download=True, train=False, transform=transform)

# Shuffle only the training data. Evaluation gains nothing from shuffling,
# and a fixed order keeps per-batch test metrics reproducible between runs.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)


In [None]:
dataiter = iter(trainloader)
images, labels = next(dataiter)

In [None]:
# Draw 25 random positions inside the current batch (repeats are possible)
# and pull out the corresponding images.
indexes = np.random.randint(0, images.shape[0], size=25)
images_rand = images[indexes]

# Show the selection as a 5 x 5 grid with the axes hidden.
plt.figure(figsize=(5, 5))
for i, image in enumerate(images_rand):
    plt.subplot(5, 5, i + 1)
    # image[0] selects the first entry along the leading dimension of one sample.
    plt.imshow(image[0])
    plt.axis('off')

plt.show()
plt.close('all')

Przykładowy model sieci neuronowej (bez konwolucji) - czy sądzisz, że to dobre rozwiązanie? 

In [None]:
# Define the network architecture
from torch import nn, optim
import torch.nn.functional as F

# Fully connected classifier: 784 inputs -> 128 hidden units (ReLU) ->
# 10 outputs turned into log-probabilities by LogSoftmax.
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    nn.LogSoftmax(dim=1),
)

# Define the loss (applied to the log-probabilities produced by the model)
criterion = nn.NLLLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.002)

# Define the epochs
epochs = 30

# Per-epoch average losses, stored as plain Python floats.
train_losses, test_losses = [], []

for e in range(epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for images, labels in trainloader:
        # Flatten Fashion-MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)

        optimizer.zero_grad()

        # Call the module itself rather than .forward() so that any
        # registered hooks are run as well.
        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass ----
    # Accumulate Python floats (via .item()), not tensors, so the recorded
    # history holds plain numbers.
    test_loss = 0.0
    accuracy = 0.0

    # Set the model to evaluation mode
    model.eval()

    # Turn off gradients for validation, saves memory and computation
    with torch.no_grad():
        for images, labels in testloader:
            images = images.view(images.shape[0], -1)
            log_ps = model(images)
            test_loss += criterion(log_ps, labels).item()

            # Back from log-probabilities to probabilities; the predicted
            # class is the one with the highest probability.
            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            accuracy += equals.float().mean().item()

    model.train()
    train_losses.append(running_loss/len(trainloader))
    test_losses.append(test_loss/len(testloader))

    print("Epoch: {}/{}..".format(e+1, epochs),
          "Training loss: {:.3f}..".format(running_loss/len(trainloader)),
          "Test loss: {:.3f}..".format(test_loss/len(testloader)),
          "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))

In [None]:
plt.plot(train_losses, label = "Training loss")
plt.plot(test_losses, label = "Validation loss")
plt.legend(frameon = False)

![](wykres0.png)

![](wykres1.png)

In [None]:
print("My model: \n\n", model, "\n")
print("The state dict keys: \n\n", model.state_dict().keys())

In [None]:
torch.save(model.state_dict(), 'checkpoint.pth')

A jakie inne sieci i warstwy możemy wykorzystać do analizy danych nieustrukturyzowanych? 

> Znajdź odpowiedź na to pytanie w dokumentacji biblioteki Keras.

## tekst

In [None]:
import pandas as pd

# Load the training texts and discard the column literally named "index".
df_train = pd.read_csv("train.csv").drop(columns="index")
print(df_train.head())
# Number of rows for each integer value found in the "label" column.
label_counts = np.bincount(df_train["label"])
print(label_counts)

In [None]:
# BoW model  - wektoryzator z sklearn
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(lowercase=True, max_features=10_000, stop_words="english")

cv.fit(df_train["text"])

In [None]:
# słownik i nasze zmienne ..
cv.vocabulary_

In [None]:
X_train = cv.transform(df_train["text"])

In [None]:
# Densify the first row of the sparse matrix and flatten it to a 1-D vector.
first_row_dense = X_train[0].toarray()
feat_vec = first_row_dense[0]
print(feat_vec.shape)
# How many entries of the vector hold each count value (0, 1, 2, ...).
np.bincount(feat_vec)

## Obiektowe podejście do modelowania

In [None]:
import pandas as pd
import numpy as np
 
# przykład danych ustrukturyzowanych
df = pd.read_csv("students.csv")
df.head()

In [None]:
len(df), list(df.columns)

In [None]:
X = df.drop(columns=['target'])
y = df['target']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# INSTEAD OF PROCESSING RIGHT AWAY: first prepare the steps - a pipeline.

# Columns routed to the numeric branch of the preprocessing.
numeric_features = [
    'math score',
    'reading score',
    'writing score',
]
# Columns routed to the categorical branch of the preprocessing.
categorical_features = [
    'sex',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

In [None]:
# Numeric branch: impute missing values with the "mean" strategy, then scale.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encoding configured with handle_unknown="ignore".
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Pair each transformer with the list of columns it should receive.
column_transformers = [
    ("num_trans", numeric_transformer, numeric_features),
    ("cat_trans", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full pipeline: preprocessing followed by the model step. The step names
# ("preproc", "model") are the prefixes used for parameter addressing.
pipeline_steps = [
    ("preproc", preprocessor),
    ("model", LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
from sklearn import set_config
set_config(display='diagram')
pipeline

> PAMIĘTAJ - obiekt pipeline to obiekt pythonowy i tak jak obiekt modelu można go zapisać do pickla. 

In [None]:
from sklearn.model_selection import train_test_split
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

pipeline.fit(X_tr, y_tr)

score = pipeline.score(X_test, y_test)
print(score)

In [None]:
import joblib
joblib.dump(pipeline, 'your_pipeline.pkl')

TU ZACZYNA SIĘ MAGIA OBIEKTOWEGO PYTHONA - nie pisz kodu i nie uruchamiaj kodów wiele razy dla różnych parametrów - niech Python zrobi to za Ciebie 

In [None]:
from sklearn.model_selection import GridSearchCV

# Sub-grid used when the "model" step is a random forest.
random_forest_grid = {
    "preproc__num_trans__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [2, 5, 10, 100, 500],
    "model__min_samples_leaf": [1, 0.1],
    "model": [RandomForestClassifier()],
}
# Sub-grid used when the "model" step is a logistic regression.
logistic_regression_grid = {
    "preproc__num_trans__imputer__strategy": ["mean", "median"],
    "model__C": [0.1, 1.0, 10.0, 100.0, 1000],
    "model": [LogisticRegression()],
}
param_grid = [random_forest_grid, logistic_regression_grid]

# Search every combination from both sub-grids with 2-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=2,
    verbose=1,
    n_jobs=-1,
)

grid_search.fit(X_tr, y_tr)

grid_search.best_params_

In [None]:
grid_search.score(X_test, y_test), grid_search.score(X_tr, y_tr)

Teraz drobna modyfikacja - wiemy, że takiej zmiennej nie chcemy do modelu - ma tylko jedną wartość. 
Ale jak zweryfikować jakie to zmienne jeśli masz 3 mln kolumn? 


In [None]:
df['bad_feature'] = 1

In [None]:
X = df.drop(columns=['target'])
y = df['target']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

In [None]:
numeric_features = ['math score', 'reading score', 'writing score', 'bad_feature']
# Exercise: find a way to split numeric and non-numeric columns automatically.

# Rebinding `numeric_features` alone has no effect: the existing `pipeline`
# wraps a ColumnTransformer that still holds the previous column list.
# Rebuild the pipeline so that `bad_feature` really reaches the model.
# The step names are unchanged, so the keys in `param_grid` stay valid, and
# the original `preprocessor` object is deliberately left untouched.
pipeline = Pipeline(steps=[
    ("preproc", ColumnTransformer(transformers=[
        ("num_trans", numeric_transformer, numeric_features),
        ("cat_trans", categorical_transformer, categorical_features),
    ])),
    ("model", LogisticRegression()),
])

In [None]:
grid_search = GridSearchCV(pipeline, param_grid,
cv=2, verbose=1, n_jobs=-1)

grid_search.fit(X_tr, y_tr)

grid_search.best_params_

In [None]:
grid_search.score(X_tr, y_tr), grid_search.score(X_test, y_test)

### NAPISZ WŁASNĄ KLASĘ, KTÓRA ZREALIZUJE TRANSFORMACJĘ ZA CIEBIE

In [None]:
# your own transformator class

from sklearn.base import BaseEstimator, TransformerMixin

class DelOneValueFeature(BaseEstimator, TransformerMixin):
    """Drop columns that contain a single distinct value.

    ``fit`` inspects the columns of the data it is given and remembers those
    with exactly one distinct value; ``transform`` removes the remembered
    columns from the data it receives.

    Attributes
    ----------
    one_value_features : list
        Names of the columns found to hold one distinct value during the
        most recent ``fit`` call (empty before the first fit).
    """

    def __init__(self):
        self.one_value_features = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` hold exactly one distinct value.

        Parameters
        ----------
        X : object exposing ``.columns`` and column access via ``X[name]``
            (e.g. a pandas DataFrame).
        y : ignored; accepted so the transformer can be used in a pipeline.

        Returns
        -------
        self
        """
        # Build the list from scratch on every call. Appending to the list
        # created in __init__ would let repeated fits accumulate duplicate
        # or stale column names from earlier data.
        self.one_value_features = [
            feature
            for feature in X.columns
            if len(X[feature].unique()) == 1
        ]
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the columns recorded by ``fit``.

        ``X`` itself is returned unchanged when no column was recorded.
        """
        if not self.one_value_features:
            return X
        return X.drop(columns=self.one_value_features)

In [None]:
# CREATE A NEW PIPELINE: custom column-dropping step first, then the
# preprocessing, then the classifier.
pipeline2_steps = [
    ("moja_transformacja", DelOneValueFeature()),
    ("preprocesser", preprocessor),
    ("classifier", LogisticRegression()),
]
pipeline2 = Pipeline(steps=pipeline2_steps)

# Fit on the training split and score on the held-out split.
pipeline2.fit(X_tr, y_tr)
score2 = pipeline2.score(X_test, y_test)