### Unstructured data

Unstructured data is data that does not follow a predefined schema or tabular layout, such as:

- images,
- texts,
- sounds,
- videos.


Regardless of the type, we convert everything into tensors (multi-dimensional arrays). Once the data is numeric, we can analyze it with ML models and neural networks.

![](data.png)

Let's start with images.

Create a 2-dim picture with random pixels.

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", palette="husl")

In [None]:
# 28x28 "picture" whose pixels are drawn uniformly from [0, 1)
picture_2d = np.random.uniform(low=0.0, high=1.0, size=(28, 28))
# Peek at the top-left 5x5 corner
picture_2d[:5, :5]

In [None]:
# Display the random array as an image (interpolation='nearest' is requested explicitly)
plt.imshow(picture_2d, interpolation='nearest')
plt.show()

## What you can do with pictures - PyTorch

Load pretrained models for image classification.

In [None]:
import urllib.request
url = 'https://pytorch.tips/coffee'
fpath = 'coffee.jpg'

# Download the picture from `url` and store it locally under `fpath`
urllib.request.urlretrieve(url, fpath)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image # pillow library

In [None]:
# Open the downloaded file with Pillow and display it
img = Image.open('coffee.jpg')
plt.imshow(img)

In [None]:
import torch
from torchvision import transforms

We will preprocess the image: resize it, crop the center, convert it to a tensor and normalize it.

In [None]:
# Preprocessing steps applied to the picture before it is passed to a model:
# resize, center-crop to 224, convert to a tensor, normalize per channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics - confirm.
transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
# Run the picture through the preprocessing steps stored in `transform`
img_tensor = transform(img)

Let's consider the shape of our image

In [None]:
# Inspect the Python type and the shape of the preprocessed picture
print(type(img_tensor), img_tensor.shape)

Create a batch dimension - an additional leading dimension, so that several images can be processed at once

In [None]:
# unsqueeze(0) inserts a new dimension at position 0, turning one image into a batch of one
batch = img_tensor.unsqueeze(0)
batch.shape

In [None]:
from torchvision import models

# Show the first five entries of the model list exposed by torchvision
models.list_models()[:5]

Load the AlexNet model

In [None]:
# Please do not run this cell during the lab sessions
# (presumably because it fetches the pretrained weights - confirm).
# The `weights=` argument replaces the deprecated `pretrained=True` flag and
# matches how the ResNet model is loaded further down in this notebook.
alexnet = models.alexnet(weights=models.AlexNet_Weights.DEFAULT)


In [None]:
# alexnet

In [None]:
# Put the network into evaluation mode before predicting
alexnet.eval()
# Gradients are not needed for a prediction, so do not track them
with torch.no_grad():
    predict = alexnet(batch)

Let's write universal code that you can run on both GPU and CPU

In [None]:
# torch.max along dim 1 returns (values, indices); keep the index of the highest score
_, idx = torch.max(predict,1)

In [None]:
# Index of the highest-scoring class
print(idx)

In [None]:
# Download the class-label file and store it locally under `fpath`
url = 'https://pytorch.tips/imagenet-labels'
fpath = 'imagenet_class_labels.txt'
urllib.request.urlretrieve(url, fpath)

In [None]:
# Read the label file: one entry per line, surrounding whitespace removed
classes = []
with open('imagenet_class_labels.txt') as label_file:
    for raw_line in label_file:
        classes.append(raw_line.strip())

classes[0:5]

In [None]:
# Turn the AlexNet output for the single image in the batch into percentages.
# The model output is stored in `predict`; no variable `y` exists at this
# point of the notebook, so using it raised a NameError.
prob = torch.nn.functional.softmax(predict, dim=1)[0] * 100
prob[:10]

In [None]:
# Label of the top class and its softmax score (already multiplied by 100)
classes[idx.item()], prob[idx.item()].item()

Other models

In [None]:
# Load ResNet-101 with the weights that torchvision marks as DEFAULT
resnet = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)

In [None]:
# Put the network into evaluation mode before predicting
resnet.eval()
# Gradients are not needed for a prediction, so do not track them
with torch.no_grad():
    out = resnet(batch)

In [None]:
# Index of the highest ResNet score and the softmax scores expressed as percentages
_, index = torch.max(out,1)
prob = torch.nn.functional.softmax(out, dim=1)[0] *100

In [None]:
# Label of the top ResNet class and its softmax score (already multiplied by 100)
classes[index.item()], prob[index.item()].item()

### More pictures with a neural network

In [None]:
# 60000 images, 28x28

# Loading the Fashion-MNIST dataset
from torchvision import datasets, transforms
# transform and normalize the data
transform = transforms.Compose([transforms.ToTensor(),
  transforms.Normalize((0.5,), (0.5,))
])

# Download and load the training data
trainset = datasets.FashionMNIST('MNIST_data/', download = True, train = True, transform = transform)
# Same dataset with train = False, i.e. the test split
testset = datasets.FashionMNIST('MNIST_data/', download = True, train = False, transform = transform)
# Both loaders serve shuffled batches of 64 samples
trainloader = torch.utils.data.DataLoader(trainset, batch_size = 64, shuffle = True)
testloader = torch.utils.data.DataLoader(testset, batch_size = 64, shuffle = True)


In [None]:
# just to see some pictures: take a single batch from the training loader
dataiter = iter(trainloader)
images, labels = next(dataiter)

In [None]:
# Draw 25 randomly chosen pictures from the batch on a 5x5 grid
indexes = np.random.randint(0, images.shape[0], size=25)
images_rand = images[indexes]
plt.figure(figsize=(5, 5))
for i, image in enumerate(images_rand):
    plt.subplot(5, 5, i + 1)
    # index 0 along the first axis - presumably the single grayscale channel
    plt.imshow(image[0])
    plt.axis('off')

plt.show()
plt.close('all')

In [None]:
# Network, loss, optimizer and training settings for the classifier
from torch import nn, optim
import torch.nn.functional as F

# 784 inputs -> 128 hidden units (ReLU) -> 10 log-probabilities
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    nn.LogSoftmax(dim=1),
)

# Negative log-likelihood loss, fed with the LogSoftmax output of the model
criterion = nn.NLLLoss()

# Adam over all model parameters
optimizer = optim.Adam(model.parameters(), lr=0.002)

# Number of training epochs
epochs = 30

# Per-epoch loss history
train_losses = []
test_losses = []

In [None]:
# Train for `epochs` epochs. After every epoch the model is evaluated on the
# test loader and the average training / test loss is appended to the history
# lists as plain Python floats (so both histories hold the same type).
for e in range(epochs):
    running_loss = 0.0
    for images, labels in trainloader:
        # Flatten Fashion-MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)

        # Training pass
        optimizer.zero_grad()

        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    test_loss = 0.0
    accuracy = 0.0

    # Set the model to evaluation mode for the validation pass
    model.eval()

    # Turn off gradients for validation, saves memory and computation
    with torch.no_grad():
        for images, labels in testloader:
            images = images.view(images.shape[0], -1)
            log_ps = model(images)
            # .item() converts the 0-dim loss tensor into a float
            test_loss += criterion(log_ps, labels).item()

            # The model emits log-probabilities; exp() turns them into probabilities
            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            # Fraction of correct predictions in this batch, as a float
            accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

    # Back to training mode for the next epoch
    model.train()

    epoch_train_loss = running_loss / len(trainloader)
    epoch_test_loss = test_loss / len(testloader)
    train_losses.append(epoch_train_loss)
    test_losses.append(epoch_test_loss)

    print("Epoch: {}/{}..".format(e+1, epochs),
          "Training loss: {:.3f}..".format(epoch_train_loss),
          "Test loss: {:.3f}..".format(epoch_test_loss),
          "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))

![](wykres0.png)

![](wykres1.png)

In [None]:
# Plot the recorded per-epoch training and test ("validation") losses together
plt.plot(train_losses, label = "Training loss")
plt.plot(test_losses, label = "Validation loss")
plt.legend(frameon = False)

What other networks and layers can we use for analyzing unstructured data?

> Find the answer to this question in the Keras library documentation

In [None]:
# Show the layer structure and the names of the entries in the state dict
print("My model: \n\n", model, "\n")
print("The state dict keys: \n\n", model.state_dict().keys())

In [None]:
# Save only the state dict (not the whole model object) to checkpoint.pth
torch.save(model.state_dict(), 'checkpoint.pth')

## Text data and BoW model

In [None]:
import pandas as pd

# Load the training set and discard its "index" column
df_train = pd.read_csv("train.csv").drop(columns="index")
print(df_train.head())
# Number of rows per label value
print(np.bincount(df_train["label"]))

In [None]:
# BoW model: lowercase the text, drop English stop words, keep at most 10,000 features
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(lowercase=True, max_features=10_000, stop_words="english")

# Fit the vectorizer on the training texts
cv.fit(df_train["text"])

In [None]:
# Inspect the vocabulary stored on the fitted vectorizer
cv.vocabulary_

In [None]:
# Encode the training texts with the fitted vectorizer
X_train = cv.transform(df_train["text"])

In [None]:
# to dense matrix: take the first row of X_train as a 1-D array
feat_vec = np.array(X_train[0].todense())[0]
print(feat_vec.shape)
# Histogram of the values in that vector (position k = how many entries equal k)
np.bincount(feat_vec)

## OOP for modeling in state space

In [None]:
import pandas as pd
import numpy as np

# data example: load the CSV and show its first rows
df = pd.read_csv("students.csv")
df.head()

In [None]:
# Number of rows and the list of column names
len(df), list(df.columns)

In [None]:
# Split the frame into the feature columns (X) and the 'target' column (y)
X = df.drop(columns=['target'])
y = df['target']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# FIRST prepare pipeline: name the columns handled by each preprocessing branch

numeric_features = [
    'math score',
    'reading score',
    'writing score',
]
categorical_features = [
    'sex',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

In [None]:
# Numeric branch: impute with the mean (default strategy here), then standardize
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encoding; handle_unknown="ignore" is set for unseen categories
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
# Route the numeric and the categorical columns to their respective transformers
preprocessor = ColumnTransformer(transformers=[
    ("num_trans", numeric_transformer, numeric_features),
    ("cat_trans", categorical_transformer, categorical_features)
])

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Full pipeline: preprocessing step "preproc" followed by the estimator step "model"
pipeline = Pipeline(steps=[
    ("preproc", preprocessor),
    ("model", LogisticRegression())
])

In [None]:
from sklearn import set_config
# Ask scikit-learn to display estimators as a diagram, then show the pipeline
set_config(display='diagram')
pipeline

> Just remember - a pipeline is an ordinary Python object, so you can save it to disk like any other picklable object.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

# Fit preprocessing and model together on the training part
pipeline.fit(X_tr, y_tr)

# Score of the fitted pipeline on the held-out part
score = pipeline.score(X_test, y_test)
print(score)

In [None]:
import joblib
# Persist the fitted pipeline object to your_pipeline.pkl
joblib.dump(pipeline, 'your_pipeline.pkl')

Now the magic starts

In [None]:
# Search space: each dict replaces the final "model" step with a different
# estimator and lists that estimator's hyper-parameters; both variants also
# try two imputation strategies in the numeric branch of the preprocessor.
param_grid = [
    {
        "preproc__num_trans__imputer__strategy": ["mean", "median"],
        "model__n_estimators": [2, 5, 10, 100, 500],
        "model__min_samples_leaf": [1, 0.1],
        "model": [RandomForestClassifier()],
    },
    {
        "preproc__num_trans__imputer__strategy": ["mean", "median"],
        "model__C": [0.1, 1.0, 10.0, 100.0, 1000],
        "model": [LogisticRegression()],
    },
]

from sklearn.model_selection import GridSearchCV

# 2-fold cross-validated search over the grid, using all available cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=2,
    verbose=1,
    n_jobs=-1,
)

grid_search.fit(X_tr, y_tr)

grid_search.best_params_

In [None]:
# Score of the fitted search on the test split, then on the training split
grid_search.score(X_test, y_test), grid_search.score(X_tr, y_tr)

In [None]:
# Add a constant column: every row gets the value 1
df['bad_feature'] = 1

In [None]:
# Rebuild X / y and the train-test split so that they contain the new column
X = df.drop(columns=['target'])
y = df['target']
X_tr, X_test, y_tr, y_test = train_test_split(X,y,
test_size=0.2, random_state=42)

In [None]:
# NOTE(review): this only rebinds the name to a new list. The `preprocessor`
# built earlier received the previous list object and is not rebuilt here, so
# 'bad_feature' does not become part of its numeric branch - confirm intent.
numeric_features = ['math score','reading score','writing score', 'bad_feature']

In [None]:
# Repeat the same grid search on the split that now contains `bad_feature`
grid_search = GridSearchCV(pipeline, param_grid,
cv=2, verbose=1, n_jobs=-1)

grid_search.fit(X_tr, y_tr)

grid_search.best_params_

In [None]:
# Score of the fitted search on the training split, then on the test split
grid_search.score(X_tr, y_tr), grid_search.score(X_test, y_test)

### Write your transformer

In [None]:
# your own transformator class

from sklearn.base import BaseEstimator, TransformerMixin

class DelOneValueFeature(BaseEstimator, TransformerMixin):
    """Drop the columns that hold a single distinct value in the fitted data.

    ``fit`` stores the names of those columns in ``one_value_features`` and
    ``transform`` returns its input without them. The input is expected to
    offer ``.columns`` and ``.drop`` (e.g. a pandas DataFrame).
    """

    def __init__(self):
        # Filled by ``fit``; while it is empty ``transform`` returns X as is.
        self.one_value_features = []

    def fit(self, X, y=None):
        """Record the columns of ``X`` that contain exactly one unique value.

        The list is rebuilt on every call, so fitting the same instance again
        neither duplicates names nor keeps columns found by an earlier fit.

        Parameters:
            X: table whose columns are inspected.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            The transformer itself.
        """
        self.one_value_features = [
            feature
            for feature in X.columns
            if len(X[feature].unique()) == 1
        ]
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the columns recorded by ``fit``.

        Parameters:
            X: table to reduce.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            ``X`` itself when nothing was recorded, otherwise the result of
            dropping the recorded columns.
        """
        if not self.one_value_features:
            return X
        return X.drop(columns=self.one_value_features)

In [None]:
# New pipeline: the custom transformer runs first, then the preprocessor, then the classifier
pipeline2 = Pipeline([
    ("moja_transformacja",DelOneValueFeature()),
    ("preprocesser", preprocessor),
    ("classifier", LogisticRegression())])

# Fit on the training split and score on the test split
pipeline2.fit(X_tr, y_tr)
score2 = pipeline2.score(X_test, y_test)

That's all! :) 

In [None]:
# unstructured data

import tensorflow as tf

In [None]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the reported 'accuracy' metric exceeds 0.95."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request a stop when the goal is met.

        Parameters:
            epoch: index of the epoch that just ended (not used).
            logs: metrics dict for the epoch; may be None or lack 'accuracy',
                in which case nothing happens instead of raising a TypeError.
        """
        # No mutable default argument; a missing dict is treated as empty.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.95:
            print("\n You get 95% acc - finish")
            self.model.stop_training = True

In [None]:
# Instantiate the early-stop callback and pick the Fashion-MNIST dataset module
callbacks = myCallback()
mnist = tf.keras.datasets.fashion_mnist

In [None]:
# Load the training and test images with their labels
(tr_im, tr_lab),(te_im, te_lab) = mnist.load_data()
# Divide pixel values by 255 (presumably rescales 0..255 to 0..1 - confirm input range)
tr_im = tr_im/255
te_im = te_im/255

# Flatten the input -> 128 ReLU units -> 10 softmax outputs
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])


# Adam optimizer, sparse categorical cross-entropy loss, accuracy tracked as a metric
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])

In [None]:
# Train for up to 40 epochs; the callback can stop the run earlier
model.fit(tr_im, tr_lab, epochs=40, callbacks=[callbacks])