# Usando o MatMiner e Python para treinar SVM de dados
## Este notebook é uma continuação do anterior; agora vamos visualizar e, ao mesmo tempo, trabalhar com os dados obtidos.



In [None]:
from platform import python_version
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from matminer.datasets import load_dataset
from figrecipes import PlotlyFig #pacote para gráficos

#pacotes para ML
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict

#pacotes para SVM
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#computação cientifica e tabelas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Report the version of the Python interpreter running this notebook.
print(python_version())

In [None]:
# Import the data set from the Citrine repository.
#
# The API key is taken from the CITRINE_KEY environment variable when it is
# set, so the credential does not have to live in the notebook. The key that
# was embedded here is kept only as a fallback so existing runs keep working.
# NOTE(review): a key committed to source should be treated as exposed --
# rotate it and drop the fallback once CITRINE_KEY is configured everywhere.
import os

citrine_key = os.environ.get("CITRINE_KEY") or "xT5QPhsWvXFYHn7P7XeqJQtt"
ct = CitrineDataRetrieval(citrine_key)
dc = ct.get_dataframe(criteria={'data_set_id': 154544, 'max_results': 550})

# Data cleaning: discard unit, reference and bookkeeping columns.
dc = dc.drop(columns=[
    'Elongation-units',
    'Yield Strength-units',
    'Hardness-units',
    'Size-units',
    'Tensile Strength-units',
    'references',
    'Material Type-references',
    'Elongation  (2 in)-units',
    'Type-references',
    'Yield Strength-conditions',
    'ids',
    'contacts',
    'Reduction in Area-units',
    'Impact Strength-units',
    'preparation',
    'Size',
    'Elongation  (8 in)-references',
    'Hardness-references',
    'Impact Strength-references',
])
dc.head()


In [None]:
# Start of the visualisation.
#
# The figures are bound to `pf` instead of `plt`: `plt` is the alias given to
# matplotlib.pyplot in the import section, and rebinding it to a PlotlyFig
# makes the matplotlib API unreachable through that alias afterwards.

# Histogram of the tensile strength column.
pf = PlotlyFig(dc, x_title='Tensile Strength (Pa)',
               y_title='Total', mode='notebook')
pf.histogram(dc['Tensile Strength'])

# Scatter plot of elongation against tensile strength.
pf = PlotlyFig(dc, x_title='Tensile Strength (Pa)',
               y_title='Elongation (lb/pol)', mode='notebook')
pf.xy(('Tensile Strength', 'Elongation'))

# Replace every missing value in the frame with the sentinel -1.
dc = dc.fillna(-1)
dc.head()

In [None]:
# Convert the two columns used below to a numeric dtype.
for column in ('Tensile Strength', 'Elongation'):
    dc[column] = pd.to_numeric(dc[column])

# Filter the data: keep rows whose tensile strength is above 1000 and whose
# elongation is strictly positive.
keep = (dc['Tensile Strength'] > 1000) & (dc['Elongation'] > 0)
dc = dc[keep]
# NOTE(review): in a notebook only the last expression of a cell is rendered,
# so this head() call presumably produces no visible output -- confirm.
dc.head()
dc.describe()


In [None]:
# Scatter plot of elongation against tensile strength for the current `dc`.
# Bound to `pf` rather than `plt` so the matplotlib.pyplot alias imported at
# the top of the file is not overwritten by a PlotlyFig instance.
pf = PlotlyFig(dc, x_title='Tensile Strength (Pa)',
               y_title='Elongation (lb/pol)', mode='notebook')
pf.xy(('Tensile Strength', 'Elongation'))

In [None]:
# Prepare the data for machine learning.
target_column = 'Elongation'
y = dc[target_column].values          # values the models are trained on
x = dc.drop(columns=[target_column])  # every remaining column is a descriptor
print(y)
n_descriptors = x.shape[1]
print("Há {} Descritores possíveis:\n\n{}".format(n_descriptors, x.columns.values))

In [None]:
# Preview the first rows of the descriptor table.
x.head()

In [None]:
# Remove descriptor columns in two passes, one pass per group of names.
# `limpar` ends up bound to the last group that was dropped.
for limpar in (
    ["names", "composition"],
    ["Elongation in 2 in-units", "Material Type", "Form"],
):
    x = x.drop(columns=limpar)
x.head()

In [None]:
# Remove the remaining elongation-related columns and the "Type" column,
# again one group at a time. `limpar` ends up bound to the last group.
for limpar in (
    ["Elongation in 2 in", "Elongation  (8 in)"],
    ["Elongation  (8 in)-units", "Type", "Elongation  (2 in)"],
):
    x = x.drop(columns=limpar)
x.head()

In [None]:
# Build a simple baseline model: ordinary linear regression on all descriptors.
x = x.fillna(-1)  # missing descriptor values become the sentinel -1
lr = LinearRegression()
lr.fit(x, y)

# Scores computed on the same data the model was fitted on.
training_r2 = round(lr.score(x, y), 3)
training_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(x)))
print('training R2 = ' + str(training_r2))
print('training RMSE = %.3f' % training_rmse)

In [None]:
# 10-fold cross validation (90% training, 10% test) of the linear model.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

# The scorer returns the negated MSE, so the magnitude is taken before the
# square root to obtain one RMSE per fold.
scores = cross_val_score(lr, x, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(lr, x, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
# R2 is averaged with its sign: a negative fold score means the model did
# worse than predicting the mean, and taking abs() would report it as a good
# fit and inflate the average.
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

In [None]:
# Show the observed targets followed by the linear model's predictions for
# the same rows.
y_pred = lr.predict(x)
print(y, y_pred, sep="\n")

In [None]:
# Visualisation: observed elongation against cross-validated predictions.
pf = PlotlyFig(
    title='Regressão linear',
    x_title='Elongation (lb/pol)',
    y_title='Previsão (lb/pol)',
    filename="lr_regression.html",
    mode='notebook',
)

# Out-of-fold predictions from the linear model.
cv_predictions = cross_val_predict(lr, x, y, cv=crossvalidation)
# Dashed reference line from (5, 5) to (40, 40), i.e. prediction == observation.
identity_line = ([5, 40], [5, 40])

pf.xy(
    xy_pairs=[(y, cv_predictions), identity_line],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False,
)

In [None]:
# Preview the descriptor table again before building a prediction input.
x.head()

In [None]:
# Prediction input: one hypothetical sample described by its properties.
prev = pd.DataFrame({'Tensile Strength': [50000],
                     'Yield Strength': [25000],
                     'Reduction in Area': [63],
                     'Hardness': [20],
                     'Impact Strength': [45],
                     'Elongation in 2 in': [-1], 'Elongation (8 in)': [-1], 'Elongation (8 in)-units': [-1],
                     'Type': [-1],
                     'Elongation (2 in)': [-1]})

# Align the sample with the descriptor table the model is fitted on. Several
# of the columns listed above are dropped from `x` earlier in the file, and
# some names here use a single space where the dropped columns use two, so
# the hand-written frame can differ from `x` in both column set and order.
# Reindexing keeps exactly the columns of `x`, in the same order; any column
# of `x` not supplied above receives -1, the same sentinel used by fillna().
prev = prev.reindex(columns=x.columns, fill_value=-1)

In [None]:
# Show the single-row prediction input.
prev.head()

In [None]:
# Predict the elongation of the hypothetical sample with the linear model.
elong = lr.predict(prev)

In [None]:
# Print the predicted value(s); predict() returns one entry per input row.
print(elong)

In [None]:
# Now build the model with Support Vector Machines (SVM).
# NOTE(review): the variable name and the original "SVM RBF" label point to an
# RBF kernel, but the estimator is configured with kernel="poly" and degree=1.
# Confirm which kernel was intended before relying on the name.
svr_rbf = SVR(
    kernel="poly",
    degree=1,
    gamma='scale',
    C=10,
    max_iter=1000,  # caps the number of solver iterations
)

In [None]:
# Dump the descriptor table and then the target vector.
print(x, y, sep="\n")

In [None]:
#normalizar os dados para SVM
#norm = [np.log(i) for i in y]
#print(norm) 


In [None]:
# Split into training and test sets: 10% of the rows are held out for
# testing, and the fixed seed makes the split reproducible.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, random_state=21, test_size=0.10)

In [None]:
# Show the training targets followed by the held-out test targets.
print(y_train, y_test, sep="\n")

In [None]:
# Fit the SVR on the training split and predict the held-out rows.
# fit() returns the estimator itself, so the two calls can be chained.
pred = svr_rbf.fit(x_train, y_train).predict(x_test)

In [None]:
# Show the predictions for the test rows followed by the observed targets.
print(pred, y_test, sep="\n")

In [None]:
# Score of the fitted SVR on the training split (for scikit-learn regressors
# score() is the coefficient of determination, R^2).
print(svr_rbf.score(x_train,y_train))

In [None]:
# Same score on the held-out test split.
print(svr_rbf.score(x_test,y_test))

In [None]:
# Visualisation: observed test targets against the SVR predictions for them.
# NOTE(review): the filename is the same one used for the linear-regression
# figure -- confirm whether a separate output file was intended.
pf = PlotlyFig(
    title='Support Vector Machines',
    x_title='Elongation (lb/pol)',
    y_title='Previsão (lb/pol)',
    filename="lr_regression.html",
    mode='notebook',
)

# Dashed reference line from (5, 5) to (40, 40), i.e. prediction == observation.
identity_line = ([5, 40], [5, 40])

pf.xy(
    xy_pairs=[(y_test, pred), identity_line],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False,
)



In [None]:
# 10-fold cross validation of the SVR model.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

# Refit on the full data set so later cells can call svr_rbf.predict(x).
svr_rbf.fit(x, y)

# The scorer returns the negated MSE, so the magnitude is taken before the
# square root to obtain one RMSE per fold.
scores = cross_val_score(svr_rbf, x, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(svr_rbf, x, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
# R2 is averaged with its sign: a negative fold score means the model did
# worse than predicting the mean, and taking abs() would report it as a good
# fit and inflate the average.
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

In [None]:
# Visualisation: observed elongation for every row against the SVR prediction
# for the same row.
# NOTE(review): the filename is the same one used for the linear-regression
# figure -- confirm whether a separate output file was intended.
pf = PlotlyFig(x_title='Elongation (lb/pol)',
               y_title='Previsão (lb/pol)',
               title='Support Vector Machines',
               mode='notebook',
               filename="lr_regression.html")

# The predictions are computed here for the full descriptor table. At this
# point `pred` still holds the predictions for the 10% test split only, so
# pairing it with the full-length `y` would plot arrays of different lengths
# that do not correspond row by row.
pf.xy(xy_pairs=[(y, svr_rbf.predict(x)), ([5, 40], [5, 40])],
      modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}],
      showlegends=False
      )

In [None]:
# Predict every row of the descriptor table with the SVR; this rebinds `pred`.
pred = svr_rbf.predict(x)

In [None]:
# Show the predictions followed by the observed targets.
print(pred)
print(y)

In [None]:
# Prediction input: one hypothetical sample described by its properties.
prev = pd.DataFrame({'Tensile Strength': [50000],
                     'Yield Strength': [25000],
                     'Reduction in Area': [63],
                     'Hardness': [20],
                     'Impact Strength': [45],
                     'Elongation in 2 in': [-1], 'Elongation (8 in)': [-1], 'Elongation (8 in)-units': [-1],
                     'Type': [-1],
                     'Elongation (2 in)': [-1]})

# Align the sample with the descriptor table the model is fitted on. Several
# of the columns listed above are dropped from `x` earlier in the file, and
# some names here use a single space where the dropped columns use two, so
# the hand-written frame can differ from `x` in both column set and order.
# Reindexing keeps exactly the columns of `x`, in the same order; any column
# of `x` not supplied above receives -1, the same sentinel used by fillna().
prev = prev.reindex(columns=x.columns, fill_value=-1)

# Predict the elongation of the sample with the SVR and print it.
elong1 = svr_rbf.predict(prev)
print(elong1)

In [None]:
# SVM with a polynomial kernel.
# `degree` is not passed, so the estimator's default polynomial degree applies.
svr_poly = SVR(
    kernel="poly",
    coef0=1,
    gamma=0.1,
    C=1,
    epsilon=0.1,
)

In [None]:
# Fit the polynomial SVR on the full data set and predict the same rows.
# fit() returns the estimator itself, so the two calls can be chained.
pred = svr_poly.fit(x, y).predict(x)
# Observed targets first, then the predictions.
print(y, pred, sep="\n")

In [None]:
# 10-fold cross validation of the polynomial-kernel SVR.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
svr_poly.fit(x, y)

# Evaluate `svr_poly`, the estimator this cell fits. Passing `svr_rbf` here
# would simply report the other model's cross-validation scores again.
# The scorer returns the negated MSE, so the magnitude is taken before the
# square root to obtain one RMSE per fold.
scores = cross_val_score(svr_poly, x, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(svr_poly, x, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
# R2 is averaged with its sign: a negative fold score means the model did
# worse than predicting the mean, and taking abs() would report it as a good
# fit and inflate the average.
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(rmse_scores)))

In [None]:
# Visualisation: observed elongation against the current contents of `pred`.
# NOTE(review): the filename is the same one used for the linear-regression
# figure -- confirm whether a separate output file was intended.
pf = PlotlyFig(
    title='Support Vector Machines',
    x_title='Elongation (lb/pol)',
    y_title='Previsão (lb/pol)',
    filename="lr_regression.html",
    mode='notebook',
)

# Dashed reference line from (5, 5) to (40, 40), i.e. prediction == observation.
identity_line = ([5, 40], [5, 40])

pf.xy(
    xy_pairs=[(y, pred), identity_line],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False,
)