In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

---

In [None]:
# Load the House Prices training table and preview its first rows.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

# Pré-processamento de dados
---

In [None]:
from sklearn.model_selection import train_test_split

# Features and label.
# The target is removed by name instead of by position (`iloc[:, :-1]`), so the
# label cannot leak into the features if the CSV column order ever changes.
X = df.drop(columns=['SalePrice'])
Y = df['SalePrice']

# Split into train and hold-out sets; random_state pins the shuffle so every
# run produces the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=3)

### Processamento de textos - categórico
---

In [None]:
# Reference frame for the preprocessing cells: this binds a second name to the
# training split (an alias of X_train, not a copy).
df_ref = X_train

In [None]:
from sklearn import preprocessing

# Categorical (object-dtype) columns of the reference frame.
df_text = df_ref.select_dtypes(include=['object'])
text_columns = df_text.columns

# One-hot encoder producing a dense array; categories not seen during fit are
# ignored at transform time. Newer scikit-learn releases renamed the `sparse`
# keyword to `sparse_output`, so try the new name first and fall back to the
# old one when the installed version does not know it.
try:
    ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.fit(df_text)

# Text -> numbers.
X_text = ohe.transform(df_text)

# Column labels for the encoded matrix. `get_feature_names` no longer exists in
# newer scikit-learn releases; prefer `get_feature_names_out` when available.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(text_columns)
else:
    ohe_columns = ohe.get_feature_names(text_columns)

# Encoded categorical features as a DataFrame (replaces the raw text frame).
df_text = pd.DataFrame(X_text, columns=ohe_columns)

In [None]:
# Preview the first rows of the encoded categorical frame.
df_text.head()

In [None]:
# Numeric part of the reference frame: every column whose dtype is not `object`.
df_num = df_ref.select_dtypes(exclude='object')
df_num.head()

In [None]:
# Place numeric and encoded categorical columns side by side. Both indexes are
# reset first so the rows are paired by position rather than by original label.
numeric_part = df_num.reset_index(drop=True)
encoded_part = df_text.reset_index(drop=True)
df_proc = pd.concat([numeric_part, encoded_part], axis=1)
df_proc.head()

### Processamento numérico
---

In [None]:
from sklearn.impute import KNNImputer

# Replace missing values with a KNN imputer using its default settings.
# (A plain zero fill, `df_proc.fillna(0)`, was the earlier alternative.)
knn = KNNImputer()
X = knn.fit_transform(df_proc)

# Back to a DataFrame carrying the processed column labels.
X_train_ = pd.DataFrame(data=X, columns=df_proc.columns)
X_train_.head()

In [None]:
# Sanity check: the shape after dropping rows with NaN is shown next to the
# shape of df_proc; equal row counts mean no row still contains a missing value.
X_train_.dropna().shape, df_proc.shape

In [None]:
# Standardise the imputed training features with a StandardScaler and rebuild
# the DataFrame with the processed column labels.
ss = preprocessing.StandardScaler()
ss.fit(X_train_)
X_train_ = pd.DataFrame(ss.transform(X_train_), columns=df_proc.columns)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with no component limit so the contribution of every dimension
# can be inspected afterwards.
pca = PCA().fit(X_train_)
pca

In [None]:
import plotly.graph_objects as go

# One point per principal component: x is the component index and y is
# 1 minus that component's explained-variance ratio.
features = range(pca.n_components_)
variance_trace = go.Scatter(
    x=list(features),
    y=1 - pca.explained_variance_ratio_,
)

fig = go.Figure()
fig.add_trace(variance_trace)

fig.show()

In [None]:
# Dimensionality reduction: refit the PCA keeping 255 components and project
# the training features onto them.
n_kept_components = 255
pca = PCA(n_components=n_kept_components)
X_train_red = pca.fit_transform(X_train_)

In [None]:
# Shape of the reduced training matrix (rows, kept components).
X_train_red.shape

# Modelo de Machine Learning
---

In [None]:
# Push the hold-out split through the transformers already fitted on the
# training data (encoder -> imputer -> scaler -> PCA); nothing is refitted here.
df_text_test = X_test.select_dtypes(include=['object'])
text_columns_test = df_text_test.columns

# Text -> numbers.
X_text_test = ohe.transform(df_text_test)

# Column labels for the encoded matrix. `get_feature_names` no longer exists in
# newer scikit-learn releases; prefer `get_feature_names_out` when available.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns_test = ohe.get_feature_names_out(text_columns_test)
else:
    ohe_columns_test = ohe.get_feature_names(text_columns_test)

# Encoded categorical features as a DataFrame.
df_text_test = pd.DataFrame(X_text_test, columns=ohe_columns_test)

# Numeric columns.
df_num_test = X_test.select_dtypes(exclude=['object'])

# Indexes are reset so both parts are paired by position.
df_proc_test = pd.concat([df_num_test.reset_index(drop=True), df_text_test.reset_index(drop=True)], axis=1)

X_test_ = knn.transform(df_proc_test)

X_test_ = ss.transform(X_test_)

X_test_red = pca.transform(X_test_)

In [None]:
# Shape of the reduced hold-out matrix (rows, kept components).
X_test_red.shape

### Support Vector Machines

In [None]:
from sklearn.svm import SVR

# Support vector regressor trained on the reduced training features.
svr_params = dict(C=100_000, epsilon=0.01, gamma='auto')
svr = SVR(**svr_params)
svr.fit(X_train_red, y_train)

In [None]:
# Mean absolute error of the SVR on the training split.
y_pred = svr.predict(X_train_red)
np.abs(y_pred - y_train).mean()

In [None]:
# Mean absolute error of the SVR on the hold-out split.
y_pred = svr.predict(X_test_red)
(y_test - y_pred).abs().mean()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the reduced training features. random_state seeds the
# forest's internal randomness so the errors reported in the following cells
# are reproducible across runs (same seed value as the train/test split).
rf = RandomForestRegressor(n_estimators=300, max_depth=15, random_state=3)
rf.fit(X_train_red, y_train)

In [None]:
# Mean absolute error of the random forest on the training split.
y_pred = rf.predict(X_train_red)
train_residuals = y_pred - y_train
train_residuals.abs().mean()

In [None]:
# Mean absolute error of the random forest on the hold-out split.
y_pred = rf.predict(X_test_red)
np.abs(y_pred - y_test).mean()

### Linear Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1.0) trained on the reduced training features.
rid = Ridge(alpha=1.0).fit(X_train_red, y_train)
rid

In [None]:
# Mean absolute error of the ridge model on the training split.
y_pred = rid.predict(X_train_red)
(y_train - y_pred).abs().mean()

In [None]:
# Mean absolute error of the ridge model on the hold-out split.
y_pred = rid.predict(X_test_red)
test_residuals = y_pred - y_test
test_residuals.abs().mean()

### XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted regressor on the reduced training features.
# `scale_pos_weight` was dropped: it balances positive vs. negative classes in
# classification and has no meaning for a continuous price target.
xgbreg = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
)
xgbreg.fit(X_train_red, y_train)

In [None]:
# Mean absolute error of the XGBoost regressor on the training split.
y_pred = xgbreg.predict(X_train_red)
np.abs(y_pred - y_train).mean()

In [None]:
# Mean absolute error of the XGBoost regressor on the hold-out split.
y_pred = xgbreg.predict(X_test_red)
(y_test - y_pred).abs().mean()

# Low Code - Solutions
---

In [None]:
# Reload the raw House Prices training table for the low-code experiment and
# preview the first five rows.
df = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/train.csv"
)
df.head(5)

In [None]:
# (An incomplete `import` statement that named no module stood here; it was a
# SyntaxError and has been removed.)

In [None]:
!pip install pycaret==1.0

In [None]:
# Import only the PyCaret functions this notebook calls; a star import would
# flood the namespace and hide where each name comes from.
from pycaret.regression import setup, tune_model, interpret_model, predict_model

# Initialise the regression experiment on the raw table with SalePrice as the
# target; session_id fixes the experiment seed.
exp_reg = setup(data=df, target='SalePrice', session_id=2)

In [None]:
# XGBoost: tune the model through PyCaret's tune_model.
# NOTE(review): this rebinds `xgb`, which an earlier cell uses as the alias of
# the xgboost module; re-running that earlier cell after this one would fail
# unless xgboost is imported again.
xgb = tune_model('xgboost')

In [None]:
# Display whatever `xgb` is currently bound to (the tuned model when the cell
# above has just been run).
xgb

In [None]:
# CatBoost: tune the model through PyCaret's tune_model and keep it in `cat`.
cat = tune_model('catboost')

In [None]:
# LightGBM: tune the model through PyCaret's tune_model and keep it in `gbm`.
gbm = tune_model('lightgbm')

In [None]:
# Interpretation output for the tuned LightGBM model.
# NOTE(review): presumably a SHAP-based summary plot — confirm against the
# pinned PyCaret version.
interpret_model(gbm)

In [None]:
# Score the tuned LightGBM model. The original call ended in a dangling
# `data=` (a SyntaxError); with no `data` argument PyCaret predicts on the
# hold-out set it created during setup().
y_pred = predict_model(gbm)

### Processamento de textos - Semântico
---

In [None]:
# Load the "Contradictory, My Dear Watson" training table and preview it.
watson_csv_path = '../input/contradictory-my-dear-watson/train.csv'
df_wat = pd.read_csv(watson_csv_path)
df_wat.head()

In [None]:
# Keep only the rows whose `language` column equals 'English'.
df_wat_En = df_wat[df_wat['language'] == 'English']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One TF-IDF vectorizer per text column, each fitted on its own column so the
# premise and hypothesis vocabularies stay separate.
vec_prem = TfidfVectorizer().fit(df_wat_En['premise'])
vec_hyp = TfidfVectorizer().fit(df_wat_En['hypothesis'])
vec_hyp

In [None]:
# Vocabulary terms used as column labels. `get_feature_names` no longer exists
# in newer scikit-learn releases; prefer `get_feature_names_out` when available.
if hasattr(vec_prem, 'get_feature_names_out'):
    prem_terms = vec_prem.get_feature_names_out()
    hyp_terms = vec_hyp.get_feature_names_out()
else:
    prem_terms = vec_prem.get_feature_names()
    hyp_terms = vec_hyp.get_feature_names()

# Dense TF-IDF matrices, one column per term. `.toarray()` yields a plain
# ndarray, whereas `.todense()` yields the discouraged `np.matrix` type.
X_prem = vec_prem.transform(df_wat_En['premise']).toarray()
df_prem = pd.DataFrame(X_prem, columns=prem_terms)

X_hyp = vec_hyp.transform(df_wat_En['hypothesis']).toarray()
df_hyp = pd.DataFrame(X_hyp, columns=hyp_terms)

In [None]:
# Prefix every premise column label with "premise_".
df_prem.columns = df_prem.columns.map("premise_{}".format)

In [None]:
# Prefix every hypothesis column label with "hypothesis_".
df_hyp.columns = df_hyp.columns.map("hypothesis_{}".format)

In [None]:
# Premise and hypothesis TF-IDF columns side by side in one frame.
df_text = pd.concat([df_prem, df_hyp], axis="columns")

In [None]:
from sklearn.decomposition import SparsePCA, PCA

# Reduce the TF-IDF features to 500 components.
# A dedicated name is used instead of `pca`: `pca` still holds the PCA fitted on
# the house-price features, which the hold-out transform cell calls; rebinding
# it here would break re-running that cell.
# random_state makes the projection reproducible in case PCA selects a
# randomized SVD solver for an input of this size.
pca_text = PCA(n_components=500, random_state=3)
res = pca_text.fit_transform(df_text)

In [None]:
# Shape after the PCA projection next to the shape of the TF-IDF frame.
res.shape, df_text.shape

## TF-IDF - Shakespeare

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the whole Shakespeare corpus and flatten it onto a single line by
# turning every newline into a space.
with open('../input/shakespeare/shakespeare.txt', 'r') as handle:
    raw_text = handle.read()
shaks_data = ' '.join(raw_text.split('\n'))

In [None]:
# Preview only the beginning of the corpus; echoing the full text would flood
# the cell output and bloat the saved notebook.
shaks_data[:1000]

In [None]:
# Fit a TF-IDF vectorizer on the corpus treated as one single document.
vec = TfidfVectorizer().fit([shaks_data])
vec

In [None]:
# Vectorise the same document twice and display the result as a dense matrix
# (one row per input document).
vec.transform([shaks_data, shaks_data]).todense()

In [None]:
# Dense TF-IDF row for the single Shakespeare document.
proc = vec.transform([shaks_data]).todense()

In [None]:
# Vocabulary terms in column order. `get_feature_names` no longer exists in
# newer scikit-learn releases; prefer `get_feature_names_out` when available.
if hasattr(vec, 'get_feature_names_out'):
    shak_terms = vec.get_feature_names_out()
else:
    shak_terms = vec.get_feature_names()

# One row per term: its TF-IDF weight in the document ("prop") and the term.
df_shak = pd.DataFrame({"prop": proc.tolist()[0], "words": shak_terms})

In [None]:
import plotly.graph_objects as go

# Rank/weight plot on log-log axes: terms sorted by decreasing TF-IDF weight.
# Ranks start at 1 because a log axis cannot place x = 0, which would silently
# drop the highest-weighted term from the figure.
ranks = list(range(1, df_shak.shape[0] + 1))
sorted_weights = df_shak.sort_values('prop', ascending=False)['prop']

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x = ranks,
        y = sorted_weights,
        mode = "markers"
    )
)

fig.update_yaxes(type="log", title_text="TF-IDF weight")
fig.update_xaxes(type="log", title_text="Term rank")
fig.show()

In [None]:
import cv2

# Read the image from the dataset directory. OpenCV returns the pixels in BGR
# channel order (not RGB).
image_path = '../input/plant-pathology-2021-fgvc8/test_images/85f8cb619c66b863.jpg'
img = cv2.imread(image_path)

# cv2.imread does not raise when the file is missing or unreadable; it returns
# None. Fail here with a clear message instead of later inside the plot call.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")


In [None]:
# Matplotlib was used here without ever being imported.
import matplotlib.pyplot as plt

# Output image. The array read with cv2.imread is in BGR channel order while
# imshow interprets the last axis as RGB, so reverse the channel axis first;
# otherwise red and blue appear swapped.
plt.imshow(img[:, :, ::-1])

# Modelos de Machine Learning
---

In [None]:
import xgboost as xgb

model = xgb.XGBRegressor(n_estimators=300, max_depth=2)
# Train on the preprocessed (encoded, imputed, scaled, PCA-reduced) training
# features and the `y_train` labels produced by the train/test split. The name
# `Y_train` is never defined in this notebook, and the raw `X_train` frame still
# contains object-dtype text columns.
model.fit(X_train_red, y_train)