In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('loan.csv')

In [3]:
X = data.drop(columns=['Loan_Status','Loan_ID'])
y = data['Loan_Status']

In [4]:
X = X.fillna({'Gender':'NA','Married':'No','Dependents':'0','Self_Employed':'No',
             'LoanAmount':X['LoanAmount'].mean(),'Loan_Amount_Term':X['Loan_Amount_Term'].mean(),
             'Credit_History':0.0})

In [5]:
X = pd.get_dummies(X,columns=['Gender','Married','Dependents','Education','Self_Employed','Property_Area'])

In [6]:
X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_NA,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,146.412162,360.0,1.0,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,4583,1508.0,128.000000,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,1,0,1,0,0
2,3000,0.0,66.000000,360.0,1.0,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,1
3,2583,2358.0,120.000000,360.0,1.0,0,1,0,0,1,...,0,0,0,0,1,1,0,0,0,1
4,6000,0.0,141.000000,360.0,1.0,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900,0.0,71.000000,360.0,1.0,1,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
610,4106,0.0,40.000000,180.0,1.0,0,1,0,0,1,...,0,0,1,1,0,1,0,1,0,0
611,8072,240.0,253.000000,360.0,1.0,0,1,0,0,1,...,1,0,0,1,0,1,0,0,0,1
612,7583,0.0,187.000000,360.0,1.0,0,1,0,0,1,...,0,1,0,1,0,1,0,0,0,1


### GridSearch

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)

In [9]:
# Regressão logística
lr = LogisticRegression()
# param_grid é um dicionário no python que passa uma rede de params para o GridSearchCV
param_grid = {'solver':['saga','lbfgs'],
              'penalty':[None,'l1','l2'],
              'C':np.arange(0.1,1.1,0.1),
              'max_iter':[100,200,500,1000]}
# GridSearchCV: para conseguirmos fazer a busca dos dados mais completa, 
# passando parâmetros dinâmicos e retornando o melhor resultado e melhor combinação
# Params:
## Modelo: lr
## param_grid: lista dos atributos
## n_jobs:
## cv: cross validation
gs = GridSearchCV(lr,param_grid=param_grid,n_jobs=-1,cv=5)
gs.fit(X_train,y_train)
gs.score(X_test,y_test)

0.7398373983739838

In [10]:
gs.best_score_

0.7575963718820862

In [11]:
gs.best_params_

{'C': 0.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}

In [12]:
# Modelo que deve ir para a produção
lr = LogisticRegression(C=0.7, max_iter=200, penalty='l2', solver='lbfgs')
lr.fit(X, y)

LogisticRegression(C=0.7, max_iter=200)

In [13]:
# GridSearch com pipeline
ct = ColumnTransformer([('min_max',MinMaxScaler(),['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'])],remainder='passthrough')
pipeline = Pipeline([('transformacao',ct),
                     ('poly',PolynomialFeatures()),
                     ('model',LogisticRegression())])

param_grid = {'poly__degree':[2,3],
              'model__solver':['saga','lbfgs'],
              'model__penalty':[None,'l1','l2'],
              'model__C':np.arange(0.1,1.1,0.1),
              'model__max_iter':[100,200,500,1000]}

# O modelo do GridSearchCV é o pipeline
search = GridSearchCV(pipeline,param_grid=param_grid,n_jobs=-1,cv=5,verbose=1)
search.fit(X_train,y_train)
search.score(X_test,y_test)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 220 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 516 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 920 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1433 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2012 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 13.4min finished


0.7886178861788617

In [14]:
search.best_score_

0.7657596371882086

In [15]:
search.best_params_

{'model__C': 0.1,
 'model__max_iter': 100,
 'model__penalty': 'l1',
 'model__solver': 'saga',
 'poly__degree': 2}

### Machine Learning para textos

In [16]:
from sklearn.datasets import fetch_20newsgroups

In [17]:
categorias = ['alt.atheism', 'soc.religion.christian','comp.graphics','sci.med']

In [18]:
data_train = fetch_20newsgroups( subset='train',categories=categorias)

In [19]:
data_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [20]:
len(data_train.data)

2257

In [21]:
print(data_train.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [22]:
d = data_train.target[0]
print(d)
print(data_train.target_names[d])

1
comp.graphics


In [23]:
for d in data_train.target[:20]:
    print(data_train.target_names[d])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
soc.religion.christian
comp.graphics
alt.atheism
alt.atheism
comp.graphics
comp.graphics
sci.med
alt.atheism
soc.religion.christian
alt.atheism


In [24]:
import pandas as pd
import numpy as np

In [25]:
data = pd.DataFrame(data=data_train.data, columns=['texto'])

In [26]:
data.head()

Unnamed: 0,texto
0,From: sd345@city.ac.uk (Michael Collier)\nSubj...
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\...
2,From: djohnson@cs.ucsd.edu (Darin Johnson)\nSu...
3,From: s0612596@let.rug.nl (M.M. Zwart)\nSubjec...
4,From: stanly@grok11.columbiasc.ncr.com (stanly...


In [27]:
X = data_train.data
y = data_train.target

In [28]:
print(X[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [29]:
# Vetorizador[CountVectorizer]: pega as várias palavras e faz uma contagem
### No primeiro email. quantas vezes apareceu a palavra 'converting'
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Cria o vetorizador
## lowercase=True -> para não diferenciar maiúscula e minúscula
count_vect = CountVectorizer(lowercase=True)
# Aplica o fit transform com o count_vect
# Pega as várias palavras que temos no email e transforma em colunas com números
X_train = count_vect.fit_transform(X)

##### testes

In [31]:
X_train[0]

<1x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [32]:
X_train[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
# u -> Converte no formato unicode
count_vect.vocabulary_.get(u'converting')

9805

In [34]:
# A palavra 'converting' apareceu duas vezes
X_train[0].todense()[0,9805]

2

In [35]:
codigo = count_vect.vocabulary_.get(u'computer')
X_train[0].todense()[0,codigo]

1

##### fim dos testes

In [36]:
# Naive Bayes: Modelo de regressão muito utiliada para textos
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train,y)

MultinomialNB()

In [37]:
testes = ['God is love', 'OpenGL is fast', 'This cell is a cancer']
test = count_vect.transform(testes)

In [38]:
preds = nb.predict(test)

In [39]:
for doc, categoria in zip(testes,preds):
    print(f'Texto: {doc} \nClasse: {data_train.target_names[categoria]}\n')

Texto: God is love 
Classe: soc.religion.christian

Texto: OpenGL is fast 
Classe: comp.graphics

Texto: This cell is a cancer 
Classe: sci.med



##### Avaliando o modelo

In [40]:
# Separando em treino e teste com o fetch_20newsgroups
data_teste = fetch_20newsgroups(subset='test',categories=categorias)
X_test = data_teste.data
y_test = data_teste.target

In [41]:
X_test = count_vect.transform(X_test)

In [42]:
nb.score(X_test,y_test)

0.9340878828229028

##### Adicionando um pipeline

In [43]:
from sklearn.datasets import fetch_20newsgroups
categorias = ['alt.atheism', 'soc.religion.christian','comp.graphics','sci.med']
data_treino = fetch_20newsgroups(subset='train',categories=categorias)
data_teste = fetch_20newsgroups(subset='test',categories=categorias)

In [44]:
X_train = data_treino.data
X_test = data_teste.data
y_train = data_treino.target
y_test = data_teste.target

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [46]:
pipeline = Pipeline([('count_vect',CountVectorizer(lowercase=True,stop_words='english')),('model',MultinomialNB())])

In [47]:
pipeline.fit(X_train,y_train)
pipeline.score(X_test,y_test)

0.9420772303595206

In [48]:
pipeline = Pipeline([('count_vect',TfidfVectorizer(lowercase=True,stop_words='english')),('model',MultinomialNB())])

In [49]:
pipeline.fit(X_train,y_train)
pipeline.score(X_test,y_test)

0.8894806924101198

### Trabalho 1

Carregar o conjunto de dados do site de complaints e categorizar o tipo de produto baseado no texto do usuário

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('./complaints.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
data.shape

(2146121, 18)

In [5]:
X = data['Consumer complaint narrative']
y = data['Product']

##### alternativa de leitura

In [None]:
data = pd.read_csv('../complaints.csv',chunksize=10000)
X = []
y = []
for linha in data:
    X.extend(linha['Consumer complaint narrative'])
    y.extend(linha['Product'])

In [6]:
len(X)

2146121

In [None]:
textos = []
for i,j in zip(X,y):
    textos.append([i,j])

In [None]:
data = pd.DataFrame(textos,columns=['narrativa','produto'])

In [None]:
data