In [7]:
import pandas as pd

In [8]:
pd.options.mode.chained_assignment = None

In [9]:
df = pd.read_csv('student_records.csv')
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


<h4> Preparando os dados

O datasaet que vimos antes, não possui nenhum erro, então, podemos passar para a próxima etapa

<h4> Extração de Features e Engenharia

In [10]:
colunasFeatures = ['OverallGrade', 'Obedient', 'ResearchScore','ProjectScore']
dfFeatures = df[colunasFeatures]
dfFeatures

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [11]:
colunaResultado = ['Recommend']
dfResultado = df[colunaResultado]
dfResultado

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [12]:
atributosNumericos = ['ResearchScore', 'ProjectScore']
atributosCategoricos = ['OverallGrade', 'Obedient']

<h2> Normalizar os valores numéricos

In [13]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

#Aplicando o SS nos atributos numéricos
ss.fit(dfFeatures[atributosNumericos])
#Aplicando no DF
dfFeatures[atributosNumericos] = ss.transform(dfFeatures[atributosNumericos])

dfFeatures

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


<h2> Lidando com os atributos categóricos

In [14]:
#Transformando a coluna OverralGrade e Obedient em "dummies"

dfFeatures = pd.get_dummies(dfFeatures, columns=atributosCategoricos)
dfFeatures

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [15]:
atributosCategoricosProjetados = list(set(dfFeatures.columns) - set(atributosNumericos)) 
atributosCategoricosProjetados

['OverallGrade_C',
 'OverallGrade_E',
 'OverallGrade_A',
 'Obedient_N',
 'OverallGrade_F',
 'OverallGrade_B',
 'Obedient_Y']

<h2> Modelando

In [16]:
#Fazendo um modelo simples usando regressão logística
from sklearn.linear_model import LogisticRegression
import numpy as np

#Fit no modelo
lr = LogisticRegression()
model = lr.fit(dfFeatures,np.array(dfResultado['Recommend']))
model

LogisticRegression()

<h2> Avaliação do modelo

In [17]:
#Predição simples do modelo
labelsPredict = model.predict(dfFeatures)
labelsResultado = np.array(dfResultado['Recommend'])

#Avaliar a performance do modelo
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print("Accuracy: ",float(accuracy_score(labelsResultado,labelsPredict))*100,'%')
print("Status da classificação: ")
print(classification_report(labelsResultado,labelsPredict))

Accuracy:  100.0 %
Status da classificação: 
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



<h2> Deploy do modelo

In [20]:
# Precisamos adicionar persistência no modelo antes de dar deploy, nós também precisamos salvar o objeto escalar que usamos para escalar
# as features numéricas

#from sklearn.externals import joblib
import joblib
import os

if not os.path.exists('Model'):
    os.mkdir("Model")
    
if not os.path.exists('Scaler'):
    os.mkdir("Scaler")
    
joblib.dump(model,r'Model/model.pickle')
joblib.dump(ss,r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

<h2> Carregando os modelos na memória

In [23]:
model = joblib.load(r'Model/model.pickle')
scaler = joblib.load(r'Scaler/scaler.pickle')

model

LogisticRegression()

<h2> Adicionando novos elementos no modelo para predição

In [24]:
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F','Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
{'Name': 'Thomas', 'OverallGrade': 'A','Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])
new_data = new_data[['Name', 'OverallGrade', 'Obedient','ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [25]:
#Preparação dos novos dados
predictFeatures = new_data[colunasFeatures]

#Scaling
predictFeatures[atributosNumericos] = scaler.transform(predictFeatures[atributosNumericos])

#Atributos categoricos
predictFeatures = pd.get_dummies(predictFeatures,columns=atributosCategoricos)

predictFeatures

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [27]:
#Adicionando as colunas faltantes
colunasCategoricasAtuais = set(predictFeatures.columns) - set(atributosNumericos)

featuresFaltantes = set(atributosCategoricosProjetados) - colunasCategoricasAtuais

for feature in featuresFaltantes:
    predictFeatures[feature] = [0] * len(predictFeatures)

predictFeatures

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_C,OverallGrade_E,OverallGrade_B
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


<h2> Fazendo a predição usando o model

In [29]:
predicts = model.predict(predictFeatures)

new_data['Recommend'] = predicts
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
