# PMR3508 - Aprendizado de Máquina e Reconhecimento de Padrões
## Base Costa Rican Household

### Import das bibliotecas

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt

### Leitura dos dados

In [None]:
# Load the training set. The regex separator swallows any whitespace around
# the commas, which is why the pure-Python parser engine is requested.
trainCsvOptions = {
    "sep": r'\s*,\s*',
    "engine": "python",
    "na_values": "",
}
dfTrain = pd.read_csv("../input/train.csv", **trainCsvOptions)

In [None]:
# Load the test set with the same parsing rules as the training set: the regex
# separator swallows whitespace around commas and needs the Python engine.
testCsvOptions = {
    "sep": r'\s*,\s*',
    "engine": "python",
    "na_values": "",
}
dfTest = pd.read_csv("../input/test.csv", **testCsvOptions)

### Tratamento dos dados

In [None]:
# Impute missing values column by column with that column's most frequent
# value, so every row is kept (the alternative would be dropping rows with NA).
# value_counts() ignores NaN, so a column with no observed values has an empty
# result; such a column is returned unchanged instead of raising IndexError.
dfTrain = dfTrain.apply(
    lambda col: col.fillna(col.value_counts().index[0]) if col.notna().any() else col
)

In [None]:
# Impute missing values column by column with that column's most frequent
# value, so every test row is kept (the alternative would be dropping rows with NA).
# value_counts() ignores NaN, so a column with no observed values has an empty
# result; such a column is returned unchanged instead of raising IndexError.
dfTest = dfTest.apply(
    lambda col: col.fillna(col.value_counts().index[0]) if col.notna().any() else col
)

### Análise dos dados

In [None]:
# Show the (rows, columns) dimensions of the training frame.
dfTrain.shape

In [None]:
# Show summary statistics (count, mean, std, quartiles, ...) of the training frame.
dfTrain.describe()

In [None]:
# Preview the first rows of the columns that are not numeric.
dfTrain.select_dtypes(exclude='number').head()

Como as features do tipo string não podem ser analisadas numericamente, elas precisam ser pré-processadas (codificadas como inteiros).

In [None]:
# Text-valued columns that are converted to integer codes.
strList = ["Id", "idhogar", "dependency", "edjefe", "edjefa"]

# Keep untouched copies of both frames so the un-encoded values stay available.
bkpDfTrain = dfTrain.copy()
bkpDfTest = dfTest.copy()

# Encode train and test stacked together, column by column, so a given string
# receives the same integer code in both sets.
labelEncoder = preprocessing.LabelEncoder()
stackedStrings = pd.concat([dfTrain[strList], dfTest[strList]])
dfAll = stackedStrings.apply(labelEncoder.fit_transform)

# The first rows of the stacked frame belong to train, the remainder to test.
nTrainRows = dfTrain.shape[0]
dfTrain[strList] = dfAll.iloc[:nTrainRows]
dfTest[strList] = dfAll.iloc[nTrainRows:]
dfTrain.describe()

In [None]:
# Every column except the label is standardized (zero mean, unit variance).
labels = dfTrain.columns
labels = labels.drop("Target")

# Capture the training statistics BEFORE overwriting dfTrain. Computing them
# again after the training frame has been standardized would yield mean ~0 and
# std ~1, leaving the test set effectively unscaled and on a different scale
# from the training data.
trainMean = dfTrain[labels].mean()
trainStd = dfTrain[labels].std()

# Apply the same training-derived transformation to both sets.
dfTrain[labels] = (dfTrain[labels] - trainMean) / trainStd
dfTest[labels] = (dfTest[labels] - trainMean) / trainStd

# Correlation of every column with the label, sorted ascending.
dfTrain.corr()["Target"].sort_values()

Com base nas correlações com `Target` calculadas acima, hogar_nin, r4t1, SQBhogar_nin, meaneduc, cielorazo e escolari são as features mais interessantes a serem utilizadas.

### Classificação

In [None]:
# Columns fed to the classifier.
features = [
    "hogar_nin",
    "r4t1",
    "SQBhogar_nin",
    "meaneduc",
    "cielorazo",
    "escolari",
]

In [None]:
# Model inputs: the selected feature columns of each set.
XTrain = dfTrain.loc[:, features]
XTest = dfTest.loc[:, features]

# Labels are read from the backup copy of the training frame.
YTrain = bkpDfTrain["Target"]

In [None]:
# Candidate neighbourhood sizes, defined once so the search loop and the plot
# cannot drift apart.
kValues = list(range(1, 200, 10))

# Mean 10-fold cross-validation score for each candidate k.
scores = []
for i in kValues:
    knn = KNeighborsClassifier(n_neighbors=i)
    scores.append(cross_val_score(knn, XTrain, YTrain, cv=10).mean())

plt.plot(kValues, scores)
plt.xlabel("n_neighbors")
plt.ylabel("Mean 10-fold CV score")

# Select the best-scoring k from the results rather than a hand-copied
# constant, so the model stays in sync with the data. max() keeps the first
# maximum it meets, so ties resolve to the smallest k.
bestK = max(zip(kValues, scores), key=lambda pair: pair[1])[0]
knn = KNeighborsClassifier(n_neighbors=bestK)

In [None]:
# Train the selected KNN classifier on the training features and labels.
knn.fit(XTrain, YTrain)

In [None]:
# Predict the Target class for every test row, then display the predictions.
YTest = knn.predict(XTest)
YTest

In [None]:
# Build the submission table: original (un-encoded) test Ids next to the
# predicted classes.
submissionColumns = ["Id", "Target"]
dfSave = pd.DataFrame({"Id": bkpDfTest["Id"], "Target": YTest})

# Write only the two submission columns, in this order, without the row index.
dfSave.to_csv("Output.csv", columns=submissionColumns, index=False)