### Nome dos integrantes
### Aluno 1: Aline Cristini - 183132
### Aluno 2: Camila Rodrigues - 183143
### Aluno 3: Rafael Gimenes Leite - 101634
### Aluno 4: Vitor Damázio - 090773

# **Bibliotecas**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff

from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import math
%matplotlib inline
import seaborn as sns

init_notebook_mode(connected=True)

import warnings
from collections import Counter
warnings.filterwarnings('ignore')

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import StratifiedKFold,GridSearchCV
import missingno as mssno
seed =45
% matplotlib inline

# **Leitura do arquivo**

In [None]:
# Load the competition data sets.
# Source: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
train = pd.read_csv('../input/train.csv', header=0)
test = pd.read_csv('../input/test.csv', header=0)

In [None]:
# Preview the first rows of the training set.
train.head()

# **Removendo Registros Duplicados**

In [None]:
# Remove duplicated rows from the test set, printing the shape before and after.
# drop_duplicates() returns a new DataFrame instead of modifying `test`,
# so the result has to be assigned back for the rows to actually be removed.
print('Antes:', test.shape)
test = test.drop_duplicates()
print('Depois:', test.shape)

In [None]:
# Remove duplicated rows from the training set, printing the shape before and after.
# drop_duplicates() returns a new DataFrame instead of modifying `train`,
# so the result has to be assigned back for the rows to actually be removed.
print('Antes:', train.shape)
train = train.drop_duplicates()
print('Depois:', train.shape)

In [None]:
# Summary statistics of both data sets, to inspect how the attributes are distributed.
for frame in (train, test):
    display(frame.describe())

# **Criando Metadados**

In [None]:
# Build the metadata table of the training set: one row per column, holding
# its role, its measurement level, whether it is kept, and its dtype.


def _column_role(name):
    """Return how a column is used: 'target' (label), 'id', or 'input' (attribute)."""
    if name in ('target', 'id'):
        return name
    return 'input'


def _column_level(name, series):
    """Return the measurement level of a column.

    Name-based rules ('bin', 'cat', 'target', 'id') take precedence; the
    remaining columns are classified by dtype.
    """
    if 'bin' in name or name == 'target':
        return 'binary'
    if 'cat' in name or name == 'id':
        return 'nominal'
    # Use the pandas dtype predicates rather than `dtype == float` / `dtype == int`:
    # the builtin `int` maps to a platform-dependent NumPy width, so an int64
    # column may fail the equality check on some platforms.
    if pd.api.types.is_float_dtype(series):
        return 'interval'
    if pd.api.types.is_integer_dtype(series):
        return 'ordinal'
    # Explicit fallback: without it an unmatched column would either raise a
    # NameError or silently reuse the level of the previous column.
    return 'unknown'


data = []
for f in train.columns:
    data.append({
        'varname': f,
        'role': _column_role(f),
        'level': _column_level(f, train[f]),
        # Every column is kept except the row identifier.
        'keep': f != 'id',
        'dtype': train[f].dtype,
    })

meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [None]:
# Display the metadata table of the training set.
meta

In [None]:
# Number of attributes for each (role, level) combination.
role_level_sizes = meta.groupby(['role', 'level']).size()
role_level_sizes.reset_index(name='count')

# **Análise do TARGET**

In [None]:
# Number of rows per target class, as a frame with columns 'index' and 'count'.
# The index is named explicitly before it is reset: the charts read
# df_barras['index'], and newer pandas versions name the reset column after
# the counted Series ('target') instead of the generic 'index'.
df_barras = (
    train['target']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='count')
)
df_barras

In [None]:
# Bar chart showing how the target classes are distributed in the data set.
class_counts = df_barras['count']
bar_marker = dict(
    color='rgb(158,202,225)',
    line=dict(color='rgb(8,48,107)', width=1.5),
)

trace1 = go.Bar(
    x=df_barras['index'],
    y=class_counts,
    text=class_counts,  # the count is also used as the bar label
    name='Distibuição do Target',
    textposition='auto',
    marker=bar_marker,
    opacity=0.6,
)

data = [trace1]
plotly.offline.iplot(data, filename='grouped-bar-direct-labels')

In [None]:
# Class labels and their row counts as arrays (used as the pie chart's labels and values).
arr = df_barras['index'].values
soma = df_barras['count'].values

In [None]:
# Pie chart of the target classes; as observed, the target is heavily imbalanced.
pie_marker = dict(colors=['rgb(158,202,225)', 'rgb(8,48,107)'])

data = [go.Pie(labels=arr, values=soma, marker=pie_marker)]
plotly.offline.iplot(data, filename='basic_pie_chart')

# **Dados Faltantes**

In [None]:
# Report every attribute that has missing values (this code treats -1 as "missing").
atributos_missing = []
n_rows = train.shape[0]

for f in train.columns:
    # Count the matches on the column itself; filtering the whole DataFrame
    # for every column only to count one of its columns copies it needlessly.
    missings = (train[f] == -1).sum()
    if missings > 0:
        atributos_missing.append(f)
        missings_perc = missings / n_rows

        print('Atributo {} tem {} amostras ({:.2%}) com valores faltantes'.format(f, missings, missings_perc))

print('No total, há {} atributos com valores faltantes'.format(len(atributos_missing)))

In [None]:
# ps_car_03_cat and ps_car_05_cat have too many missing values, so both are dropped
# from train and test.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train, test = (frame.drop(vars_to_drop, axis='columns') for frame in (train, test))

# Flag them in the metadata as well, as a reference for later steps (processing test).
meta.loc[vars_to_drop, 'keep'] = False

In [None]:
# Fill in the missing values (encoded as -1) of the attributes that still have them.
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in
# 0.22; SimpleImputer is its replacement and always imputes column-wise, so the
# `axis` argument is gone. The fallback keeps the cell working on older releases.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

mean_cols = ['ps_reg_03', 'ps_car_12', 'ps_car_14']  # filled with the column mean
mode_cols = ['ps_car_11']                            # filled with the most frequent value

media_imp = SimpleImputer(missing_values=-1, strategy='mean')
moda_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')

# The fill values are learned from the training set only ...
train[mean_cols] = media_imp.fit_transform(train[mean_cols])
train[mode_cols] = moda_imp.fit_transform(train[mode_cols])

# ... and reused on the test set, so both sets are preprocessed identically
# instead of test being filled with statistics computed on test itself.
test[mean_cols] = media_imp.transform(test[mean_cols])
test[mode_cols] = moda_imp.transform(test[mode_cols])

# **Análise da Correlação**

In [None]:
# Split the training columns into an integer view and a float view,
# then tally how many columns there are of each dtype.
train_int = train.select_dtypes(include=['int64'])
train_float = train.select_dtypes(include=['float64'])
Counter(train.dtypes.tolist())

In [None]:
# Heatmap of the Pearson correlation between the float attributes.
colormap = plt.cm.jet
float_corr = train_float.corr()

plt.figure(figsize=(16, 12))
plt.title('Correlação de Pearson dados float', y=1.05, size=15)
sns.heatmap(
    float_corr,
    cmap='Blues',
    annot=True,  # few enough columns to print each coefficient in its cell
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

In [None]:
# Heatmap of the Pearson correlation between the integer attributes.
colormap = plt.cm.jet
int_corr = train_int.corr()

plt.figure(figsize=(21, 16))
plt.title('Correlação de Pearson dados inteiros', y=1.05, size=15)
sns.heatmap(
    int_corr,
    cmap='Blues',
    annot=False,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

In [None]:
# The previous heatmap shows no correlation among the ps_calc variables, so they
# (together with id and target) are left out of this integer-attribute heatmap.
# Only the plotted copy `cotrain` is reduced; `train` itself is not modified here.
colormap = plt.cm.jet
excluded_columns = [
    'id', 'target',
    'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08',
    'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13',
    'ps_calc_14',
    'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
    'ps_calc_19_bin', 'ps_calc_20_bin',
]
cotrain = train_int.drop(excluded_columns, axis=1)

plt.figure(figsize=(21, 16))
plt.title('Correlação de Pearson dados inteiros Excluindo ps_calc', y=1.05, size=12)
sns.heatmap(
    cotrain.corr(),
    cmap='Blues',
    annot=False,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

In [None]:
# Heatmap of the Pearson correlation between all columns of the training set
# (id and target included).
colormap = plt.cm.jet
full_corr = train.corr()

plt.figure(figsize=(25, 25))
plt.title('Pearson correlation of All the features', y=1.05, size=15)
sns.heatmap(
    full_corr,
    cmap='Blues',
    annot=False,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

# **One-hot encoding**

In [None]:
# Print how many distinct values each kept nominal attribute has.
v = meta.index[(meta['level'] == 'nominal') & meta['keep']]

for f in v:
    print('Atributo {} tem {} valores distintos'.format(f, train[f].nunique()))

In [None]:
# One-hot encode the kept nominal attributes in both data sets.
v = meta[(meta.level == 'nominal') & (meta.keep)].index
print('Antes do one-hot encoding tinha-se {} atributos'.format(train.shape[1]))
train = pd.get_dummies(train, columns=v, drop_first=True)
print('Depois do one-hot encoding tem-se {} atributos'.format(train.shape[1]))

test = pd.get_dummies(test, columns=v, drop_first=True)

# Give test exactly the columns of train, in the same order:
#   * columns that exist only in train are added to test filled with 0. This
#     includes 'target' when test does not have it -- that column is then a
#     placeholder of zeros, not real labels;
#   * dummy columns that exist only in test (categories never seen in train)
#     are dropped. An outer-join align would instead add them to train as
#     all-NaN columns, which the models fitted later cannot handle.
test = test.reindex(columns=train.columns, fill_value=0)

In [None]:
# Check that train and test ended up with the same layout (compare the printed shapes).
for frame in (train, test):
    print(frame.shape)

# **Aplicando Modelo de Regressão Logística - com parâmetro class_weight='balanced'**

In [None]:
# Logistic regression with class_weight='balanced' to compensate for the
# imbalanced target.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

X_test = test.drop(['id', 'target'], axis=1)
# NOTE(review): test['target'] is the zero filler added while aligning test to
# train after one-hot encoding (test.csv appears to be unlabeled -- confirm).
# It is kept only because later cells read `y_test`; it must not be used to
# judge the model, which is why the evaluation below uses a hold-out instead.
y_test = test['target']

# Evaluate on a stratified hold-out taken from the labelled training data.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X_train, y_train, test_size=0.3, stratify=y_train, random_state=seed)
holdout_model = LogisticRegression(class_weight='balanced')
holdout_model.fit(X_fit, y_fit)
holdout_auc = roc_auc_score(y_holdout, holdout_model.predict_proba(X_holdout)[:, 1])
print('Hold-out ROC AUC: {:.4f}'.format(holdout_auc))

# The model used for the final predictions is fitted on the whole training set.
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

# Hold-out accuracy (the same metric the cell reported before, now on real labels).
holdout_model.score(X_holdout, y_holdout)

# **Predict**

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for every test row.
y_pred = model.predict_proba(X_test)[:, 1]

# NOTE(review): y_test is read from test['target']; if test.csv is unlabeled, that
# column is the zero filler added during the one-hot alignment and this figure is
# not a real test accuracy -- confirm.
test_accuracy = model.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

# **Matriz de Confusão**

In [None]:
# Confusion matrix of the model's class predictions against y_test.
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it to `confusion_matrix` would
# replace the imported function with an array, so any later call to
# confusion_matrix(...) made before the function is imported again would fail.
conf_mat = confusion_matrix(y_test, model.predict(X_test))
print(conf_mat)

In [None]:
# The same confusion matrix built with pandas: labelled rows (actual) and
# columns (predicted) plus totals make the result easier to read.
predicted_labels = model.predict(X_test)
confusion_table = pd.crosstab(
    y_test,
    predicted_labels,
    rownames=['Real'],
    colnames=['Predito'],
    margins=True,
)
print(confusion_table)

# **Avaliando outros modelos**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Test options and evaluation metric
num_folds = 10
seed = 8
scoring = 'accuracy'

# Features and label taken from the (labelled) training set.
X = train.drop(['id', 'target'], axis=1)
Y = train.target

# Hold out 30% of the training rows for validation. The split is stratified on
# the label so that both parts keep the same class proportions, which matters
# because the target is heavily imbalanced.
validation_size = 0.3
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed, stratify=Y)

In [None]:
# Fit several baseline classifiers on the training split and compare their
# accuracy on the validation split.
models = [('LR', LogisticRegression()),
          ('LDA', LinearDiscriminantAnalysis()),
          ('CART', DecisionTreeClassifier()),
          ('NB', GaussianNB())]
results = []
names = []
# The loop variable is deliberately not called `model`, so the logistic
# regression fitted earlier is not overwritten by the last classifier here.
for name, clf in models:
    print("Training model %s" %(name))
    clf.fit(X_train, Y_train)
    # Score on the held-out validation rows (X_validation / Y_validation), which
    # carry real labels, rather than on X_test / y_test.
    result = clf.score(X_validation, Y_validation)
    results.append(result)
    names.append(name)
    print("Classifier score %s: %f" % (name, result))
print("----- Training Done -----")

In [None]:
# Display the predicted probabilities computed earlier with predict_proba.
y_pred

In [None]:
# Assemble the Kaggle submission: one row per test id with its predicted value.
previsao = pd.DataFrame({'id': test['id'], 'target': y_pred})

In [None]:
# Write the submission to previsao.csv, without the DataFrame index column.
previsao.to_csv('previsao.csv',index = False)