# TAREFA DATASET KAGGLE


Para esta fase, o objetivo é alcançar o melhor resultado possível com o dataset da competição fornecido pelos docentes na previsão do nível de incidentes rodoviários, numa determinada hora, na cidade de Guimarães.

### **1.** Importar as bibliotecas essenciais do Python para a elaboração desta tarefa

In [None]:
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.set_option('display.max_columns', None)

### **2.** Carregar o dataset para um dataframe da biblioteca Pandas

In [None]:
# Read the competition's training and test CSV files with the same parser options.
_READ_OPTIONS = {"encoding": "utf-8", "skipinitialspace": True}
df = pd.read_csv('docs/training_data.csv', **_READ_OPTIONS)
df_teste = pd.read_csv('docs/test_data.csv', **_READ_OPTIONS)

### **3.** Obtenção de informação acerca do dataset: tipos de dados das features, conteúdo do dataset e estatística

* **tipos de dados das features**

In [None]:
df.info()

* **conteúdo do dataset**

In [None]:
df.head()

* **estatística**

In [None]:
df.describe()

Distribuição da feature target "incidents"

In [None]:
df["incidents"].hist()

Análise dos valores únicos no dataset

In [None]:
# For each column, list its distinct values and how many of them there are.
for name, values in df.items():
    print(f"{name}: {values.unique()}")
    print(f"Quantidade: {values.nunique()}")
    print("---------------------------------------")

TODO: adicionar uma tabela-resumo da análise dos valores únicos apresentada acima

### **4.** Preparação dos dados

In [None]:
df.head()

In [None]:
df.columns

**Remoção da feature "avg_precipitation"**

In [None]:
# The column holds incomplete information, so it is removed from both sets.
df = df.drop(columns=['avg_precipitation'])
df_teste = df_teste.drop(columns=['avg_precipitation'])

**Transformação da coluna *record_date* nas colunas ano, mês, dia, hora e minuto**

In [None]:
def _expand_record_date(frame):
    """Replace ``record_date`` with numeric date-part columns.

    The column is parsed with the ``%Y-%m-%d %H:%M`` format and swapped for
    ``record_date_year``, ``record_date_month``, ``record_date_day``,
    ``record_date_hour`` and ``record_date_minute`` (appended in that order).

    Args:
        frame: DataFrame holding a ``record_date`` column.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        ValueError: if any value is missing or does not match the format.
    """
    parsed = pd.to_datetime(frame['record_date'], format='%Y-%m-%d %H:%M', errors='coerce')
    # errors='coerce' turns unparseable values into NaT; fail loudly instead of
    # letting them through (an assert would be stripped under ``python -O``).
    if parsed.isnull().any():
        raise ValueError('missing record date')

    frame = frame.drop('record_date', axis=1)
    for part in ('year', 'month', 'day', 'hour', 'minute'):
        frame[f'record_date_{part}'] = getattr(parsed.dt, part)
    return frame


# Train and test go through the same helper so they cannot drift apart.
df = _expand_record_date(df)
df_teste = _expand_record_date(df_teste)

# Last expression of the cell, so the preview is actually displayed.
df.head()

**Remoção das features "city_name", "record_date_year" e "record_date_minute"**

In [None]:
# 'city_name' only has one value, and the same applies to the year and the
# minute, so these columns are removed from both sets.
constant_columns = ['city_name', 'record_date_year', 'record_date_minute']
df = df.drop(columns=constant_columns)
df_teste = df_teste.drop(columns=constant_columns)

**Transformação da feature "magnitude_of_delay" em valores numéricos**

In [None]:
df['magnitude_of_delay'].unique()

In [None]:
# Swap the textual delay magnitudes for integer codes in both data sets.
delay_codes = {'UNDEFINED': 0, 'MODERATE': 1, 'MAJOR': 2}
replace_map = {'magnitude_of_delay': delay_codes}
for frame in (df, df_teste):
    frame.replace(replace_map, inplace=True)
df.head()

**Tratamento da feature "affected_roads"**

* Tratamento dos missing values

In [None]:
# Missing road lists become ',' so every cell can be split on commas later.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[...] is chained assignment, which pandas deprecates and which does not
# update the frame once Copy-on-Write is enabled.
df['affected_roads'] = df['affected_roads'].fillna(',')
df_teste['affected_roads'] = df_teste['affected_roads'].fillna(',')
df.info()

* Separação das ruas e remoção de ruas repetidas

In [None]:
def _split_roads(cell):
    """Split a comma-separated road string, dropping repeats but keeping order."""
    return list(dict.fromkeys(cell.split(',')))


def _encode_roads(frame, road_names):
    """Replace ``affected_roads`` with one 0/1 indicator column per road name.

    Args:
        frame: DataFrame whose ``affected_roads`` column holds comma-separated
            strings (no missing values).
        road_names: column names to create, in the order they should appear.

    Returns:
        A new DataFrame without ``affected_roads`` and with the indicator
        columns appended; roads not listed in ``road_names`` are ignored.
    """
    roads_per_row = [set(_split_roads(cell)) for cell in frame['affected_roads']]
    indicators = pd.DataFrame(
        {name: [int(name in roads) for roads in roads_per_row] for name in road_names},
        index=frame.index,
    )
    # A single concat instead of one column insert per road.
    return pd.concat([frame.drop('affected_roads', axis=1), indicators], axis=1)


# The vocabulary comes from the training set only, in order of first
# appearance, and is then applied to BOTH frames. Encoding each frame from its
# own roads would give train and test different feature columns, and a model
# fitted on one could not predict on the other.
road_names = list(dict.fromkeys(
    road for cell in df['affected_roads'] for road in _split_roads(cell)
))

df = _encode_roads(df, road_names)
df_teste = _encode_roads(df_teste, road_names)

df

**Transformação da feature "luminosity" em valores numéricos**

In [None]:
df['luminosity'].unique()

In [None]:
# Swap the textual luminosity levels for integer codes in both data sets.
luminosity_codes = {'LOW_LIGHT': 0, 'LIGHT': 1, 'DARK': 2}
replace_map = {'luminosity': luminosity_codes}
for frame in (df, df_teste):
    frame.replace(replace_map, inplace=True)
df.head()

**Transformação da feature "avg_rain" em valores numéricos**

In [None]:
df['avg_rain'].unique()

In [None]:
# Swap the textual rain levels for integer codes in both data sets.
rain_codes = {'Sem Chuva': 0, 'chuva fraca': 1, 'chuva moderada': 2, 'chuva forte': 3}
replace_map = {'avg_rain': rain_codes}
for frame in (df, df_teste):
    frame.replace(replace_map, inplace=True)
df.head()

**Transformação da feature "incidents" em valores numéricos**

In [None]:
# Swap the textual incident levels (the target) for integer codes 0-4.
incident_codes = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very_High': 4}
replace_map = {'incidents': incident_codes}
for frame in (df, df_teste):
    frame.replace(replace_map, inplace=True)
df.head()

#### Adição de feature "dayOfWeek"

In [None]:
import datetime

# English weekday names indexed by date.weekday() (Monday is 0).
_DAY_NAMES = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")


def getDayofWeek(df, year=2021):
    """Return the English weekday name for one record.

    Args:
        df: a single row (anything indexable by column name) providing
            ``record_date_month`` and ``record_date_day``.
        year: calendar year used to build the date; defaults to 2021, the
            value that was previously hard-coded.

    Returns:
        One of "Monday" ... "Sunday".

    Raises:
        ValueError: if month and day do not form a valid date in ``year``.
    """
    mes = int(df['record_date_month'])
    dia = int(df['record_date_day'])
    return _DAY_NAMES[datetime.date(year=year, month=mes, day=dia).weekday()]

# Store the weekday as an integer code (Monday=0 ... Sunday=6): the models
# fitted later in this notebook receive every column of df as a feature, and a
# text column cannot be fed to them. The column is also created on the test
# set so that both frames keep the same feature columns.
day_codes = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
             "Friday": 4, "Saturday": 5, "Sunday": 6}
df['dayOfWeek'] = df.apply(getDayofWeek, axis=1).map(day_codes)
df_teste['dayOfWeek'] = df_teste.apply(getDayofWeek, axis=1).map(day_codes)

df

### 5. Aplicação de modelos de machine learning

#### 5.1. Decision Tree Classifier

Imports necessários para a implementação deste modelo

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn import tree

In [None]:
x = df.drop("incidents", axis = 1)

In [None]:
y = df.incidents

In [None]:
x_train, x_test, y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=2022)

Treino do modelo

In [None]:
# Build the tree with a fixed seed and train it in one expression; the value
# returned by fit() is what gets bound to clf.
clf = DecisionTreeClassifier(random_state=2022).fit(x_train, y_train)

Visualização da árvore de decisão

In [None]:
tree.export_graphviz(clf,
                     out_file="tree_dsCompeticao.dot",
                     filled = True)

from graphviz import Source
Source.from_file("tree_dsCompeticao.dot")

Geração de previsões

In [None]:
predictions = clf.predict(x_test)
pd.DataFrame(predictions)

In [None]:
y_test

Avaliação do modelo

In [None]:
# Draw the confusion matrix of clf on the held-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
plot_confusion_matrix(clf, x_test, y_test) 

In [None]:
print(classification_report(y_test, predictions))

In [None]:
mean_absolute_error(y_test,predictions)

**Geração do ficheiro de submissão para o Decision Tree**

In [None]:
def download_submission_file(model, df_teste, filename):
    """Predict on ``df_teste`` and write the predictions as a submission CSV.

    Generic helper reused by every model in this notebook.

    Args:
        model: any object exposing ``predict(df_teste)``.
        df_teste: features handed to ``model.predict`` unchanged.
        filename: destination path of the CSV; missing parent directories are
            created.

    The file has a 1-based, unnamed index column and an ``Incidents`` column.
    Predictions equal to 0-4 (as ints, integer-valued floats or digit strings)
    are written as 'None', 'Low', 'Medium', 'High' and 'Very_High'; any other
    prediction is written as its string form.
    """
    from pathlib import Path

    labels_by_code = {0: 'None', 1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very_High'}

    def to_label(value):
        # Non-numeric predictions (e.g. a model trained on the text labels)
        # are passed through untouched.
        try:
            number = float(value)
        except (TypeError, ValueError):
            return str(value)
        if number.is_integer():
            return labels_by_code.get(int(number), str(value))
        return str(value)

    # ravel() accepts both (n,) and (n, 1) prediction arrays.
    raw_predictions = np.asarray(model.predict(df_teste)).ravel()
    predictions_dfT = pd.DataFrame({"Incidents": [to_label(v) for v in raw_predictions]})
    predictions_dfT.index += 1

    filepath = Path(filename)
    # to_csv does not create directories, so make sure the target one exists.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    predictions_dfT.to_csv(filepath, index=True)

In [None]:
download_submission_file(clf, df_teste, "submission_files/decisionTree.csv")

#### 5.2 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [None]:
x = df.drop(['incidents'], axis=1)
x

In [None]:
y = df[['incidents']]
y

In [None]:
# Fit one [0, 1] scaler per matrix and keep the scaled values as DataFrames
# carrying the original column names.
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))
x_scaled = pd.DataFrame(scaler_X.fit_transform(x), columns=x.columns)
y_scaled = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns)

In [None]:
x_train, x_test, y_train,y_test = train_test_split(x_scaled,y_scaled,test_size=0.25,random_state=2022)

In [None]:
lr = LogisticRegression(solver="newton-cg", random_state=2022) 

In [None]:
# Turn the scaled target back into the textual class names used for training.
# The keys are the string forms of the scaled values; this assumes the scaler
# mapped the codes 0-4 onto 0.0, 0.25, ..., 1.0 — TODO confirm that all five
# classes occur in the data.
scaled_to_label = {"0.0": 'None', "0.25": 'Low', "0.5": 'Medium', "0.75": 'High', "1.0": 'Very_High'}
replace_map = {'incidents': scaled_to_label}

y_train = pd.DataFrame(y_train, columns=["incidents"]).astype({"incidents": str})
y_train = y_train.replace(replace_map)
y_train.index = y_train.index + 1
y_train

In [None]:
lr.fit(x_train,y_train.values.ravel())

In [None]:
# lr was fitted on MinMax-scaled features, so the test set has to go through
# the same fitted scaler (with the same columns, in the same order) before it
# is used for prediction.
df_teste_scaled = pd.DataFrame(
    scaler_X.transform(df_teste[x.columns]),
    columns=x.columns,
    index=df_teste.index,
)
download_submission_file(lr, df_teste_scaled, "submission_files/logisticRegression.csv")

#### 5.3 Support Vector Machine

In [None]:
x = df.drop(['incidents'], axis=1)

y = df['incidents']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=2022)

In [None]:
from sklearn.svm import SVC

In [None]:
model = SVC(random_state=2022)
model.fit(x_train,y_train)

In [None]:
download_submission_file(model, df_teste, "submission_files/svm.csv")

#### 5.4 Redes neuronais

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import optimizers
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
x = df.drop('incidents',axis=1)
y = df[['incidents']]

In [None]:
# Fit one [0, 1] scaler per matrix and keep the scaled values as DataFrames
# carrying the original column names.
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))
x_scaled = pd.DataFrame(scaler_X.fit_transform(x), columns=x.columns)
y_scaled = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns)

In [None]:
x_scaled.head()

In [None]:
y_scaled.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled,test_size=0.2,random_state=2022)

In [None]:
def build_model(activation = "relu", learning_rate = 0.01, input_dim = None):
    """Build and compile the fully connected regression network.

    Args:
        activation: activation of the two hidden layers.
        learning_rate: learning rate handed to the Adam optimizer.
        input_dim: number of input features. When omitted it is read from the
            notebook's ``x_train``, so the network always matches the feature
            matrix (the previous fixed value of 5 did not).

    Returns:
        A compiled Sequential model with 16- and 8-unit hidden layers and a
        single-unit output, trained on MAE and reporting MAE and MSE.
    """
    if input_dim is None:
        # Looked up at call time, i.e. after the train/test split cell has run.
        input_dim = x_train.shape[1]

    # Three layers; the last one is the output.
    model = Sequential()
    model.add(Dense(16, input_dim = input_dim, activation = activation))
    model.add(Dense(8, activation = activation))
    model.add(Dense(1, activation = "relu"))

    # Loss function, optimizer and the metrics to report.
    model.compile(
        loss = "mae",
        optimizer = optimizers.Adam(learning_rate),
        metrics = ["mae", "mse"])
    return model

In [None]:
model = build_model()
model.summary()

In [None]:
TUNING_DICT = {
    "activation" :    ["relu", "sigmoid"],
    "learning_rate" : [0.01, 0.001]
}

In [None]:
# 5-fold, shuffled cross-validation with a fixed seed.
kf = KFold(n_splits = 5, shuffle = True, random_state = 2022)

model = KerasRegressor(build_fn = build_model, epochs = 20, batch_size = 32)
# refit is the boolean True (it used to be the string "True", which only
# worked because a non-empty string is truthy): the best configuration is
# retrained and exposed as best_estimator_.
grid_search = GridSearchCV(estimator = model,
                           param_grid = TUNING_DICT,
                           cv = kf,
                           scoring = "neg_mean_absolute_error",
                           refit = True,
                           verbose = 1)

# validation_split is forwarded to the underlying Keras fit call.
grid_search.fit(x_train, y_train, validation_split = 0.2) 

In [None]:
# Report the winning configuration, then the mean and spread of every candidate.
print("Best: %f using %s" %(grid_search.best_score_, grid_search.best_params_))
cv_results = grid_search.cv_results_
candidates = zip(cv_results["mean_test_score"],
                 cv_results["std_test_score"],
                 cv_results["params"])
for mean, stdev, param in candidates:
    print("%f (%f) with: %r" %(mean, stdev, param))

In [None]:
#Our best model (remember we set refit=True?)
best_mlp_model = grid_search.best_estimator_

In [None]:
from livelossplot import PlotLossesKerasTF

In [None]:
best_mlp_model.fit(x_train, y_train, epochs = 20,
                   validation_data = (x_test, y_test),
                   callbacks = [PlotLossesKerasTF()], verbose = 1)

In [None]:
#Obtain predictions
predictions = best_mlp_model.predict(x_test)
predictions = predictions.reshape(predictions.shape[0], 1)
predictions[:5]

In [None]:
# Map the model's scaled predictions back to the original incident-level scale
# (this cell used to inverse-transform y_test by mistake).
predictions_unscaled = scaler_y.inverse_transform(predictions)
predictions_unscaled[:5]

In [None]:
# Map the scaled y_test back to the original incident-level scale
# (this cell used to inverse-transform the predictions by mistake).
y_test_unscaled = scaler_y.inverse_transform(y_test)
y_test_unscaled[:5]

In [None]:
from sklearn import metrics

# Error of the predictions against y_test; the MSE is computed once and reused
# for the RMSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

In [None]:
class _UnscaledLevelPredictor:
    """Give a regressor trained in scaled space a classifier-like ``predict``.

    The network was fitted on MinMax-scaled features and a MinMax-scaled
    target, so raw test rows must be scaled on the way in and the outputs
    unscaled and rounded to an incident code on the way out.
    """

    def __init__(self, regressor, feature_scaler, target_scaler, feature_names):
        self.regressor = regressor
        self.feature_scaler = feature_scaler
        self.target_scaler = target_scaler
        self.feature_names = feature_names

    def predict(self, frame):
        """Return integer incident codes for the raw feature rows in ``frame``."""
        scaled_features = self.feature_scaler.transform(frame[self.feature_names])
        scaled_levels = np.asarray(self.regressor.predict(scaled_features)).reshape(-1, 1)
        levels = self.target_scaler.inverse_transform(scaled_levels).ravel()
        # 0 and 4 are the lowest and highest codes of the 'incidents' encoding.
        return np.clip(np.rint(levels), 0, 4).astype(int)


download_submission_file(
    _UnscaledLevelPredictor(best_mlp_model, scaler_X, scaler_y, list(x.columns)),
    df_teste,
    "submission_files/redeNeuronal.csv",
)

#### 5.5 Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
x = df.drop(['incidents'], axis=1)

y = df['incidents']

In [None]:
#scaler_X = MinMaxScaler(feature_range=(0, 1)).fit(x)
#scaler_y = MinMaxScaler(feature_range=(0, 1)).fit(y)
#x_scaled = pd.DataFrame(scaler_X.transform(x[x.columns]), columns=x.columns)
#y_scaled = pd.DataFrame(scaler_y.transform(y[y.columns]), columns=y.columns)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,random_state=2022)

In [None]:
rfc = RandomForestClassifier(random_state = 2022)

In [None]:
rfc.fit(x, y)

In [None]:
download_submission_file(rfc, df_teste, "submission_files/randomForestClassifier.csv")

## TODO:
* Verificar Logistic Regression
* Verificar Redes Neuronais