In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data viz. and EDA
import matplotlib.pyplot as plt 
%matplotlib inline  
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

## For scaling data 
from mlxtend.preprocessing import minmax_scaling 

# Tensorflow 
import tensorflow as tf
from tensorflow.keras import callbacks

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# data.info()

In [None]:
# Load the training CSV into `data` and preview its first rows.
data = pd.read_csv("../input/libertymutual/train.csv")
data.head()

In [None]:
# Load the test CSV into `test` and preview its first rows.
test = pd.read_csv("../input/libertymutual/test.csv")
test.head()

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# T1_V16, T1_V11

# T1_V4 - V9, V12, V15,17
# 3,5,11,12,13

# T1_V1,2,3,10,13,14
# 1,2,4,6-10,14,15

In [None]:
# for i in range(34):
#     print(len(data.iloc[:, i].unique()), data.iloc[:, i].dtype, data.iloc[:, i].name)

In [None]:
# Next, use mutual information (MI) scores to see how strongly each feature relates to the target
from sklearn.feature_selection import mutual_info_regression

# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Object and category columns are replaced by integer codes (via
    ``factorize``) on a copy of ``X``; the caller's frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Regression target, one value per row of ``X``.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by column name and sorted from the
        highest score to the lowest.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features now have integer dtypes. Flag them explicitly:
    # with the default discrete_features='auto', a dense frame is treated as
    # entirely continuous, so the factorized codes would be scored as if they
    # were continuous measurements.
    discrete_features = [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes]
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=0
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores on the current figure.

    ``scores`` is sorted in ascending order first; its index values are used
    as the y-axis tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Work on a copy of the training frame so `data` itself stays intact.
X = data.copy()
# Split off the 'Hazard' column as the target; pop() also removes it from X.
y = X.pop('Hazard')
# Remove the 'Id' column from the features. As the last expression of the
# cell, the popped column is also what the notebook displays as cell output.
X.pop('Id')

# mi_scores = make_mi_scores(X, y)

In [None]:
# (rows, columns) of the feature frame after 'Hazard' and 'Id' were removed.
print(X.shape)

In [None]:
# print(mi_scores)

In [None]:
def correlation_plot(frame=None):
    """Render an interactive heatmap of pairwise column correlations.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are correlated. When omitted, the notebook-level
        feature frame ``X`` is used, which keeps ``correlation_plot()``
        working exactly as before.

    Only numeric and boolean columns take part: ``DataFrame.corr`` raises on
    string columns in recent pandas releases (older releases silently skipped
    them), so they are filtered out explicitly here.
    """
    if frame is None:
        frame = X
    numeric_frame = frame.select_dtypes(include=["number", "bool"])
    #correlation
    correlation = numeric_frame.corr()
    #tick labels
    matrix_cols = correlation.columns.tolist()
    #convert to array
    corr_array  = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                       x = matrix_cols,
                       y = matrix_cols,
                       colorscale='Viridis',
                       colorbar   = dict()
                      )
    layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            margin  = dict(r = 0 ,l = 100,
                                           t = 0,b = 100,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )
    fig = go.Figure(data = [trace],layout = layout)
    py.iplot(fig)


In [None]:
# 1 - 16, 12, 10, 6
# 2 - 7, 10, 11, 12

In [None]:
# T1_V4, 5, 7, 8 V9, V15,17
# 3,5,13

# T1_V1,2,3,13,14
# 1,2,4,6,8,9,14,15

In [None]:
# correlation_plot()

In [None]:
# T1_V4, 5, 7, 8 V9, V15,17
# 3,5,13

# T1_V1,2,3,13,14
# 1,2,4,6,8,9,14,15

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# Columns selected as numeric model inputs.
# NOTE(review): presumably all of these are numeric dtype - confirm with data.dtypes.
feature_nums = ['T1_V1', 'T1_V2', 'T1_V3', 'T1_V13', 'T1_V14', 'T2_V1', 'T2_V2', 'T2_V4', 'T2_V6', 'T2_V8',
               'T2_V9', 'T2_V14', 'T2_V15']
# Columns selected as categorical model inputs; they go through pd.get_dummies
# later in the notebook.
# NOTE(review): presumably all of these are object dtype - confirm with data.dtypes.
feature_objs = ['T1_V4', 'T1_V5', 'T1_V7', 'T1_V8', 'T1_V9', 'T1_V15', 'T1_V17', 'T2_V3', 'T2_V5', 'T2_V13']

In [None]:
# Rebuild X from only the selected columns (numeric first, then categorical).
# This replaces the earlier X that held every column except 'Hazard' and 'Id'.
all_cols = feature_nums + feature_objs
X = data[all_cols].copy()
X.head()

In [None]:
# (rows, columns) of the selected-feature frame.
X.shape

In [None]:
# Number of target values; train_test_split needs this to equal the row count of X.
len(y)

In [None]:
# Use 80% of the data for training and 20% for validation.
# random_state=0 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [None]:
# One-hot encode each split separately, then give the validation frame the
# training frame's columns (left join on the column axis).
train_encoded = pd.get_dummies(X_train)
valid_encoded = pd.get_dummies(X_valid)
X_train, X_valid = train_encoded.align(valid_encoded, join='left', axis=1)

In [None]:
# Replace NaN with 0 in both splits; the alignment above leaves NaN in any
# one-hot column that the validation split did not have.
X_train, X_valid = (split.fillna(0) for split in (X_train, X_valid))
# Re-check the shapes: features first, then targets.
for feature_split in (X_train, X_valid):
    print(feature_split.shape)
for target_split in (y_train, y_valid):
    print(len(target_split))

In [None]:
# Gradient-boosted regression with XGBoost, scored on the validation split.
from xgboost import XGBRegressor

# Define the model.
# early_stopping_rounds is set on the estimator itself: XGBoost 2.x no longer
# accepts it as a fit() argument (it was deprecated there in 1.6).
my_model_1 = XGBRegressor(n_estimators=1000,
                          learning_rate=0.05,
                          early_stopping_rounds=100)

# Fit the model. Training stops once the eval_set score has not improved for
# 100 consecutive rounds.
# NOTE: the same validation split also drives early stopping, so the scores
# printed below are an optimistic estimate.
my_model_1.fit(X_train, y_train,
               eval_set=[(X_valid, y_valid)],
               verbose=False)

# Get predictions
predictions_1 = my_model_1.predict(X_valid)

# Calculate MAE (scikit-learn metrics take y_true first, then y_pred)
mae_1 = mean_absolute_error(y_valid, predictions_1)
print("Mean Absolute Error by XGBoost: " , mae_1)

# Calculate r2
r2_score_1 = r2_score(y_valid, predictions_1)
print("R2 Score by XGBoost: ", r2_score_1)

In [None]:
# Mean Absolute Error by XGBoost:  2.7116258179206474
# R2 Score by XGBoost:  0.12112721344092536

In [None]:
from sklearn.ensemble import RandomForestRegressor
def score_model(n_est):
    """Fit a random forest with ``n_est`` trees and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. Prints a progress line once the model has been fitted and
    has predicted.

    Parameters
    ----------
    n_est : int
        Passed to ``RandomForestRegressor`` as ``n_estimators``.

    Returns
    -------
    tuple
        ``(mean_absolute_error, r2_score)`` on the validation split.
    """
    forest = RandomForestRegressor(n_estimators=n_est, random_state=0)
    forest.fit(X_train, y_train)
    valid_pred = forest.predict(X_valid)
    print("Done {} round".format(n_est))
    # Both metrics take (y_true, y_pred); keep the argument order consistent.
    return mean_absolute_error(y_valid, valid_pred), r2_score(y_valid, valid_pred)

In [None]:
# The n_estimators value affects the result a lot, so check which n_estimators works best for our data
# mea_scores = {}
# r2_scores = {}
# for i in range(14,21): #14 - 20
#     mea_scores[i*10], r2_scores[i*10] = score_model(i*10)

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# plt.plot(list(mea_scores.keys()), list(mea_scores.values()))
# plt.show()

In [None]:
# plt.plot(list(r2_scores.keys()), list(r2_scores.values()))
# plt.show()

In [None]:
# Select the same feature columns from the test set that were used for training.
X_test = test[all_cols].copy()
X_test.head()

In [None]:
# One-hot encode the test features, then force exactly the columns (and column
# order) the model was trained on. Categories absent from the test set become
# all-zero columns; categories never seen in training are dropped.
# The previous `X_test.fillna(0)` discarded its result, and the test frame was
# never aligned to X_train, so predict() could receive a different feature set.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
X_test.shape

In [None]:
# Predict on the encoded test features with the fitted XGBoost model.
predictions_last = my_model_1.predict(X_test) 

In [None]:
# print(pre)

In [None]:
# Cast the predictions to int. For floating-point input, astype(int) drops the
# fractional part instead of rounding to the nearest integer.
# NOTE(review): confirm that truncation (rather than rounding, or submitting
# the raw predictions) is what the submission is meant to contain.
pre = predictions_last.astype(int)

In [None]:
# Write the submission file: the test Ids paired with the predicted Hazard.
output = test[['Id']].assign(Hazard=pre)
output.to_csv('submission.csv', index=False)

In [None]:
# Read the file just written back in and preview it.
test_re = pd.read_csv("./submission.csv")
test_re.head()

In [None]:
# Load the provided sample submission and preview it, for comparison with the
# submission preview above.
sam = pd.read_csv("../input/libertymutual/sample_submission.csv")
sam.head()