In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import torch.nn as nn
from torch import optim 
import torch
import torch.utils.data as Data
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Read the training CSV from the relative ../input directory into a DataFrame.
train_df = pd.read_csv("../input/allstate-claims-severity/train.csv")
# Report (rows, columns) and show the first rows as the cell output.
print(train_df.shape)
train_df.head()

In [None]:
# Separate the regression target ("loss") from every other column.
features = train_df.drop(columns="loss")
loss = train_df["loss"]
print(loss.shape)
print(features.shape)

### Check for Skewness in the data. 

In [None]:
# Summary statistics of the numeric columns (describe() skips the string-valued ones).
features.describe()

In [None]:
# Skewness of every continuous ("cont*") feature, keyed by column name.
# Columns are indexed directly; the earlier eval() of a formatted string did the
# same lookup but was slower and would break on any column name containing a quote.
cont_skewness_dict = {
    column: features[column].skew()
    for column in features.columns
    if column.startswith("cont")
}
print(cont_skewness_dict)

# Report every column whose |skewness| exceeds 1. The loop no longer stops at the
# first hit, so the output actually shows how many features are strongly skewed.
for value in cont_skewness_dict:
    if abs(cont_skewness_dict[value]) > 1:
        print("Data skewness over at one at", str(value))

# Skewness of the target; a large value is the motivation for modelling log(loss).
print(loss.skew())

In [None]:
# Compare the mean of the raw loss with its maximum to see how extreme the
# largest claims are. Nothing is removed from the data in this cell.
average_loss = loss.mean()
maximum_loss = loss.max()
print("Average_loss {}".format(average_loss))
print("Maximum_loss {}".format(maximum_loss))



In [None]:
# Baseline: ordinary least squares on one-hot encoded features, predicting log(loss).
# The "id" column is a row identifier, not a predictor, so it is dropped before
# encoding (the PCA pipeline further down drops it as well).
features_train = pd.get_dummies(features.drop(columns="id"))
loss_train = np.log(loss)

# Fixed random_state so the hold-out split, and therefore the baseline scores,
# are the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features_train, loss_train, test_size=0.33, random_state=42
)

linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
base_predictions = linear_regression.predict(X_test)

# The model works in log space; exp() maps targets and predictions back to the
# original loss scale before the errors are computed.
print("Mean squared error {}".format(mean_squared_error(np.exp(y_test), np.exp(base_predictions))))
print("Mean absolute error {}".format(mean_absolute_error(np.exp(y_test), np.exp(base_predictions))))



In [None]:
# Natural log of the target; this is the regression target for the PCA-based models below.
loss_train = np.log(loss)

### Here we are going to use PCA or Principal Component Analysis

In [None]:
#Lets get the columns for cont variables and category variables 
skimmed_features = features.drop("id", axis = 1)
category_features = skimmed_features.iloc[:, :116]
cont_features = skimmed_features.iloc[:, 116:]

In [None]:
# Sanity check: (rows, columns) of the categorical and continuous groups.
print(category_features.shape)
print(cont_features.shape)

In [None]:
# Project the continuous features onto 9 principal components.
pca = PCA(n_components=9)
cont_reduced = pca.fit_transform(cont_features)
print(cont_reduced.shape)

# Fraction of the total variance retained by the kept components (cell output).
cont_explained = sum(pca.explained_variance_ratio_)
cont_explained
    

In [None]:
# One-hot encode the categorical columns; the cell output is the resulting (rows, columns).
sparse_category_features = pd.get_dummies(category_features)
sparse_category_features.shape

In [None]:
# Project the one-hot encoded categorical features onto 150 principal components.
# Note: this rebinds `pca`, replacing the model fitted on the continuous features.
pca = PCA(n_components=150)
cat_reduced = pca.fit_transform(sparse_category_features)
print(cat_reduced.shape)

# Fraction of the total variance retained by the kept components (cell output).
cat_explained = sum(pca.explained_variance_ratio_)
cat_explained

In [None]:
#Now we are going to combine the data into one dataframe.
reduced_feature = np.hstack((cat_reduced,cont_reduced))
print("Reduced PCA Dataset: {}".format(reduced_feature.shape))

### Now we are going to try to run some supervised Models on the data

In [None]:
# Container types of the feature matrix and the target that go into the split.
# The former "Did not use fit_transform with PCA" message was removed: both PCA
# cells above call fit_transform, so the message was stale and misleading.
print(type(reduced_feature))
print(type(loss_train))

In [None]:
# Hold out 33% of the PCA-reduced data for evaluation. A fixed random_state
# keeps the split identical across runs, so the scores of the models below can
# be compared and reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    reduced_feature, loss_train, test_size=0.33, random_state=42
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
def find_scores(pred, y_true):
    """Print the MAE and MSE of log-scale predictions on the exponentiated scale.

    Both arguments are passed through ``np.exp`` before the metrics are
    computed, so they are expected to be on the log scale.

    Args:
        pred: Predicted values.
        y_true: Reference values, aligned with ``pred``.
    """
    actual = np.exp(y_true)
    predicted = np.exp(pred)
    print("The Means absolute error: {}".format(mean_absolute_error(actual, predicted)))
    print("The Mean Squared error: {}".format(mean_squared_error(actual, predicted)))

In [None]:
# Container types of the training features and training target after the split.
print(type(X_train))
print(type(y_train))

In [None]:
#Lets run a polynomial regression

# polynomial_object = PolynomialFeatures(2)
# poly_data = polynomial_object.fit_transform(X_train)

# linear_regression = LinearRegression()
# linear_regression.fit(X_train, y_train)
# polynomial_pred = linear_regression.predict(X_test)

# find_scores(polynomial_pred, y_test)
#Uncomment to use Polynomial Regression. Note: Takes a lot of RAM. 

## Decision Tree Regressor Model

In [None]:
#Now lets run a Decision Tree Regressor except now we will use GridSearchCV to validate the best model
parameters = {"max_depth": [5, 10]}
tree_scoring = make_scorer(mean_squared_error)
tree_regressor = DecisionTreeRegressor()
grid = GridSearchCV(tree_regressor, parameters, scoring = tree_scoring)
grid_fit = grid.fit(X_train, y_train)
best_tree = grid_fit.best_estimator_
print("Best estimator: {}".format(best_tree))

best_tree_fit = best_tree.fit(X_train, y_train)
tree_predictions = best_tree_fit.predict(X_test)

find_scores(tree_predictions, y_test)

## Random Forest Model (Bagging Model)

In [None]:
# parameters = {"n_estimators": [5, 10]}
# random_forest = RandomForestRegressor(max_depth = 5)
# grid = GridSearchCV(random_forest, parameters, scoring = absolute_scoring)
# grid_fit = grid.fit(X_train, y_train)
# best_forest = grid_fit.best_estimator_
# print("Best random Forest{}".format(best_forest))
# print("The run time{}".format(grid_fit.refit_time_))

# best_forest_fit = best_forest.fit(X_train, y_train)
# forest_predictions = best_forest_fit.predict(X_test)

# Fixed-size forest (10 trees, depth 5) fitted on the training split.
random_forest = RandomForestRegressor(n_estimators = 10, max_depth = 5)
# fit() hands back the fitted estimator, which is then used for prediction.
random_forest_fit = random_forest.fit(X_train, y_train)
forest_predictions = random_forest_fit.predict(X_test)

# Errors are printed after exponentiating predictions and targets.
find_scores(forest_predictions, y_test)

## Lasso Model (Regularized Linear Regression)

In [None]:
# Lasso (L1-regularised linear regression) with alpha chosen by cross-validated grid search.
parameters = {"alpha": [0.5, 1, 2, 5]}
# GridSearchCV maximises the score, so the MAE scorer is built with
# greater_is_better=False (negated error). Without it the search would pick the
# alpha with the largest mean absolute error.
absolute_scoring = make_scorer(mean_absolute_error, greater_is_better=False)
lasso_model = Lasso()
grid = GridSearchCV(lasso_model, parameters, scoring=absolute_scoring)
grid_fit = grid.fit(X_train, y_train)
best_lasso = grid_fit.best_estimator_
print("Best Lasso Model L1 Regularization: {}".format(best_lasso))
print("The run time: {}".format(grid_fit.refit_time_))

# best_estimator_ was already refitted on all of X_train by the grid search
# (refit=True is the default), so no second fit is needed.
best_lasso_fit = best_lasso
lasso_predictions = best_lasso_fit.predict(X_test)

find_scores(lasso_predictions, y_test)

In [None]:
# (rows, columns) of the training features; the column count is the network's input width below.
X_train.shape

## AdaBoostRegressor Model

In [None]:
# AdaBoost ensemble with 10 estimators and a learning rate of 0.5.
ada_boost = AdaBoostRegressor(n_estimators = 10, learning_rate = 0.5)
# ada_boost_fit holds fit()'s return value; prediction below goes through ada_boost.
ada_boost_fit = ada_boost.fit(X_train, y_train)
ada_boost_predictions = ada_boost.predict(X_test)

# Errors are printed after exponentiating predictions and targets.
find_scores(ada_boost_predictions, y_test)

## Feed-Forward Neural Network (PyTorch)

In [None]:
class Net(nn.Module):
    """Feed-forward regressor with one hidden layer: Linear -> Tanh -> Linear.

    Args:
        n_inputs: Number of input features per sample. Defaults to 159, the
            width that was previously hard-coded.
        n_hidden: Number of units in the hidden layer. Defaults to 30.
    """

    def __init__(self, n_inputs=159, n_hidden=30):
        super().__init__()

        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, 1)

        self.tanh = nn.Tanh()

    def forward(self, x):
        """Map a batch of feature rows to one predicted value per row.

        Args:
            x: Float tensor whose last dimension has size ``n_inputs``,
                e.g. ``(batch, n_inputs)``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        hidden = self.tanh(self.fc1(x))
        # No activation on the output: this is an unbounded regression target.
        return self.fc2(hidden)
        
    

In [None]:
# Full-batch training of the network on the log-loss target.
net = Net()
optimizer = optim.Adam(net.parameters(), lr = 0.01)
criterion = nn.MSELoss()

# Convert the training data to float32 tensors once, before the loop, instead of
# on every epoch. np.asarray() first turns a pandas Series (or an existing
# tensor) into a plain array, so the conversion does not depend on the Series
# index and the cell can be re-run. The names are kept because later cells read
# X_train / y_train as tensors.
X_train = torch.as_tensor(np.asarray(X_train), dtype=torch.float32)
y_train = torch.as_tensor(np.asarray(y_train), dtype=torch.float32).view(-1, 1)

for epoch in range(151):

    optimizer.zero_grad()
    output = net(X_train)
    # Named train_loss so the `loss` Series holding the target column is not overwritten.
    train_loss = criterion(output, y_train)
    train_loss.backward()
    optimizer.step()
    epoch_loss = train_loss.item()

    if epoch % 10 == 0:

        print("Epoch: {}".format(epoch))
        # NOTE(review): this prints exp(MSE in log space), which is only a
        # monotone progress indicator and not an error in loss units.
        print("Loss: {:.3f}".format(np.exp(epoch_loss)))
    

In [None]:
# Evaluate the trained network on the held-out split.
# np.asarray() accepts an ndarray, a pandas Series or an already-converted
# tensor, so this cell can be re-run without failing on `.values`.
X_test = torch.as_tensor(np.asarray(X_test), dtype=torch.float32)
y_test = torch.as_tensor(np.asarray(y_test), dtype=torch.float32).view(-1, 1)

# No gradients are needed for evaluation; no_grad() avoids building the autograd
# graph for the whole test set.
with torch.no_grad():
    test_output = net(X_test)
    # Named test_loss so the `loss` Series holding the target column is not overwritten.
    test_loss = criterion(test_output, y_test)

# NOTE(review): exp(MSE in log space) mirrors the training printout; it is not
# an error in loss units.
print("Loss: {}".format(torch.exp(test_loss)))

In [None]:
# First ten held-out targets, for an eyeball comparison with the predictions in the next cell.
print(y_test[0:10])

In [None]:
# Network output for the first ten held-out rows (same scale as y_test).
net(X_test[0:10])

## How would you batch data

pandas series --> numpy_array --> torch tensor

``` for batch_i, (data, label) in enumerate(loader): ```

In [None]:
# Wrap features and targets in a TensorDataset and serve them in batches of 32.
# The inputs are converted explicitly so this cell does not rely on the training
# cell having already turned X_train / y_train into tensors. For inputs that are
# already float32 tensors of the right shape this is a no-op.
torch_dataset = Data.TensorDataset(
    torch.as_tensor(np.asarray(X_train), dtype=torch.float32),
    torch.as_tensor(np.asarray(y_train), dtype=torch.float32).view(-1, 1),
)

loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=32)

In [None]:
# Pull the first batch from the loader. The built-in next() is the Python 3
# iterator protocol; the iterator's .next() method is not part of it and is
# missing from current DataLoader iterators.
iterator = iter(loader)
data, label = next(iterator)

In [None]:
# Feature tensor of the first batch and its shape.
print(data)
print(data.shape)

In [None]:
# Target tensor of the first batch and its shape.
print(label)
print(label.shape)