In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the training set, the competition test set and the sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/allstate-claims-severity/train.csv',
        '/kaggle/input/allstate-claims-severity/test.csv',
        '/kaggle/input/allstate-claims-severity/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for both data sets.
for label, frame in (("Train data dimensions: ", train), ("Test data dimensions: ", test)):
    print(label, frame.shape)

In [None]:
# Show every column name of the training data as a plain list.
print(list(train.columns))

In [None]:
# Preview the first rows of the raw training data.
train.head(n=5)

In [None]:
# Total count of null cells across the whole training frame.
missing_total = train.isnull().sum().sum()
print("Number of missing values", missing_total)

In [None]:
# Summary statistics of the training data.
train_summary = train.describe()
train_summary

### Checking skewness and kurtosis to see whether the continuous features are normally distributed
#### Acceptable values are between -1.5 and +1.5

In [None]:
# Take the 14 columns that sit immediately before the last column.
# NOTE(review): presumably these are the continuous features and the last
# column is the target — verify against train.columns.
cont_columns = slice(-15, -1)
cont_features = train.iloc[:, cont_columns]
cont_features.head(n=5)

In [None]:
# Per-column skewness of the selected features.
skew_by_feature = cont_features.skew()
print(skew_by_feature)

In [None]:
# Per-column kurtosis of the selected features.
kurtosis_by_feature = cont_features.kurt()
print(kurtosis_by_feature)

### Analysis of loss feature

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot of the raw target to show its spread and outliers.
plt.figure(figsize=(13,9))
# Pass the series explicitly as `x=`: the positional form is deprecated and
# the meaning of the first positional slot changed between seaborn releases,
# so the keyword keeps the plot (horizontal box) the same on every version.
sns.boxplot(x=train["loss"])

In [None]:
# Distribution plot of the raw target.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before switching to histplot/displot.
loss_figure = plt.figure(figsize=(13, 9))
sns.distplot(train["loss"])

#### Loss is highly skewed to the right because there are many outliers in the data, as we can see from the box plot. So we can apply a log transform to see whether it gives a normal distribution.

In [None]:
# Box plot of the log1p-transformed target.
plt.figure(figsize=(13,9))
# Pass the series explicitly as `x=`: the positional form is deprecated and
# the meaning of the first positional slot changed between seaborn releases,
# so the keyword keeps the plot (horizontal box) the same on every version.
sns.boxplot(x=np.log1p(train["loss"]))

In [None]:
# Distribution plot of the log1p-transformed target.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before switching to histplot/displot.
log_loss = np.log1p(train["loss"])
log_loss_figure = plt.figure(figsize=(13, 9))
sns.distplot(log_loss)

#### After applying the logarithm to the loss feature we get an approximately normal distribution, so we can train a model on this target feature without removing the outliers.

### Convert categorical string values to numeric values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every column whose name contains 'cat' with integer codes, in place.
# The single encoder is re-fitted for each column in turn.
enc = LabelEncoder()
categorical_columns = [name for name in train if 'cat' in name]
for name in categorical_columns:
    train[name] = enc.fit_transform(train[name])

In [None]:
# Preview the training data after label encoding.
train.head(n=5)

In [None]:
# Features: every column except the row identifier and the target.
X = train.drop(columns=["id", "loss"])
# Target: the raw `loss` column.
# NOTE(review): the log1p transform explored in the plots above is not applied here.
Y = train.loc[:, "loss"]

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out part of the labelled data for evaluation; the seed is fixed so the
# split is reproducible.
split_parts = train_test_split(X, Y, random_state=10)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so the hold-out rows do not
# contribute to the mean/variance that is later used to standardise them.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Keep the full feature matrix on the same scale (same fitted statistics) for
# any later cell that uses `X`.
X = sc.transform(X)

## Evaluation and prediction
 ### XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=1000)
# Fit on the training-split rows only. Fitting on every row would include the
# hold-out rows, so the metrics computed on X_test would be measured on data
# the model has already seen.
# The rows are taken from the scaled matrix `X` (row-aligned with `train`) so
# the training features are on the same scale as `X_test`; `Y_train` keeps the
# original row labels, which are translated to positions in `train`.
train_rows = train.index.get_indexer(Y_train.index)
model.fit(np.asarray(X)[train_rows], Y_train)

In [None]:
# Predict the target for the hold-out features produced by the train/test split.
Y_predict = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so swapping them reports a wrong score; MAE and
# MSE are symmetric but follow the same order for consistency.
print("r2 score is ", r2_score(Y_test, Y_predict))
print("MAE is ",mean_absolute_error(Y_test, Y_predict))
print("MSE score is ", mean_squared_error(Y_test, Y_predict))

## TEST DATA

In [None]:
# Keep the ids aside; the `id` column is dropped from the features further down.
test_id = test.loc[:, 'id']
test.head(n=5)

In [None]:
# This is the test-data section, so count the null cells of `test`
# (the training frame was already checked earlier in the notebook).
print("Number of missing values",test.isnull().sum().sum())

In [None]:
# Encode the test categoricals with the SAME label -> code mapping that the
# training data received. Fitting a fresh encoder on `test` assigns codes from
# the test set's own categories, which differ from the training codes whenever
# the two sets do not contain exactly the same categories.
# `train` already holds integer codes at this point, so the original string
# categories are re-read from the training CSV to rebuild each mapping.
train_categories = pd.read_csv(
    '/kaggle/input/allstate-claims-severity/train.csv',
    usecols=lambda name: 'cat' in name,
)
enc = LabelEncoder()
for i in test:
    # Skip non-categorical columns, and columns that are already numeric
    # because this cell was run before.
    if 'cat' in i and not pd.api.types.is_numeric_dtype(test[i]):
        enc.fit(train_categories[i])
        codes = {label: code for code, label in enumerate(enc.classes_)}
        # Categories that never occur in the training data have no code;
        # they are mapped to the sentinel -1 instead of raising.
        test[i] = test[i].map(codes).fillna(-1).astype(int)

In [None]:
# Preview the test data after label encoding.
test.head(n=5)

In [None]:
# Remove the identifier column, then apply the already-fitted scaler.
# After this cell `test` is the scaler's output rather than the original frame.
test_features = test.drop(columns=["id"])
test = sc.transform(test_features)

In [None]:
# Predict the target for the preprocessed (id dropped, scaled) test features.
prediction = model.predict(test)

In [None]:
# Build the submission with the exact header of the provided sample file
# instead of an ad-hoc 'prediction' column, so the output matches the format
# the competition expects.
# NOTE(review): the sample header is expected to be `id`, `loss` — confirm.
id_column, target_column = sample_submission.columns[:2]
submission = pd.DataFrame({id_column: test_id, target_column: prediction})

In [None]:
# Preview the first rows of the submission frame.
submission.head(n=5)

In [None]:
# Write the submission to disk without the frame's index column.
output_path = 'Submission_ACS.csv'
submission.to_csv(output_path, index=False)