In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


import seaborn as sns
# import xgboost as xgb

import matplotlib.pyplot as plt


import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings as ws
# Set the warnings module's `defaultaction` attribute to "ignore".
# NOTE(review): `defaultaction` is not part of the documented `warnings` API;
# `ws.filterwarnings("ignore")` is the documented way to silence warnings - confirm intent.
ws.defaultaction = "ignore"

In [None]:
# Six evenly spaced values from 0 to 10, laid out as a 2x3 grid.
arr = np.linspace(0.0, 10.0, num=6).reshape((2, 3))
print(arr)

# Element at row 0, column 1 (rendered as the cell output).
arr[0, 1]

In [None]:
# Load the insurance dataset from a path relative to the notebook's working directory.
data = pd.read_csv('../input/insurance/insurance.csv')

## Studying and Cleaning Data 

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Quick structural overview of the raw frame: shape, column means, non-null
# counts, dtypes and per-column null totals.
separator = '*******************************'

print('Data shape: ',data.shape, '\n')
print(separator)
# numeric_only=True: 'sex', 'smoker' and 'region' are still strings at this
# point (they are encoded in a later cell), and averaging string columns
# raises a TypeError on recent pandas versions.
print('Data means:\n',data.mean(numeric_only=True), '\n')
print(separator)
print('Data features count:\n',data.count(), '\n')
print(separator)
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own instead of being passed to print().
print('Data Info about null vals:')
data.info()
print()
print(separator)
print('Data Features null vals:\n',data.isnull().sum(), '\n')

In [None]:
# Distribution of insurance charges (visual check of how skewed the target is).
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(data['charges'], bins=50, color='#3f4c6b', ec='#606c88')
ax.set_title('Insurance charges in $ vs Nr. of people', fontsize=18)
ax.set_ylabel('Nr. of people', fontsize=14)
ax.set_xlabel('Prices in $', fontsize=14)
plt.show()

## Data correlation

## $$ \rho _{XY} = corr(X, Y) $$

## $$ -1.0 \leq \rho _{XY} \leq +1.0$$

In [None]:
# Integer codes for the three categorical columns:
#   sex    -> 0: female, 1: male
#   smoker -> 0: no,     1: yes
#   region -> 0: southeast, 1: southwest, 2: northeast, 3: northwest
encodings = {
    'sex': {"female": 0, "male": 1},
    'smoker': {"yes": 1, "no": 0},
    'region': {"southeast": 0, "southwest": 1, "northeast": 2, "northwest": 3},
}

# Overwrite each column with its encoded version.
for column, mapping in encodings.items():
    data[column] = data[column].replace(mapping)


In [None]:
# Extracting relevant data and ignoring repetitive correlations
# Start from an all-zero array shaped like the correlation matrix.
mask = np.zeros_like(data.corr())
# Index arrays addressing the upper triangle of that array.
triangle_indices = np.triu_indices_from(mask)
# Flag the upper-triangle positions; `mask` is later passed to the heatmap.
mask[triangle_indices] = True
# Cell output: the full correlation matrix.
data.corr()

In [None]:
# Correlations value graph
plt.figure(figsize=(10, 8))

# Annotated heatmap of the pairwise correlations; `mask` is forwarded to seaborn.
sns.heatmap(data.corr(), mask=mask, annot=True, annot_kws={"size":14})

#Analysis: We can clearly notice that there is a strong correlation between the age and the charges
# NOTE(review): confirm against the rendered heatmap which feature has the
# largest correlation with charges before relying on this statement.

In [None]:
# Global seaborn styling for the figures created from here on.
sns.set_context('talk')
sns.set_style('darkgrid')
# One panel per (smoker, sex) combination: rows split by smoker, columns by sex.
g = sns.FacetGrid(data, row="smoker", col="sex", margin_titles=True, height=5, )
# In every panel: bmi vs. charges scatter (with x jitter) plus a fitted regression line.
g.map(sns.regplot, "bmi", "charges", color="#12c2e9", x_jitter=.1, line_kws={"color": "#f64f59"})

In [None]:
# Bar chart of charges per region, split by sex.
region_charges = sns.catplot(x="region", y='charges', data=data, legend_out = False,
            height=8, hue="sex", kind='bar', palette=["#f64f59", "#12c2e9"]);

# legend_out=False keeps the legend on the axes, so it can be fetched from there.
leg = region_charges.axes.flat[0].get_legend()
region_charges.set(xlabel='Regions', ylabel='Charges',
                   title='Regions vs. Insurance Charges')

# Tick labels follow the integer region codes assigned in the encoding cell
# (0: southeast, 1: southwest, 2: northeast, 3: northwest).
region_charges.set_xticklabels(['Southeast','Southwest','Northeast','Northwest'])

# Replace the numeric sex codes in the legend with readable labels
# (0: female, 1: male). The first label previously read 'Felmale'.
leg.set_title('Gender')
new_labels = ['Female', 'Male']
for legend_text, label in zip(leg.texts, new_labels):
    legend_text.set_text(label)
plt.show()


# Bar chart of charges per number of children.
child_charges = sns.catplot(x="children", y='charges', data=data, height=8, legend_out = False,
           kind='bar', palette=["#aa4b6b", "#3b8d99"]);

child_charges.set(xlabel='# of Children', ylabel='Charges',
                   title='Nr. of Children vs. Insurance Charges')



# Training & Test Datasets Split

In [None]:
# Target column and feature matrix for the untransformed (raw charges) model.
charges = data['charges']
features = data.drop(['charges'], axis=1) #Dropping charges column

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    charges,
                                                    test_size= 0.2,
                                                    random_state=42)

# Ordinary least squares fit on the raw target.
regression = LinearRegression()
model = regression.fit(X_train, y_train)
prediction = regression.predict(X_test)

print('Test Data r-Squared score: ', regression.score(X_test, y_test))
print('Train Data r-Squared score: ', regression.score(X_train, y_train))
print(X_train, y_train)

# A bare expression in the middle of a cell is never rendered, so the
# coefficient table is printed explicitly.
coef_table = pd.DataFrame(data=regression.coef_, index=X_train.columns, columns=['coef'])
print(coef_table)

# Pre-transformation skew val
pre_trans = round(data['charges'].skew(), 3)
print('Pre-transformation skew val: ', pre_trans)
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the minimum seaborn version is confirmed.
sns.distplot(data['charges'])
plt.title(f'Original Charges with skew {pre_trans}')
plt.show()

# Data Transformation

In [None]:
# Post-transformation skew val
# The skew has to be measured on the log-transformed charges. The previous
# expression took the log of the untransformed skew, which is a different
# (and meaningless) number.
y_log = np.log(data['charges'])
post_trans = round(y_log.skew(), 3)
print('Post-transformation skew val: ', post_trans)

# Distribution of the log-transformed target.
sns.distplot(y_log)
plt.title(f'Log Charges with skew {post_trans}')

In [None]:
# Apply the transformation.
log_charges = np.log(data['charges'])

transformed_data = data.drop('charges', axis=1)


# Same 80/20 split and seed as the baseline model, but with the log target.
X_train, X_test, y_train, y_test = train_test_split(transformed_data,
                                                    log_charges,
                                                    test_size= 0.2,
                                                    random_state=42)

regression_t = LinearRegression()
model_t = regression_t.fit(X_train, y_train)
prediction_t = regression_t.predict(X_test)

# A bare expression in the middle of a cell is never rendered, so the
# labelled coefficient table is printed explicitly.
coef_table_t = pd.DataFrame(data=regression_t.coef_, index=X_train.columns, columns=['coef'])
print(coef_table_t)

# Predicted vs. actual log charges; the red line marks perfect predictions.
plt.scatter(y_test, prediction_t)
plt.plot(y_test, y_test, color='red')
# RMSE is computed on the log scale, since both arguments are log charges.
rmse = np.sqrt(mean_squared_error(y_test, prediction_t))


print('Intercept: ', regression_t.intercept_)
print('Coef: ', regression_t.coef_)
print('rmse: ', rmse)
print('Test Data r-Squared score: ', regression_t.score(X_test, y_test))
print('Train Data r-Squared score: ', regression_t.score(X_train, y_train))




## Calculating P-values

In [None]:
x_include_const = sm.add_constant(X_train) #Adding an intercept

# statsmodels OLS on the same training data, used here for its p-values.
model = sm.OLS(y_train, x_include_const)
results = model.fit()


# Graph of Actual vs. Predicted Prices
plt.figure(figsize=(10, 8))
corr = round(y_train.corr(results.fittedvalues), 2)
plt.scatter(x=y_train, y=results.fittedvalues, c='black', alpha=0.6)
# Identity line: points on it are predicted exactly.
plt.plot(y_train, y_train, color='cyan')

# Raw strings keep the LaTeX backslashes literal; '\h' in a normal string is
# an invalid escape sequence and triggers a warning on current Python versions.
plt.xlabel('Actual log prices $y _i$', fontsize=14)
plt.ylabel(r'Predicted log prices $\hat y _i$', fontsize=14)
plt.title(rf'Actual vs Predicted log prices $y _i$ vs $\hat y _i$ (Corr: {corr})',
          fontsize=18)

plt.show()


# Coefficients next to their p-values (rounded to 3 decimals); cell output.
pd.DataFrame({'Coef' : results.params,
             'P-values' : round(results.pvalues, 3)})
#Hence, all the features are statistically significant

In [None]:
# Function mse(y, y_hat)

def mse(y, y_hat):
    """Return the mean of the squared differences between y and y_hat.

    The average is taken along axis 0, so 2-D inputs yield one value per column.
    """
    squared_error = (y - y_hat) ** 2
    return np.average(squared_error, axis=0)

In [None]:
# Display the current training feature frame as the cell output.
X_train

In [None]:
# Rebuild the log-target train/test split (features without 'charges',
# target = log of 'charges'), using the same test fraction and seed.
transformed_data2 = data.drop(columns=['charges'])
log_charges2 = np.log(data['charges'])

X_train, X_test, y_train, y_test = train_test_split(
    transformed_data2,
    log_charges2,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of training targets.
print(len(y_train))

In [None]:
# Container type of the training target.
type(y_train)

In [None]:
# (rows, columns) of the training feature frame.
X_train.shape

In [None]:
class LinReg:
    """Linear regression without an intercept term, fitted by batch gradient descent.

    Parameters
    ----------
    n_iters : int
        Number of gradient-descent steps to run.
    learning_rate : float
        Step size applied to the gradient at every iteration.
    """

    def __init__(self, n_iters = 1000, learning_rate =.0001):
        self.n_iters = n_iters
        self.learning_rate = learning_rate

    def linear_regression(self, X, y):
        """Fit the weights that minimise the halved mean squared error.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. The number of features is taken from X itself.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        (m_current, cost)
            The fitted weight vector of shape (n_features,) and the cost
            ``sum((X @ m - y)**2) / (2 * n_samples)`` evaluated at those weights.

        Raises
        ------
        ValueError
            If X is not 2-D, is empty, or its row count differs from len(y).
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast into an (N, N) residual.
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if n_samples != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        m_current = np.zeros(n_features)
        for _ in range(self.n_iters):
            residuals = X @ m_current - y
            # Gradient of the cost w.r.t. the weights, summed over ALL samples.
            # (Assigning per sample instead of accumulating would keep only
            # the contribution of the last sample.)
            m_gradient = X.T @ residuals / n_samples
            m_current = m_current - (self.learning_rate * m_gradient)

        # Evaluate the cost for the returned weights; also defined when n_iters == 0.
        cost = np.sum((X @ m_current - y) ** 2) / (2 * n_samples)
        return m_current, cost
    
    


In [None]:
# Instantiate the gradient-descent regressor with its default settings and
# fit it on the training split (converted to plain numpy arrays).
regressor = LinReg()
fit_result = regressor.linear_regression(X_train.to_numpy(), np.array(y_train))
print(fit_result)


In [None]:
def linear_regression(X, y, n_iters = 1000, learning_rate =.001):
    """Fit a linear model without intercept by batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. The number of features is taken from X itself.
    y : array-like of shape (n_samples,)
        Target values.
    n_iters : int
        Number of gradient-descent steps to run.
    learning_rate : float
        Step size applied to the gradient at every iteration.

    Returns
    -------
    (m_current, cost)
        The fitted weight vector of shape (n_features,) and the cost
        ``sum((X @ m - y)**2) / (2 * n_samples)`` evaluated at those weights.

    Raises
    ------
    ValueError
        If X is not 2-D, is empty, or its row count differs from len(y).
    """
    X = np.asarray(X, dtype=float)
    # Flatten so a column-vector target cannot broadcast into an (N, N) residual.
    y = np.asarray(y, dtype=float).reshape(-1)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if n_samples != y.shape[0]:
        raise ValueError("X and y must contain the same number of samples")

    m_current = np.zeros(n_features)
    for _ in range(n_iters):
        residuals = X @ m_current - y
        # Gradient of the cost w.r.t. the weights, summed over ALL samples.
        # (Assigning per sample instead of accumulating would keep only the
        # contribution of the last sample.)
        m_gradient = X.T @ residuals / n_samples
        m_current = m_current - (learning_rate * m_gradient)

    # Evaluate the cost for the returned weights; also defined when n_iters == 0.
    cost = np.sum((X @ m_current - y) ** 2) / (2 * n_samples)
    return m_current, cost


In [None]:
# Keep the fitted weights and final cost: the following cells read `m_` and
# `c`, which were otherwise never assigned (NameError on a fresh Run All).
m_, c = linear_regression(X_train.to_numpy(), np.array(y_train))
# Still show the result as the cell output.
m_, c


In [None]:
# presumably `m_` holds the weight vector and `c` the cost returned by
# `linear_regression` - verify they are assigned before this cell runs.
print(m_.shape, c)
# Turn the weights into a single-column 2-D array.
m_ = m_.reshape(-1, 1)


In [None]:
# Predictions for the test features: matrix product of X_test with the weights in `m_`.
y_pred = np.dot(X_test, m_) ##same as using x_test

In [None]:
from sklearn.metrics import r2_score

In [None]:
# r2_score expects (y_true, y_pred). Scoring the feature matrix against the
# targets compared inputs of different shapes and never used the predictions.
r2_score(y_test, y_pred)

In [None]:
# Print the current contents of `m_`.
print(m_)

In [None]:
# Print the held-out targets.
print(y_test)