In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt 
import seaborn as sns
sns.set(color_codes = True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LinearRegression, Ridge

from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import make_pipeline

### Let's first check how our simple multivariate linear regression performs in comparison to scikit-learn's linear regression.

In [None]:
# initialise some random values. 

X  = np.random.rand(15,5)
Y = np.random.rand(15,1)
Theta = np.zeros((X.shape[1],1))

# We dont need to standarise these values as they np.random.rand intitialises uniform distribution between 0 & 1. 

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit scikit-learn's linear regression on the training split.
reg_1 = LinearRegression()
reg_1.fit(X_train, y_train)

In [None]:
# Metrics for the scikit-learn linear regression.

# Score returned by the estimator on the training split (higher is better).
train_score = reg_1.score(X_train, y_train)
print(train_score)

# Mean absolute error on the held-out split (lower is better).
test_predictions = reg_1.predict(X_test)
print(mean_absolute_error(test_predictions, y_test))

In [None]:
# Our model: batch gradient descent on the squared-error cost.

alpha = 0.03                               # learning rate
m = len(X_train)                           # number of training samples
# Restart from zero weights so re-running this cell gives the same result
# instead of continuing from the previously trained Theta.
Theta = np.zeros((X_train.shape[1], 1))
J = []                                     # cost recorded at every iteration
for i in range(1, 15000):
    hx = np.dot(X_train, Theta)            # (m, 1) predictions
    error = hx - y_train                   # (m, 1) residuals
    grad = np.dot(error.T, X_train)        # (1, m) @ (m, n) -> (1, n)
    Theta = Theta - ((alpha / m) * grad.T)
    # np.sum collapses the whole array to a scalar; the builtin sum() would
    # iterate over rows and leave a 1-element array in J.
    J.append(float(np.sum(error ** 2)) / (2 * m))

# Refresh the residuals so `error` describes the final Theta rather than the
# weights from before the last update.
error = np.dot(X_train, Theta) - y_train
    

In [None]:
# Training R^2 of our model: 1 - (residual sum of squares / total sum of squares).
# The residuals are recomputed from the final Theta, because the `error` left
# over from the training loop was measured before the last weight update.
train_residuals = np.dot(X_train, Theta) - y_train
v = float(np.sum((y_train - np.mean(y_train)) ** 2))   # total sum of squares
u = float(np.sum(train_residuals ** 2))                # residual sum of squares

print(f'this is our model score {1 - u/v}')

# Predict on the test set and report the mean absolute error.
m = len(X_test)
hx = np.dot(X_test, Theta)                          # (m, 1) predictions
print(f'this is our mae {mean_absolute_error(hx,y_test)}')

# conclusion

1. Scikit-learn gives slightly better results.
2. It gives more stable results.
3. It is faster because our model uses a loop for gradient descent. I suspect scikit-learn solves for the weights directly (e.g. with the normal equation) instead of iterating. You can check the source code for confirmation.
4. Hence, using scikit-learn's linear regression is a better choice than your own implementation unless yours gets better results in the test phase.

## Let's implement multivariate linear regression on the insurance data set

In [None]:
# Load the insurance data set from the Kaggle input directory.
insurance_csv = "/kaggle/input/insurance/insurance.csv"
X = pd.read_csv(insurance_csv)

In [None]:
# Summary statistics of the freshly loaded data.
X.describe()

In [None]:
# Remove exact duplicate rows, then re-check the summary statistics.
# Note: this overwrites X, so the row index keeps gaps where duplicates were removed.
X = X.drop_duplicates()
X.describe()

In [None]:
# Box plots of the columns, each on its own axes in a 2x2 grid.
box_options = dict(
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    figsize=(10, 10),
)
X.plot(kind='box', **box_options)
plt.show()



In [None]:
# Histogram of each column's distribution.
X.hist(figsize=(10,10))

In [None]:
# Pairwise scatter plots to eyeball relationships between the columns.
sns.pairplot(X)

In [None]:
# Smokers are charged more: compare the histograms of the two groups.
smokers = X.loc[X["smoker"] == "yes"]
non_smokers = X.loc[X["smoker"] == "no"]

smokers.hist(figsize=(7, 7))
non_smokers.hist(figsize=(7, 7))

In [None]:
# Correlation heatmap over the one-hot encoded frame; smoker has a strong correlation.
plt.figure(figsize=(10, 10))
encoded = pd.get_dummies(X)
c = encoded.corr()
sns.heatmap(data=c, annot=True, cmap="BrBG")

## It seems sex and region are not directly correlated with charges. Let's look at their coefficients in order to check how important they are.

# Let's build the model

In [None]:
# One-hot encode the categorical columns (dropping the first level of each),
# then separate the target column from the features.
X = pd.get_dummies(X, drop_first=True)
Y = X.charges
# Use the `columns=` keyword: the positional axis argument (`drop([...], 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
X = X.drop(columns=["charges"])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1] so the model coefficients are comparable.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set ranges leak into training; consider fitting
# it on the training split only.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
# Keep the original row labels. drop_duplicates() left gaps in the index, and
# a fresh RangeIndex here would no longer line up with the labels of Y.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)



In [None]:
# Reserve 20% of the insurance rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Feature Selection

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression and plot its coefficients as a rough importance measure.
model = Ridge().fit(X_train, y_train)
importance = model.coef_
feat_importances = pd.Series(importance, index=X.columns)
feat_importances.plot.barh()
plt.show()

In [None]:
# Show the ridge coefficients as numbers, one per feature.
feat_importances

### The coefficient value signifies how much the mean of the dependent variable changes given a one-unit shift in the independent variable while holding other variables in the model constant. 

### 1. This implies that features with coefficients near 0 are not that important when fitting our model and can be dropped, in order to keep our hx(theta) general and not let it overfit on features which don't contribute much.

In [None]:
# Score every feature against the target with a univariate F-test.
# SelectKBest is configured to keep the 6 highest-scoring features; the table
# printed below lists up to the 8 largest scores for comparison.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
bestfeatures = SelectKBest(score_func=f_regression, k=6)
fit = bestfeatures.fit(X_train, y_train)
# Pair each feature name with its score. The names are taken from X_train,
# the frame the selector was actually fitted on.
featureScores = pd.DataFrame({'Specs': X_train.columns, 'Score': fit.scores_})
print(featureScores.nlargest(8, 'Score'))  # highest-scoring features first

In [None]:
## check using p.values

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Add an explicit intercept column, fit ordinary least squares and print the
# regression summary so the p-value of each coefficient can be inspected.
X2 = sm.add_constant(X_train)

est = sm.OLS(y_train.values, X2.values)
est2 = est.fit()
# The model is fitted on bare arrays, so the column names are passed
# explicitly; otherwise the summary labels the regressors generically and the
# p-values cannot be matched to features.
print(est2.summary(xname=list(X2.columns)))


In [None]:
# Preview the design matrix (features plus the added constant column) instead
# of dumping the whole frame into the notebook output.
X2.head()

### This confirms that only four features are important; the rest we will drop so that our model does not overfit.

In [None]:
# Keep only the features judged relevant above, in both the train and test splits.
relevent_features = ['age', 'bmi', 'children', 'smoker_yes']

X_train, X_test = (frame[relevent_features] for frame in (X_train, X_test))

In [None]:
# Fit polynomial regressions of degree 1..7 and record, for each degree, the
# training score plus the test-set R^2 and mean absolute error.
model_score = []   # score on the training split, one entry per degree
R2_score = []      # R^2 on the test split
mae = []           # mean absolute error on the test split
for i in range(1, 8):
    polyreg = make_pipeline(PolynomialFeatures(i, include_bias=True), LinearRegression())
    polyreg.fit(X_train, y_train)
    y_pred = polyreg.predict(X_test)   # predict once and reuse for both metrics
    model_score.append(polyreg.score(X_train, y_train))
    # r2_score is NOT symmetric: its signature is (y_true, y_pred). Passing the
    # predictions first normalises by the variance of the predictions and
    # reports a different number.
    R2_score.append(r2_score(y_test, y_pred))
    mae.append(mean_absolute_error(y_test, y_pred))
    
    

In [None]:
# Print the metrics collected in the degree sweep; each list has one entry per
# polynomial degree, in the order the degrees were fitted.
print(f'this is the r2_score of test set: {R2_score}')
print(f'This is mean absolute error of the test set: {mae}')
print(f'this is training model score: {model_score}')

Conclusion:

1. It's clear that as the degree of the polynomial increases, our model overfits the training set (high variance) and then performs worse on the test set.
2. Quadratic polynomial regression fits our data best: its R2 is the highest and its MAE is the second best on the test set.

In [None]:


from sklearn.model_selection import validation_curve, learning_curve

def draw_learning_curve(model, x, y, train_sizes=None, cv=10):
    """Plot mean train and cross-validation scores against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) passed on to ``learning_curve``.
    x, y : array-like
        Features and target used for the cross-validated fits.
    train_sizes : array-like, optional
        Training-set sizes to evaluate. Defaults to the absolute sizes
        ``[50, 100, 300, 500, 700, 800, 900]``; pass other counts (or
        fractions) when the data set is too small for those.
    cv : int or cross-validation splitter, default 10
        Cross-validation strategy forwarded to ``learning_curve``.

    The curves are drawn with the pyplot state machine on the current figure;
    the caller is responsible for the title and for calling ``plt.show()``.
    """
    if train_sizes is None:
        # Original hard-coded sizes, kept as the default for existing callers.
        train_sizes = [50, 100, 300, 500, 700, 800, 900]

    sizes, train_scores, test_scores = learning_curve(
        model, x, y, train_sizes=train_sizes, cv=cv
    )
    # Average the per-fold scores so there is one value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.plot(sizes, train_scores_mean, color='blue', label='Train score')
    plt.plot(sizes, test_scores_mean, color='red', label='Cross-validation score')

    plt.legend(loc='best')
    plt.xlabel('Training size')
    plt.ylabel('score')



In [None]:
# Draw one learning curve per polynomial degree from 1 to 4.
for i in range(1, 5):
    poly_features = PolynomialFeatures(i, include_bias=True)
    polyreg = make_pipeline(poly_features, LinearRegression())
    draw_learning_curve(polyreg, X_train, y_train)
    plt.title(f"Learning curve for {i}-degree poly Regressor")
    plt.show()

In [None]:
# For each polynomial degree, scatter the test-set predictions against the
# observed charges so the fits can be compared visually.
for i in range(1, 8):

    polyreg = make_pipeline(PolynomialFeatures(i, include_bias=True), LinearRegression())
    polyreg.fit(X_train, y_train)
    y_pred_pr = polyreg.predict(X_test)

    predTest = pd.DataFrame({"prediction": y_pred_pr, "observed": y_test})
    plt.scatter(predTest['prediction'], predTest['observed'])
    # Put the degree in the title; otherwise all seven figures carry the same
    # caption and cannot be told apart.
    plt.title(f"{i}-degree Polynomial Regressor: Prediction Vs Actual Data")
    plt.xlabel("Predicted Medical Charges")
    plt.ylabel("Observed Medical Charges")
    plt.show()


## Now you can visualise the fits and see for yourself which n-degree polynomial function is best.

## Upvote if you like !! Cheers!!