In [1]:
import numpy as np # numpy module for linear algebra
import pandas as pd # pandas module for data manipulation
import matplotlib.pyplot as plt # module for plotting
from sklearn.model_selection import train_test_split #Splitting data into test and train sets
from sklearn.metrics import mean_squared_error #as the name says, to find the mean squared error value
from sklearn.preprocessing import StandardScaler # used to scale the data
import math #For computing math functions
from sklearn.linear_model import SGDRegressor # Regress for Stochastic Gradient Descent algo
from IPython.display import Markdown as md #For displaying results in md
from scipy import stats # module For removing outliers in this program.


In [2]:
# import csv file using pandas
df = pd.read_csv ('Data_miniproject.csv');
# remove cost feature 0 valued rows 
df = df[df.Cost != 0]

# We are gonna eliminate outliers using z-scores from Scipy library.
z_scores = stats.zscore(df)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df = df[filtered_entries]

# Taking  dependent variables as X and independent variable(COST) as y
y = df.pop('Cost');
X = df.copy()

# Scaling the Data so that computations will be much faster and also easily understood by machine while computing
column_names = X.columns
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X),columns=column_names)

In [3]:
# Fitting the data using SGDRegressor
sgd_reg = SGDRegressor(max_iter=10_000,tol=0.001,eta0=1e-3)
sgd_reg.fit(X,y)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.001, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=10000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [4]:
# Estimating the Modal
dash = '-' * 90
print(dash)
print("Co_effecient = ",sgd_reg.coef_)
print("Intercept = ",sgd_reg.intercept_)

# Calculating Errors
sgd_mse = mean_squared_error(sgd_reg.predict(X),y)
sgd_rmse = math.sqrt(sgd_mse)
sgd_rse = (sgd_rmse**2)*X.shape[0];
sgd_rse /= X.shape[0]-2
sgd_rSquare = sgd_reg.score(X,y);

# Displaying Errors and score
print(dash)
print('{:>10} {:>20} {:^35} {:^10} '.format("MSE","RMSE","RSE","R SQUARE"))
print(dash)
print('{:<20}| {:>14} | {:^22}| {:^22} '.format(sgd_mse,sgd_rmse,sgd_rse,sgd_rSquare))
print(dash)

------------------------------------------------------------------------------------------
Co_effecient =  [ 59.96045947  47.83980657  17.93051789  64.10710167 129.89144565]
Intercept =  [377.64696749]
------------------------------------------------------------------------------------------
       MSE                 RMSE                 RSE                  R SQUARE  
------------------------------------------------------------------------------------------
10615.75346467064   | 103.03277859337115 |   10754.521483816663  |   0.8950972090263911   
------------------------------------------------------------------------------------------


### Conclusion <br />

In [6]:
md("*As we can see the RSquare Score is **{}**. Saying that,this modal fits **{}%** which shows that the current modal is a good fit.*".format(sgd_rSquare, math.floor(sgd_rSquare*100)))

*As we can see the RSquare Score is **0.8950972090263911**. Saying that,this modal fits **89%** which shows that the current modal is a good fit.*