<font color='steelblue'>

<span style="font-family:verdana; font-size:1.6em;">
    <strong>Linear Regression Example</strong><br>
    Predict the profit of a startup<br>
</span>
<span style="font-family:verdana; font-size:1.4em;">
    <b>Following examples are included in the processing:</b><i>
    <ol>
        <li>Load dataset from a CSV file</li>
        <li>Explore Data</li>
        <li>Set up the dataframe</li>
        <li>Create training and test dataset</li>
        <li>Build a Linear Regression Model</li>
        <li>Explore trained model performance</li>
        <li>Make predictions using test dataset</li>
        <li>Explore model performance comparing actual v/s predictions</li>
    </ol> </i>   
</span>

</font>

## Import required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed, so use whichever name this install provides.
_grid_style = ('seaborn-v0_8-whitegrid'
               if 'seaborn-v0_8-whitegrid' in plt.style.available
               else 'seaborn-whitegrid')
plt.style.use(_grid_style)    # grids in the plots
import warnings
# NOTE(review): this silences every warning, including deprecation notices.
warnings.filterwarnings('ignore')

## Load the dataset from a CSV file

In [None]:
# Read the startups CSV into a DataFrame (the path is relative to the working directory)
df = pd.read_csv('../datasets/50_Startups.csv')

In [None]:
# Preview the first rows of the loaded frame
df.head()

## Display the std deviation, mean, min, max, etc of the dataset

In [None]:
# Get the data description e.g. count, mean, standard deviation, etc.
# Use the fully qualified option name: on newer pandas the bare 'precision'
# pattern matches more than one option and set_option raises OptionError.
pd.set_option('display.precision', 2)
df.describe()

In [None]:
# Summarize only the object-dtype columns
df.describe(include = 'object')

In [None]:
# Number of rows per distinct State value
df['State'].value_counts()

In [None]:
# Keep the raw State labels as an array: the State column is replaced by dummy
# columns in a later cell, and these labels are reused for the per-state scatter plot.
states = df['State'].values

In [None]:
# Display the captured State labels
states

## Display the data types of features and target

In [None]:
# Display the data type and non-null count of every column
df.info()

### Handle State, which is a categorical value

In [None]:
# One-hot encode State, keeping a dummy column for every state (drop_first = False).
# NOTE(review): with every level kept, the State_* columns presumably sum to 1 on each
# row, which is collinear with the regression intercept. Dropping one level would avoid
# that, but a later cell drops each State_* column by name and would need updating too.
df = pd.get_dummies(df, columns = ['State'], drop_first = False)
df.head()

In [None]:
# We want the target column at the end, since the previous step added new
# columns to the end of the dataframe.
# Remove 'Profit' from the frame here and keep it in `profit`.
profit = df.pop('Profit')

In [None]:
# Re-attach the target as the last column of the dataframe
df['Profit'] = profit
df.head()

## Create scatter plots of the spend features against Profit

In [None]:
# Feature column labels: everything in the frame except the target
cols = df.columns.drop('Profit')
cols

In [None]:
# One scatter panel per feature for the first three feature columns, with the
# feature on the x-axis and Profit on the y-axis so that the plot matches its
# "Profit v/s <feature>" title.
plt.rc('figure', figsize=(14, 5))   # also becomes the default size for later figures
fig, axs = plt.subplots(1, 3)
for ax, feature in zip(axs, cols[:3]):
    ax.scatter(df[feature], df['Profit'])
    ax.set_xlabel(feature)
    ax.set_ylabel('Profit')
    ax.set_title(f'Profit v/s {feature}')
fig.tight_layout()   # keep the y-axis labels from overlapping neighbouring panels

plt.show()

In [None]:
# Profit plotted against the original State labels (captured before one-hot encoding).
fig, ax = plt.subplots()
ax.scatter(states, df['Profit'])
# Label the figure so it can be read on its own.
ax.set_xlabel('State')
ax.set_ylabel('Profit')
ax.set_title('Profit by State')
plt.show()

In [None]:
# Pairwise scatter matrix of the feature columns, leaving out the State dummies.
plt.rc('figure', figsize=(14, 5))
toplot = cols.drop(["State_California", "State_New York", "State_Florida"])
print(toplot)
sns.pairplot(df[toplot])
plt.show()

### Check correlation between features and profit

In [None]:
# Pairwise correlation between every column of the frame
corr = df.corr()
sns.set(font_scale=1.4)
f, ax = plt.subplots(figsize=(11, 9))

# Annotated, square-celled heatmap drawn on the axes created above
sns.heatmap(
    corr,
    ax=ax,
    cmap="Blues",
    vmax=.9,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
)
plt.show()

## Display all features "null" count

In [None]:
# Count the null values in each column (features and target)
df.isnull().sum()

## Standardize the features that require scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the three numeric spend columns in place; the other columns of df
# are not passed to the scaler.
# NOTE(review): the scaler is fit on all rows before the train/test split made later
# in the notebook, so test rows influence the scaling statistics. Consider fitting
# on the training rows only and applying the fitted scaler to the test rows.
scaler = StandardScaler()
tostd = ['R&D Spend', 'Marketing Spend', 'Administration']
df[tostd] = scaler.fit_transform(df[tostd])
df.head()

## Create X and y

In [None]:
# Feature matrix: every column except the target.
cols = list(df.columns)
cols.remove('Profit')
# Request a float array explicitly. On pandas versions where get_dummies yields
# boolean columns, mixing them with the float spend columns would otherwise
# produce an object-dtype array.
X = df[cols].to_numpy(dtype=float)

In [None]:
# Preview the first three rows of the feature matrix
X[:3]

In [None]:
# Target vector: the Profit column as an array
y = df['Profit'].values

## Create Training and Test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2345
)

In [None]:
# Shape of the training feature matrix
X_train.shape

In [None]:
# Shape of the test feature matrix
X_test.shape

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Ordinary least squares with default settings. The `normalize` argument was
# deprecated in scikit-learn 1.0 and removed in 1.2, so passing it (even as False)
# fails on current releases. Omitting it gives the same behaviour as normalize=False.
# The numeric features were already standardized earlier in the notebook.
linReg = LinearRegression()

In [None]:
# Fit the regression on the training split and keep the result of fit() as linRegModel
linRegModel = linReg.fit(X_train, y_train)

In [None]:
# Intercept term of the fitted model (the y-axis intercept)
linRegModel.intercept_

In [None]:
# Fitted coefficients as a plain list; paired with the names in `cols` in the next cell
coeff = list(linRegModel.coef_)

In [None]:
# Table of coefficients indexed by feature name, sorted from largest to smallest.
# Use the fully qualified option name: on newer pandas the bare 'precision'
# pattern matches more than one option and set_option raises OptionError.
pd.set_option('display.precision', 2)
coeff_df = pd.DataFrame(coeff, index=cols, columns=['Coefficient'])
sortcoeff = coeff_df.sort_values('Coefficient', ascending=False)
sortcoeff

In [None]:
# Model score on the data it was trained on
print(f"R Squared on training data: {linRegModel.score(X_train, y_train)}")

In [None]:
# Predict the target for the held-out test rows
y_pred = linRegModel.predict(X_test)

In [None]:
# Model score on the held-out test split
linRegModel.score(X_test, y_test)

In [None]:
# Side-by-side table of actual and predicted values for the test rows.
# NOTE(review): this rebinds `df`, so the feature dataframe is no longer reachable
# under that name from here on; the plotting cell that follows relies on the new `df`.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head(10)

In [None]:
# Grouped bars of every column in df, one group per row
df.plot(kind='bar', figsize=(15, 12))
# Grid styling for major and minor ticks on the current axes
plt.gca().grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.gca().grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.gca().set(xlabel='Test Data', ylabel='Predictions', title='Predictions using Model')
plt.show()

In [None]:
# Scatter of predicted against true values with a y = x reference line;
# points on the line are perfect predictions.
print(X_test.size)   # NOTE(review): presumably rows x columns, not the row count — confirm intent
plt.figure(figsize=(8, 8))
a = plt.axes(aspect='equal')
plt.scatter(y_test, y_pred)
plt.xlabel('True Values [$]')
plt.ylabel('Predictions [$]')
# Span the reference line over the observed range instead of fixed bounds, so
# it covers every point whatever the scale of the data.
lo = min(y_test.min(), y_pred.min())
hi = max(y_test.max(), y_pred.max())
plt.plot([lo, hi], [lo, hi], 'r')
plt.show()

In [None]:
# R² of the predictions against the held-out targets
print(f"R Squared on predictions: {r2_score(y_test, y_pred)}")

In [None]:
# Mean squared error between the held-out targets and the predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared on predictions {mse}")

In [None]:
# Square root of the MSE computed in the previous cell
print(f"Root Mean Squared on predictions {np.sqrt(mse)}")